1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
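// Editor's note (illustrative, not part of the upstream source): with
// mul-constant-optimization enabled, a multiply by a "nice" constant is
// rewritten in terms of shifts and LEA where profitable. For example,
// assuming a typical 64-bit subtarget,
//   %r = mul i32 %x, 5
// is usually selected as
//   leal (%rdi,%rdi,4), %eax   ; x + 4*x == 5*x, no imull needed
// The exact sequence chosen depends on the constant and the scheduling model.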
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
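// Editor's note (illustrative sketch): addBypassSlowDiv(32, 8) asks the
// generic BypassSlowDivision transform to guard a 32-bit div/rem with a
// cheap runtime check, roughly:
//   if ((a | b) >> 8 == 0)   // both operands fit in 8 bits
//     use the 8-bit divide
//   else
//     use the full 32-bit divide
// The 64-bit/32-bit bypass above works the same way on subtargets with a
// slow 64-bit divider.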
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
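// Editor's note: marking a truncating store as Expand tells the legalizer to
// rewrite it as an explicit truncation followed by an ordinary store of the
// narrower type, e.g. (illustrative) a truncstore of i32 to i16 becomes
// "trunc i32 -> i16" plus a plain i16 store, since x86 has no separate
// truncating-store instructions for integer registers.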
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
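// Editor's note (illustrative): an i32/i64 funnel shift such as
//   %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %amt)
// maps directly onto SHLD/SHRD, which is why it is Legal here. On subtargets
// where SHLD is slow, the Custom handler only forms SHLD/SHRD when optimizing
// for size; i8/i16 funnel shifts are always custom lowered.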
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
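// Editor's note (illustrative): the saturating conversions have stricter
// semantics than the bare cvttss2si/cvttsd2si instructions, so the Custom
// handlers add fix-up code. For example
//   %i = call i32 @llvm.fptosi.sat.i32.f32(float %x)
// must clamp out-of-range inputs to INT32_MIN/INT32_MAX and map NaN to 0,
// whereas the raw instruction returns the 0x80000000 "integer indefinite"
// value in those cases.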
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
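// Editor's note (illustrative): promoting i8/i16 CTPOP to i32 means
//   %c = call i16 @llvm.ctpop.i16(i16 %x)
// is zero-extended and counted with popcntl rather than the longer popcntw
// encoding, avoiding the false-dependency issue noted above. Without the
// POPCNT feature, ctpop falls back to a bit-manipulation expansion.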
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
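// Editor's note (illustrative, simplified): the Custom markings above let the
// backend pick the best locked instruction per use. For example an
//   atomicrmw sub ptr %p, i32 %v seq_cst
// whose result is used is typically emitted as a negation plus LOCK XADD,
// while one whose result is ignored can become a plain LOCK SUB; the
// cmpxchg-with-success form maps onto CMPXCHG and its ZF output.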
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
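// Editor's note: setF16Action is a local helper that applies one
// LegalizeAction to the list of FP operations above for a given half or
// bfloat16 type; it is invoked later in this constructor, e.g.
// setF16Action(MVT::f16, Promote) to promote scalar f16 arithmetic to f32,
// and setF16Action(MVT::v8f16, Expand) for the SSE2 vector case.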
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
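// Editor's note (illustrative): a "legal FP immediate" is one the backend can
// materialize without a constant-pool load: +/-0.0 and +/-1.0 map onto x87
// FLD0/FLD1 (plus FCHS for the negated forms), while for SSE only +0.0 is
// special because it is simply a register self-XOR, e.g.
//   xorps %xmm0, %xmm0
// Most other constants are loaded from the constant pool.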
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
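// Editor's note (illustrative): making these extending loads Legal lets
// instruction selection fold the load into the extend, so a pattern like
//   sext <8 x i8> (load) to <8 x i16>
// becomes a single pmovsxbw with a memory operand instead of a separate load
// followed by shuffle/extend instructions.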
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
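// Editor's note (illustrative, simplified): with AVX/AVX2 a masked load such
// as @llvm.masked.load.v8f32 can be selected as VMASKMOVPS (VPMASKMOVD for
// integer elements on AVX2). The Custom path above is used when the AVX-512VL
// k-register forms are unavailable, e.g. to blend in a non-zero pass-through
// value, since the AVX maskmov instructions zero the masked-off lanes.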
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
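// Editor's note: AVX512DQ adds the byte-sized mask move (KMOVB), so only with
// DQI can a v8i1 or narrower mask be spilled/reloaded directly as one byte;
// without it the Custom handlers above route the value through a wider mask
// register or a GPR move instead.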
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
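// With AVX512DQ the v8i64 multiply selects directly to vpmullq; the Custom
// lowering above is only needed to emulate 64-bit element multiplies with
// 32-bit pmuludq-based sequences on DQI-less targets.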
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
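// For example, a v16i32 VECTOR_COMPRESS selects to a single vpcompressd, while
// the narrower Custom cases are first converted to one of the Legal types
// listed below.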
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2328 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2331 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2334 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2337 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2421 // Set the operation action Custom to do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
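// i.e. a v8bf16 arithmetic op is performed by extending to v8f32, operating
// there, and truncating the result back to bf16 (vcvtneps2bf16).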
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
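// For example, i32 SADDO becomes a flag-producing ADD whose overflow bit is
// read back with SETO (and UADDO uses SETB), rather than the generic expansion
// that recomputes the overflow from the operands' signs.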
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
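// e.g. on Darwin, sin(x) and cos(x) of the same argument become a single
// __sincos_stret / __sincosf_stret call that returns both results.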
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64-bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
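// Roughly: float fmodf(float x, float y) { return (float)fmod(x, y); }
// The Promote action below reproduces that pattern for each of these calls.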
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
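// For example, a 64-byte constant memset can be emitted as four 16-byte vector
// stores (or two 32-byte stores with AVX) instead of a library call.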
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // The offset should fit into a 32-bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative
2932 // half of the 32-bit address space, so we must not accept negative offsets
2933 // (they may fall just outside it), but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For other non-large code models we assume that the last small object is 16MB
2938 // before the end of the 31-bit boundary. We may also accept pretty large
2939 // negative constants, knowing that all objects are in the positive half of the
2940 // address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
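// For example, with a symbolic displacement in the small code model a
// global+0xFFFFFF offset is still considered foldable into the addressing
// mode, while global+0x1000000 (16MB) is not.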
2943
2944 /// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
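// For example, (setolt x, y) is handled by swapping the operands above and
// testing COND_A on the ucomiss/comiss flags, so an unordered result (which
// sets PF/CF) correctly yields false.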
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066 /// The current x86 ISA includes the following FP cmov instructions:
3067 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocations target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265 // of those uses are extracted directly into stores (so each extract + store
3266 // can be store-folded), or (4) some use is a legal full-width instruction,
3267 // then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume its legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then the load is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
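// For example, a splat multiply by 9 returns true here (it can become
// (X << 3) + X), as does 7 ((X << 3) - X), while a constant such as 10 is left
// as a multiply.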
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of a vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // For vectors we don't really get much benefit from swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has imm64 mask, then inverse will have
3585 // at least imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl by 1, 2 or 3, as those can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3751 /// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756 /// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767 /// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773 /// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780 /// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786 /// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask, is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815 /// sequential range [Low, Low+Size), or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, it's trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
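// For example, the 8-element mask <0,1,4,5,-1,-1,6,7> widens to the 4-element
// mask <0,2,-1,3>, while a mask containing the misaligned pair <1,4> cannot be
// widened.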
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
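// Illustrative example (values chosen for exposition, not from the source):
// scaling the mask {0, 1, 2, 3} to 8 elements (splitting each element in two)
// yields {0, 1, 2, 3, 4, 5, 6, 7}; scaling it to 2 elements (widening) yields
// {0, 1}; the mask {1, 0, 2, 3} cannot be widened to 2 elements and fails.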
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This is different to scaleShuffleElements which is a same size type.
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
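// Illustrative example (values chosen for exposition, not from the source):
// growing the two-input v4 mask {0, 4, 1, 5} from 128 bits to 256 bits
// rebases references to the second operand onto its widened position and
// pads with undefs: {0, 8, 1, 9, -1, -1, -1, -1}.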
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in the 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3987 SmallVector<SDValue, 32> Ops;
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
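// Illustrative example (values chosen for exposition, not from the source):
// building the v2i64 mask constant {3, -1} with IsMask = true on a target
// where i64 is not legal emits the v4i32 build_vector {3, 0, undef, undef}
// and bitcasts it back to v2i64.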
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018 SmallVector<SDValue, 32> Ops;
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071 TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If we allow commute they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
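// Illustrative example (values chosen for exposition, not from the source):
// extracting 128 bits from a v8i32 source with IdxVal = 5 first rounds the
// index down to the containing chunk (5 & ~3 == 4), so elements 4-7 are
// returned as a v4i32.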
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148 assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166 // Inserting UNDEF just returns Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4199 assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
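// Illustrative example (behaviour of the helper above, values for exposition):
// without DQI, v4i1 and v8i1 both widen to v16i1 (without DQI only 16-bit mask
// shifts are available); with DQI, v4i1 widens to v8i1 and v8i1 is returned
// unchanged.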
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319 SmallVector<SDValue, 2> SrcOps;
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
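// Example (illustrative, not from the source, with Half standing for the
// midpoint element index):
//   insert_subvector(insert_subvector(undef, X, 0), Y, Half) collects as {X, Y}
//   insert_subvector(undef, X, Half) collects as {undef, X}
//   a plain CONCAT_VECTORS node simply yields its operands.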
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359 SmallVector<SDValue> Ops;
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371 SmallVector<SDValue, 4> SubOps;
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4399 SmallVector<SDValue> LoOps(NumOps);
4400 SmallVector<SDValue> HiOps(NumOps);
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
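// Illustrative example (not from the source): splitting a 256-bit v32i8
// operation this way produces two v16i8 nodes of the same opcode whose
// results are concatenated back together; non-vector operands (e.g. an
// immediate) are passed unchanged to both halves.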
4416
4417/// Break an unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4452template <typename F>
4453SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true,
4456 bool AllowAVX512 = true) {
4457 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4458 unsigned NumSubs = 1;
4459 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4460 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4461 if (VT.getSizeInBits() > 512) {
4462 NumSubs = VT.getSizeInBits() / 512;
4463 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4464 }
4465 } else if (Subtarget.hasAVX2()) {
4466 if (VT.getSizeInBits() > 256) {
4467 NumSubs = VT.getSizeInBits() / 256;
4468 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4469 }
4470 } else {
4471 if (VT.getSizeInBits() > 128) {
4472 NumSubs = VT.getSizeInBits() / 128;
4473 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4474 }
4475 }
4476
4477 if (NumSubs == 1)
4478 return Builder(DAG, DL, Ops);
4479
4480 SmallVector<SDValue, 4> Subs;
4481 for (unsigned i = 0; i != NumSubs; ++i) {
4482 SmallVector<SDValue, 2> SubOps;
4483 for (SDValue Op : Ops) {
4484 EVT OpVT = Op.getValueType();
4485 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4486 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4487 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4488 }
4489 Subs.push_back(Builder(DAG, DL, SubOps));
4490 }
4491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4492}
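// Illustrative sketch (hypothetical caller, not from the source): a lowering
// that has only AVX2 but needs a 512-bit operation can pass VT = MVT::v16i32
// here; the helper extracts the two 256-bit halves of every vector operand,
// invokes Builder on each half, and concatenates the two results.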
4493
4494// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4495// targets.
4496static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4497 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4498 const X86Subtarget &Subtarget) {
4499 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4500 MVT SVT = VT.getScalarType();
4501
4502 // If we have a 32/64 splatted constant, splat it to DstTy to
4503 // encourage a foldable broadcast'd operand.
4504 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4505 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4506 // AVX512 broadcasts 32/64-bit operands.
4507 // TODO: Support float once getAVX512Node is used by fp-ops.
4508 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509 !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4510 return SDValue();
4511 // If we're not widening, don't bother if we're not bitcasting.
4512 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4513 return SDValue();
4514 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4515 APInt SplatValue, SplatUndef;
4516 unsigned SplatBitSize;
4517 bool HasAnyUndefs;
4518 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4519 HasAnyUndefs, OpEltSizeInBits) &&
4520 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4521 return DAG.getConstant(SplatValue, DL, DstVT);
4522 }
4523 return SDValue();
4524 };
4525
4526 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4527
4528 MVT DstVT = VT;
4529 if (Widen)
4530 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4531
4532 // Canonicalize src operands.
4533 SmallVector<SDValue> SrcOps(Ops);
4534 for (SDValue &Op : SrcOps) {
4535 MVT OpVT = Op.getSimpleValueType();
4536 // Just pass through scalar operands.
4537 if (!OpVT.isVector())
4538 continue;
4539 assert(OpVT == VT && "Vector type mismatch");
4540
4541 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4542 Op = BroadcastOp;
4543 continue;
4544 }
4545
4546 // Just widen the subvector by inserting into an undef wide vector.
4547 if (Widen)
4548 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4549 }
4550
4551 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4552
4553 // Perform the 512-bit op then extract the bottom subvector.
4554 if (Widen)
4555 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4556 return Res;
4557}
4558
4559/// Insert i1-subvector to i1-vector.
4560static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4561 const X86Subtarget &Subtarget) {
4562
4563 SDLoc dl(Op);
4564 SDValue Vec = Op.getOperand(0);
4565 SDValue SubVec = Op.getOperand(1);
4566 SDValue Idx = Op.getOperand(2);
4567 unsigned IdxVal = Op.getConstantOperandVal(2);
4568
4569 // Inserting undef is a nop. We can just return the original vector.
4570 if (SubVec.isUndef())
4571 return Vec;
4572
4573 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4574 return Op;
4575
4576 MVT OpVT = Op.getSimpleValueType();
4577 unsigned NumElems = OpVT.getVectorNumElements();
4578 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4579
4580 // Extend to natively supported kshift.
4581 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4582
4583 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4584 // if necessary.
4585 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4586 // May need to promote to a legal type.
4587 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4588 DAG.getConstant(0, dl, WideOpVT),
4589 SubVec, Idx);
4590 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4591 }
4592
4593 MVT SubVecVT = SubVec.getSimpleValueType();
4594 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4595 assert(IdxVal + SubVecNumElems <= NumElems &&
4596 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4597 "Unexpected index value in INSERT_SUBVECTOR");
4598
4599 SDValue Undef = DAG.getUNDEF(WideOpVT);
4600
4601 if (IdxVal == 0) {
4602 // Zero lower bits of the Vec
4603 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4605 ZeroIdx);
4606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4607 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4608 // Merge them together, SubVec should be zero extended.
4609 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4610 DAG.getConstant(0, dl, WideOpVT),
4611 SubVec, ZeroIdx);
4612 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4614 }
4615
4616 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4617 Undef, SubVec, ZeroIdx);
4618
4619 if (Vec.isUndef()) {
4620 assert(IdxVal != 0 && "Unexpected index");
4621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4624 }
4625
4626 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4627 assert(IdxVal != 0 && "Unexpected index");
4628 // If upper elements of Vec are known undef, then just shift into place.
4629 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4630 [](SDValue V) { return V.isUndef(); })) {
4631 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4633 } else {
4634 NumElems = WideOpVT.getVectorNumElements();
4635 unsigned ShiftLeft = NumElems - SubVecNumElems;
4636 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4639 if (ShiftRight != 0)
4640 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4642 }
4643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4644 }
4645
4646 // Simple case when we put subvector in the upper part
4647 if (IdxVal + SubVecNumElems == NumElems) {
4648 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4649 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4650 if (SubVecNumElems * 2 == NumElems) {
4651 // Special case, use legal zero extending insert_subvector. This allows
4652 // isel to optimize when bits are known zero.
4653 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4654 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4655 DAG.getConstant(0, dl, WideOpVT),
4656 Vec, ZeroIdx);
4657 } else {
4658 // Otherwise use explicit shifts to zero the bits.
4659 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4660 Undef, Vec, ZeroIdx);
4661 NumElems = WideOpVT.getVectorNumElements();
4662 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4663 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4664 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4665 }
4666 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4668 }
4669
4670 // Inserting into the middle is more complicated.
4671
4672 NumElems = WideOpVT.getVectorNumElements();
4673
4674 // Widen the vector if needed.
4675 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4676
4677 unsigned ShiftLeft = NumElems - SubVecNumElems;
4678 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4679
4680 // Do an optimization for the most frequently used types.
4681 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4682 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4683 Mask0.flipAllBits();
4684 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4685 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4686 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4687 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4688 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4689 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4690 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4691 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4692
4693 // Reduce to original width if needed.
4694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4695 }
4696
4697 // Clear the upper bits of the subvector and move it to its insert position.
4698 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4699 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4700 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4701 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4702
4703 // Isolate the bits below the insertion point.
4704 unsigned LowShift = NumElems - IdxVal;
4705 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4706 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4707 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4708 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4709
4710 // Isolate the bits after the last inserted bit.
4711 unsigned HighShift = IdxVal + SubVecNumElems;
4712 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4713 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4714 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4715 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4716
4717 // Now OR all 3 pieces together.
4718 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4719 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4720
4721 // Reduce to original width if needed.
4722 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4723}
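// Worked example (illustrative values, not from the source): inserting a v2i1
// subvector into a v8i1 vector at index 2 on a DQI target takes the "middle"
// path above: destination bits 2-3 are cleared with an AND against 0xF3, the
// subvector is moved into bits 2-3 via KSHIFTL(6) then KSHIFTR(4), and the two
// values are ORed together.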
4724
4725static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4726 const SDLoc &dl) {
4727 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4728 EVT SubVT = V1.getValueType();
4729 EVT SubSVT = SubVT.getScalarType();
4730 unsigned SubNumElts = SubVT.getVectorNumElements();
4731 unsigned SubVectorWidth = SubVT.getSizeInBits();
4732 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4733 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4734 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4735}
4736
4737/// Returns a vector of specified type with all bits set.
4738/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4739/// Then bitcast to their original type, ensuring they get CSE'd.
4740static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4741 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4742 "Expected a 128/256/512-bit vector type");
4743 unsigned NumElts = VT.getSizeInBits() / 32;
4744 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4745 return DAG.getBitcast(VT, Vec);
4746}
4747
4748static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4749 SDValue In, SelectionDAG &DAG) {
4750 EVT InVT = In.getValueType();
4751 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4752
4753 // Canonicalize Opcode to general extension version.
4754 switch (Opcode) {
4755 case ISD::ANY_EXTEND:
4756 case ISD::ANY_EXTEND_VECTOR_INREG:
4757 Opcode = ISD::ANY_EXTEND;
4758 break;
4759 case ISD::SIGN_EXTEND:
4760 case ISD::SIGN_EXTEND_VECTOR_INREG:
4761 Opcode = ISD::SIGN_EXTEND;
4762 break;
4763 case ISD::ZERO_EXTEND:
4764 case ISD::ZERO_EXTEND_VECTOR_INREG:
4765 Opcode = ISD::ZERO_EXTEND;
4766 break;
4767 default:
4768 llvm_unreachable("Unknown extension opcode");
4769 }
4770
4771 // For 256-bit vectors, we only need the lower (128-bit) input half.
4772 // For 512-bit vectors, we only need the lower input half or quarter.
4773 if (InVT.getSizeInBits() > 128) {
4774 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4775 "Expected VTs to be the same size!");
4776 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4777 In = extractSubVector(In, 0, DAG, DL,
4778 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4779 InVT = In.getValueType();
4780 }
4781
4782 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4783 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4784
4785 return DAG.getNode(Opcode, DL, VT, In);
4786}
4787
4788// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4789static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4790 SDValue Mask, SelectionDAG &DAG) {
4791 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4792 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4793 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4794}
4795
4796static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4797 bool Lo, bool Unary) {
4798 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4799 "Illegal vector type to unpack");
4800 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4801 int NumElts = VT.getVectorNumElements();
4802 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4803 for (int i = 0; i < NumElts; ++i) {
4804 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4805 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4806 Pos += (Unary ? 0 : NumElts * (i % 2));
4807 Pos += (Lo ? 0 : NumEltsInLane / 2);
4808 Mask.push_back(Pos);
4809 }
4810}
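// Illustrative example (values chosen for exposition, not from the source):
// for v8i16, Lo && !Unary produces {0, 8, 1, 9, 2, 10, 3, 11} (the PUNPCKLWD
// interleave of both inputs), while Hi && Unary produces
// {4, 4, 5, 5, 6, 6, 7, 7}.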
4811
4812/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4813/// imposed by AVX and specific to the unary pattern. Example:
4814/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4815/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4816static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4817 bool Lo) {
4818 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4819 int NumElts = VT.getVectorNumElements();
4820 for (int i = 0; i < NumElts; ++i) {
4821 int Pos = i / 2;
4822 Pos += (Lo ? 0 : NumElts / 2);
4823 Mask.push_back(Pos);
4824 }
4825}
4826
4827// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4828static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4829 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4830 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4831 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4832 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4833 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4834 int M = Mask[I];
4835 if (M < 0)
4836 continue;
4837 SDValue V = (M < NumElts) ? V1 : V2;
4838 if (V.isUndef())
4839 continue;
4840 Ops[I] = V.getOperand(M % NumElts);
4841 }
4842 return DAG.getBuildVector(VT, dl, Ops);
4843 }
4844
4845 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4846}
4847
4848/// Returns a vector_shuffle node for an unpackl operation.
4849static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4850 SDValue V1, SDValue V2) {
4851 SmallVector<int, 8> Mask;
4852 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4853 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4854}
4855
4856/// Returns a vector_shuffle node for an unpackh operation.
4857static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4858 SDValue V1, SDValue V2) {
4859 SmallVector<int, 8> Mask;
4860 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4861 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4862}
4863
4864/// Returns a node that packs the LHS + RHS nodes together at half width.
4865/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4866/// TODO: Add subvector splitting if/when we have a need for it.
4867static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4868 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4869 bool PackHiHalf = false) {
4870 MVT OpVT = LHS.getSimpleValueType();
4871 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4872 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4873 assert(OpVT == RHS.getSimpleValueType() &&
4874 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4875 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4876 "Unexpected PACK operand types");
4877 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4878 "Unexpected PACK result type");
4879
4880 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4881 if (EltSizeInBits == 32) {
4882 SmallVector<int> PackMask;
4883 int Offset = PackHiHalf ? 1 : 0;
4884 int NumElts = VT.getVectorNumElements();
4885 for (int I = 0; I != NumElts; I += 4) {
4886 PackMask.push_back(I + Offset);
4887 PackMask.push_back(I + Offset + 2);
4888 PackMask.push_back(I + Offset + NumElts);
4889 PackMask.push_back(I + Offset + NumElts + 2);
4890 }
4891 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4892 DAG.getBitcast(VT, RHS), PackMask);
4893 }
4894
4895 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4896 if (!PackHiHalf) {
4897 if (UsePackUS &&
4898 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4899 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4901
4902 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4903 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4904 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4905 }
4906
4907 // Fallback to sign/zero extending the requested half and pack.
4908 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4909 if (UsePackUS) {
4910 if (PackHiHalf) {
4911 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4912 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4913 } else {
4914 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4915 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4916 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4917 };
4918 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4919 };
4920
4921 if (!PackHiHalf) {
4922 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4924 }
4925 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4927 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4928}
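// Illustrative example (values chosen for exposition, not from the source):
// packing two v4i32 inputs into a v8i16 with PackHiHalf = false on SSE4.1
// masks each 32-bit lane with 0xFFFF (unless the known bits already fit) and
// emits X86ISD::PACKUS; without SSE4.1 it instead shifts each lane left by 16,
// arithmetic-shifts right by 16, and emits X86ISD::PACKSS.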
4929
4930/// Return a vector_shuffle of the specified vector of zero or undef vector.
4931/// This produces a shuffle where the low element of V2 is swizzled into the
4932/// zero/undef vector, landing at element Idx.
4933/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4934static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4935 bool IsZero,
4936 const X86Subtarget &Subtarget,
4937 SelectionDAG &DAG) {
4938 MVT VT = V2.getSimpleValueType();
4939 SDValue V1 = IsZero
4940 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4941 int NumElems = VT.getVectorNumElements();
4942 SmallVector<int, 16> MaskVec(NumElems);
4943 for (int i = 0; i != NumElems; ++i)
4944 // If this is the insertion idx, put the low elt of V2 here.
4945 MaskVec[i] = (i == Idx) ? NumElems : i;
4946 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4947}
4948
4949static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4950 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4951 Ptr.getOpcode() == X86ISD::WrapperRIP)
4952 Ptr = Ptr.getOperand(0);
4953 return dyn_cast<ConstantPoolSDNode>(Ptr);
4954}
4955
4956// TODO: Add support for non-zero offsets.
4957static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4958 auto *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4959 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4960 return nullptr;
4961 return CNode->getConstVal();
4962}
4963
4964static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4965 if (!Load || !ISD::isNormalLoad(Load))
4966 return nullptr;
4967 return getTargetConstantFromBasePtr(Load->getBasePtr());
4968}
4969
4969
4970static const Constant *getTargetConstantFromNode(SDValue Op) {
4971 Op = peekThroughBitcasts(Op);
4972 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4973}
4974
4975const Constant *
4976X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4977 assert(LD && "Unexpected null LoadSDNode");
4978 return getTargetConstantFromNode(LD);
4979}
4980
4982 // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X)
4983 SDValue Cond = N->getOperand(0);
4984 SDValue RHS = N->getOperand(2);
4985 EVT CondVT = Cond.getValueType();
4986 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4987 CondVT.getVectorElementType() == MVT::i1 &&
4988 ISD::isBuildVectorAllZeros(RHS.getNode());
4989}
4990
4991// Extract raw constant bits from constant pools.
4992static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4993 APInt &UndefElts,
4994 SmallVectorImpl<APInt> &EltBits,
4995 bool AllowWholeUndefs = true,
4996 bool AllowPartialUndefs = false) {
4997 assert(EltBits.empty() && "Expected an empty EltBits vector");
4998
4999 Op = peekThroughBitcasts(Op);
5000
5001 EVT VT = Op.getValueType();
5002 unsigned SizeInBits = VT.getSizeInBits();
5003 unsigned NumElts = SizeInBits / EltSizeInBits;
5004
5005 // Can't split constant.
5006 if ((SizeInBits % EltSizeInBits) != 0)
5007 return false;
5008
5009 // Bitcast a source array of element bits to the target size.
5010 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5011 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5012 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5013 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5014 "Constant bit sizes don't match");
5015
5016 // Don't split if we don't allow undef bits.
5017 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5018 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5019 return false;
5020
5021 // If we're already the right size, don't bother bitcasting.
5022 if (NumSrcElts == NumElts) {
5023 UndefElts = UndefSrcElts;
5024 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5025 return true;
5026 }
5027
5028 // Extract all the undef/constant element data and pack into single bitsets.
5029 APInt UndefBits(SizeInBits, 0);
5030 APInt MaskBits(SizeInBits, 0);
5031
5032 for (unsigned i = 0; i != NumSrcElts; ++i) {
5033 unsigned BitOffset = i * SrcEltSizeInBits;
5034 if (UndefSrcElts[i])
5035 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5036 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5037 }
5038
5039 // Split the undef/constant single bitset data into the target elements.
5040 UndefElts = APInt(NumElts, 0);
5041 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5042
5043 for (unsigned i = 0; i != NumElts; ++i) {
5044 unsigned BitOffset = i * EltSizeInBits;
5045 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5046
5047 // Only treat an element as UNDEF if all bits are UNDEF.
5048 if (UndefEltBits.isAllOnes()) {
5049 if (!AllowWholeUndefs)
5050 return false;
5051 UndefElts.setBit(i);
5052 continue;
5053 }
5054
5055 // If only some bits are UNDEF then treat them as zero (or bail if not
5056 // supported).
5057 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5058 return false;
5059
5060 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5061 }
5062 return true;
5063 };
5064
5065 // Collect constant bits and insert into mask/undef bit masks.
5066 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5067 unsigned UndefBitIndex) {
5068 if (!Cst)
5069 return false;
5070 if (isa<UndefValue>(Cst)) {
5071 Undefs.setBit(UndefBitIndex);
5072 return true;
5073 }
5074 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5075 Mask = CInt->getValue();
5076 return true;
5077 }
5078 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5079 Mask = CFP->getValueAPF().bitcastToAPInt();
5080 return true;
5081 }
5082 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5083 Type *Ty = CDS->getType();
5084 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5085 Type *EltTy = CDS->getElementType();
5086 bool IsInteger = EltTy->isIntegerTy();
5087 bool IsFP =
5088 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5089 if (!IsInteger && !IsFP)
5090 return false;
5091 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5092 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5093 if (IsInteger)
5094 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5095 else
5096 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5097 I * EltBits);
5098 return true;
5099 }
5100 return false;
5101 };
5102
5103 // Handle UNDEFs.
5104 if (Op.isUndef()) {
5105 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5106 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5107 return CastBitData(UndefSrcElts, SrcEltBits);
5108 }
5109
5110 // Extract scalar constant bits.
5111 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5112 APInt UndefSrcElts = APInt::getZero(1);
5113 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5114 return CastBitData(UndefSrcElts, SrcEltBits);
5115 }
5116 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5117 APInt UndefSrcElts = APInt::getZero(1);
5118 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5119 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122
5123 // Extract constant bits from build vector.
5124 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5125 BitVector Undefs;
5126 SmallVector<APInt> SrcEltBits;
5127 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5128 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5129 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5130 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5131 if (Undefs[I])
5132 UndefSrcElts.setBit(I);
5133 return CastBitData(UndefSrcElts, SrcEltBits);
5134 }
5135 }
5136
5137 // Extract constant bits from constant pool vector.
5138 if (auto *Cst = getTargetConstantFromNode(Op)) {
5139 Type *CstTy = Cst->getType();
5140 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5141 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5142 return false;
5143
5144 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5145 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5146 if ((SizeInBits % SrcEltSizeInBits) != 0)
5147 return false;
5148
5149 APInt UndefSrcElts(NumSrcElts, 0);
5150 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5151 for (unsigned i = 0; i != NumSrcElts; ++i)
5152 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5153 UndefSrcElts, i))
5154 return false;
5155
5156 return CastBitData(UndefSrcElts, SrcEltBits);
5157 }
5158
5159 // Extract constant bits from a broadcasted constant pool scalar.
5160 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5161 EltSizeInBits <= VT.getScalarSizeInBits()) {
5162 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5163 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5164 return false;
5165
5166 SDValue Ptr = MemIntr->getBasePtr();
5167 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5168 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5169 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5170
5171 APInt UndefSrcElts(NumSrcElts, 0);
5172 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5173 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5174 if (UndefSrcElts[0])
5175 UndefSrcElts.setBits(0, NumSrcElts);
5176 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5177 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5178 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5179 return CastBitData(UndefSrcElts, SrcEltBits);
5180 }
5181 }
5182 }
5183
5184 // Extract constant bits from a subvector broadcast.
5185 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5186 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5187 SDValue Ptr = MemIntr->getBasePtr();
5188 // The source constant may be larger than the subvector broadcast, so
5189 // ensure we extract the correct subvector constants.
5190 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5191 Type *CstTy = Cst->getType();
5192 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5193 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5194 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5195 (SizeInBits % SubVecSizeInBits) != 0)
5196 return false;
5197 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5198 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5199 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5200 APInt UndefSubElts(NumSubElts, 0);
5201 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5202 APInt(CstEltSizeInBits, 0));
5203 for (unsigned i = 0; i != NumSubElts; ++i) {
5204 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5205 UndefSubElts, i))
5206 return false;
5207 for (unsigned j = 1; j != NumSubVecs; ++j)
5208 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5209 }
5210 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5211 UndefSubElts);
5212 return CastBitData(UndefSubElts, SubEltBits);
5213 }
5214 }
5215
5216 // Extract a rematerialized scalar constant insertion.
5217 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5218 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5219 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5220 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5222
5223 APInt UndefSrcElts(NumSrcElts, 0);
5224 SmallVector<APInt, 64> SrcEltBits;
5225 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5226 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5227 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5228 return CastBitData(UndefSrcElts, SrcEltBits);
5229 }
5230
5231 // Insert constant bits from a base and sub vector sources.
5232 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5233 // If this bitcasts to larger elements we might lose track of undefs - to
5234 // be safe, don't allow any.
5235 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5236 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5237
5238 APInt UndefSrcElts, UndefSubElts;
5239 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5240 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5241 UndefSubElts, EltSubBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs) &&
5244 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5245 UndefSrcElts, EltSrcBits,
5246 AllowWholeUndefs && AllowUndefs,
5247 AllowPartialUndefs && AllowUndefs)) {
5248 unsigned BaseIdx = Op.getConstantOperandVal(2);
5249 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5250 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5251 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5252 return CastBitData(UndefSrcElts, EltSrcBits);
5253 }
5254 }
5255
5256 // Extract constant bits from a subvector's source.
5257 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5258 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5259 EltBits, AllowWholeUndefs,
5260 AllowPartialUndefs)) {
5261 EVT SrcVT = Op.getOperand(0).getValueType();
5262 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5263 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5264 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5265 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5266 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5268 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5269
5270 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5271 if ((BaseIdx + NumSubElts) != NumSrcElts)
5272 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5273 if (BaseIdx != 0)
5274 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5275 return true;
5276 }
5277
5278 // Extract constant bits from shuffle node sources.
5279 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5280 // TODO - support shuffle through bitcasts.
5281 if (EltSizeInBits != VT.getScalarSizeInBits())
5282 return false;
5283
5284 ArrayRef<int> Mask = SVN->getMask();
5285 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5286 llvm::any_of(Mask, [](int M) { return M < 0; }))
5287 return false;
5288
5289 APInt UndefElts0, UndefElts1;
5290 SmallVector<APInt, 32> EltBits0, EltBits1;
5291 if (isAnyInRange(Mask, 0, NumElts) &&
5292 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5293 UndefElts0, EltBits0, AllowWholeUndefs,
5294 AllowPartialUndefs))
5295 return false;
5296 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5297 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5298 UndefElts1, EltBits1, AllowWholeUndefs,
5299 AllowPartialUndefs))
5300 return false;
5301
5302 UndefElts = APInt::getZero(NumElts);
5303 for (int i = 0; i != (int)NumElts; ++i) {
5304 int M = Mask[i];
5305 if (M < 0) {
5306 UndefElts.setBit(i);
5307 EltBits.push_back(APInt::getZero(EltSizeInBits));
5308 } else if (M < (int)NumElts) {
5309 if (UndefElts0[M])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits0[M]);
5312 } else {
5313 if (UndefElts1[M - NumElts])
5314 UndefElts.setBit(i);
5315 EltBits.push_back(EltBits1[M - NumElts]);
5316 }
5317 }
5318 return true;
5319 }
5320
5321 return false;
5322}
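// Worked example (illustrative values, not from the source): requesting
// 64-bit elements from the v4i32 build vector <1, 2, undef, 4> needs
// AllowPartialUndefs (one 64-bit element is only half undef) and then returns
// EltBits = {0x0000000200000001, 0x0000000400000000}, with the undef half
// reading as zero.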
5323
5324namespace llvm {
5325namespace X86 {
5326bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5327 APInt UndefElts;
5328 SmallVector<APInt, 16> EltBits;
5329 if (getTargetConstantBitsFromNode(
5330 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5331 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5332 int SplatIndex = -1;
5333 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5334 if (UndefElts[i])
5335 continue;
5336 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5337 SplatIndex = -1;
5338 break;
5339 }
5340 SplatIndex = i;
5341 }
5342 if (0 <= SplatIndex) {
5343 SplatVal = EltBits[SplatIndex];
5344 return true;
5345 }
5346 }
5347
5348 return false;
5349}
5350
5351int getRoundingModeX86(unsigned RM) {
5352 switch (static_cast<::llvm::RoundingMode>(RM)) {
5353 // clang-format off
5354 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5355 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5356 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5357 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5358 default:
5359 return X86::rmInvalid; // Invalid rounding mode
5360 }
5361}
5362
5363} // namespace X86
5364} // namespace llvm
5365
5366static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5367 unsigned MaskEltSizeInBits,
5368 SmallVectorImpl<uint64_t> &RawMask,
5369 APInt &UndefElts) {
5370 // Extract the raw target constant bits.
5371 SmallVector<APInt, 64> EltBits;
5372 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5373 EltBits, /* AllowWholeUndefs */ true,
5374 /* AllowPartialUndefs */ false))
5375 return false;
5376
5377 // Insert the extracted elements into the mask.
5378 for (const APInt &Elt : EltBits)
5379 RawMask.push_back(Elt.getZExtValue());
5380
5381 return true;
5382}
5383
5384static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5385 bool AllowUndefs) {
5386 APInt UndefElts;
5387 SmallVector<APInt, 64> EltBits;
5388 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5389 /*AllowWholeUndefs*/ AllowUndefs,
5390 /*AllowPartialUndefs*/ false))
5391 return false;
5392
5393 bool IsPow2OrUndef = true;
5394 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5395 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5396 return IsPow2OrUndef;
5397}
5398
5399// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5400 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5401 // TODO: don't always ignore oneuse constraints.
5402 V = peekThroughBitcasts(V);
5403 EVT VT = V.getValueType();
5404
5405 // Match not(xor X, -1) -> X.
5406 if (V.getOpcode() == ISD::XOR &&
5407 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5408 isAllOnesConstant(V.getOperand(1))))
5409 return V.getOperand(0);
5410
5411 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5412 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5413 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5414 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5415 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5417 V.getOperand(1));
5418 }
5419 }
5420
5421 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5422 if (V.getOpcode() == X86ISD::PCMPGT &&
5423 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5424 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5425 V.getOperand(0).hasOneUse()) {
5426 APInt UndefElts;
5427 SmallVector<APInt> EltBits;
5428 if (getTargetConstantBitsFromNode(V.getOperand(0),
5429 V.getScalarValueSizeInBits(), UndefElts,
5430 EltBits) &&
5431 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5432 // Don't fold min_signed_value -> (min_signed_value - 1)
5433 bool MinSigned = false;
5434 for (APInt &Elt : EltBits) {
5435 MinSigned |= Elt.isMinSignedValue();
5436 Elt -= 1;
5437 }
5438 if (!MinSigned) {
5439 SDLoc DL(V);
5440 MVT VT = V.getSimpleValueType();
5441 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5442 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5443 }
5444 }
5445 }
5446
5447 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5448 SmallVector<SDValue, 2> CatOps;
5449 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5450 for (SDValue &CatOp : CatOps) {
5451 SDValue NotCat = IsNOT(CatOp, DAG);
5452 if (!NotCat)
5453 return SDValue();
5454 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5455 }
5456 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5457 }
5458
5459 // Match not(or(not(X),not(Y))) -> and(X, Y).
5460 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5461 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5462 // TODO: Handle cases with single NOT operand -> ANDNP
5463 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5464 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5465 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5466 DAG.getBitcast(VT, Op1));
5467 }
5468
5469 return SDValue();
5470}
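// [Editorial sketch - not part of X86ISelLowering.cpp] The PCMPGT fold above
// depends on the signed identity !(C > X) == (X > C - 1), which only breaks
// when C is the minimum signed value (hence the MinSigned guard). A minimal
// standalone check of the identity over all i8 values:
#include <cassert>
int main() {
  for (int C = -127; C <= 127; ++C)   // C == -128 is the guarded MinSigned case
    for (int X = -128; X <= 127; ++X)
      assert(!(C > X) == (X > C - 1));
}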
5471
5472/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5473/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5474/// Note: This ignores saturation, so inputs must be checked first.
5475 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5476 bool Unary, unsigned NumStages = 1) {
5477 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5478 unsigned NumElts = VT.getVectorNumElements();
5479 unsigned NumLanes = VT.getSizeInBits() / 128;
5480 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5481 unsigned Offset = Unary ? 0 : NumElts;
5482 unsigned Repetitions = 1u << (NumStages - 1);
5483 unsigned Increment = 1u << NumStages;
5484 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5485
5486 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5487 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5488 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5489 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5490 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5491 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5492 }
5493 }
5494}
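// [Editorial sketch - not part of X86ISelLowering.cpp] Worked example of the
// mask produced above: a single-stage binary pack of two v8i16 inputs into a
// v16i8 result (e.g. PACKSSWB) selects the low byte of each A element and
// then the low byte of each B element. The constants below simply restate the
// v16i8 case (16 result elements, one 128-bit lane):
#include <cassert>
#include <vector>
int main() {
  const unsigned NumLanes = 1, NumEltsPerLane = 16, Offset = 16; // binary pack
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) // Increment == 2
      Mask.push_back(Elt + Lane * NumEltsPerLane);
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
  }
  // {0,2,...,14} from A followed by {16,18,...,30} from B.
  assert(Mask.size() == 16 && Mask[0] == 0 && Mask[7] == 14 &&
         Mask[8] == 16 && Mask[15] == 30);
}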
5495
5496// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5497static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5498 APInt &DemandedLHS, APInt &DemandedRHS) {
5499 int NumLanes = VT.getSizeInBits() / 128;
5500 int NumElts = DemandedElts.getBitWidth();
5501 int NumInnerElts = NumElts / 2;
5502 int NumEltsPerLane = NumElts / NumLanes;
5503 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5504
5505 DemandedLHS = APInt::getZero(NumInnerElts);
5506 DemandedRHS = APInt::getZero(NumInnerElts);
5507
5508 // Map DemandedElts to the packed operands.
5509 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5510 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5511 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5512 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5513 if (DemandedElts[OuterIdx])
5514 DemandedLHS.setBit(InnerIdx);
5515 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5516 DemandedRHS.setBit(InnerIdx);
5517 }
5518 }
5519}
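// [Editorial sketch - not part of X86ISelLowering.cpp] The mapping above, for
// a 256-bit pack with 8-bit result elements (two 128-bit lanes): result bytes
// 0-7 and 16-23 come from the LHS, bytes 8-15 and 24-31 from the RHS, with
// each lane handled independently. std::bitset stands in for APInt here:
#include <bitset>
#include <cassert>
int main() {
  const int NumLanes = 2, NumEltsPerLane = 16, NumInnerEltsPerLane = 8;
  std::bitset<32> DemandedElts;
  DemandedElts.set(20); // result byte 20 lives in lane 1, LHS half
  std::bitset<16> DemandedLHS, DemandedRHS;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = Lane * NumEltsPerLane + Elt;
      int InnerIdx = Lane * NumInnerEltsPerLane + Elt;
      if (DemandedElts[OuterIdx])
        DemandedLHS.set(InnerIdx);
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS.set(InnerIdx);
    }
  assert(DemandedLHS.test(12) && DemandedLHS.count() == 1 && DemandedRHS.none());
}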
5520
5521// Split the demanded elts of a HADD/HSUB node between its operands.
5522static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5523 APInt &DemandedLHS, APInt &DemandedRHS) {
5524 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5525 DemandedLHS, DemandedRHS);
5526 DemandedLHS |= DemandedLHS << 1;
5527 DemandedRHS |= DemandedRHS << 1;
5528}
5529
5530/// Calculates the shuffle mask corresponding to the target-specific opcode.
5531/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5532/// operands in \p Ops, and returns true.
5533/// Sets \p IsUnary to true if only one source is used. Note that this will set
5534/// IsUnary for shuffles which use a single input multiple times, and in those
5535/// cases it will adjust the mask to only have indices within that single input.
5536/// It is an error to call this with non-empty Mask/Ops vectors.
5537static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5538 SmallVectorImpl<SDValue> &Ops,
5539 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5540 if (!isTargetShuffle(N.getOpcode()))
5541 return false;
5542
5543 MVT VT = N.getSimpleValueType();
5544 unsigned NumElems = VT.getVectorNumElements();
5545 unsigned MaskEltSize = VT.getScalarSizeInBits();
5546 SmallVector<uint64_t, 32> RawMask;
5547 APInt RawUndefs;
5548 uint64_t ImmN;
5549
5550 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5551 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5552
5553 IsUnary = false;
5554 bool IsFakeUnary = false;
5555 switch (N.getOpcode()) {
5556 case X86ISD::BLENDI:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeBLENDMask(NumElems, ImmN, Mask);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::SHUFP:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5566 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5567 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5568 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5569 break;
5570 case X86ISD::INSERTPS:
5571 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5572 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5573 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5574 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5575 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5576 break;
5577 case X86ISD::EXTRQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5580 isa<ConstantSDNode>(N.getOperand(2))) {
5581 int BitLen = N.getConstantOperandVal(1);
5582 int BitIdx = N.getConstantOperandVal(2);
5583 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5584 IsUnary = true;
5585 }
5586 break;
5587 case X86ISD::INSERTQI:
5588 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5590 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5591 isa<ConstantSDNode>(N.getOperand(3))) {
5592 int BitLen = N.getConstantOperandVal(2);
5593 int BitIdx = N.getConstantOperandVal(3);
5594 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5595 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5596 }
5597 break;
5598 case X86ISD::UNPCKH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5602 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5603 break;
5604 case X86ISD::UNPCKL:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::MOVHLPS:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVHLPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVLHPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVLHPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::VALIGN:
5623 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5624 "Only 32-bit and 64-bit elements are supported!");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodeVALIGNMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::PALIGNR:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5637 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5638 DecodePALIGNRMask(NumElems, ImmN, Mask);
5639 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5640 Ops.push_back(N.getOperand(1));
5641 Ops.push_back(N.getOperand(0));
5642 break;
5643 case X86ISD::VSHLDQ:
5644 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSLLDQMask(NumElems, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::VSRLDQ:
5651 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5652 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5653 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5654 DecodePSRLDQMask(NumElems, ImmN, Mask);
5655 IsUnary = true;
5656 break;
5657 case X86ISD::PSHUFD:
5658 case X86ISD::VPERMILPI:
5659 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5660 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5661 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5662 IsUnary = true;
5663 break;
5664 case X86ISD::PSHUFHW:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFLW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::VZEXT_MOVL:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 DecodeZeroMoveLowMask(NumElems, Mask);
5679 IsUnary = true;
5680 break;
5681 case X86ISD::VBROADCAST:
5682 // We only decode broadcasts of same-sized vectors; peeking through to
5683 // extracted subvectors is likely to cause hasOneUse issues with
5684 // SimplifyDemandedBits etc.
5685 if (N.getOperand(0).getValueType() == VT) {
5686 DecodeVectorBroadcast(NumElems, Mask);
5687 IsUnary = true;
5688 break;
5689 }
5690 return false;
5691 case X86ISD::VPERMILPV: {
5692 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5693 IsUnary = true;
5694 SDValue MaskNode = N.getOperand(1);
5695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5696 RawUndefs)) {
5697 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5698 break;
5699 }
5700 return false;
5701 }
5702 case X86ISD::PSHUFB: {
5703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5704 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5705 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = true;
5707 SDValue MaskNode = N.getOperand(1);
5708 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5709 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5710 break;
5711 }
5712 return false;
5713 }
5714 case X86ISD::VPERMI:
5715 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERMMask(NumElems, ImmN, Mask);
5718 IsUnary = true;
5719 break;
5720 case X86ISD::MOVSS:
5721 case X86ISD::MOVSD:
5722 case X86ISD::MOVSH:
5723 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5724 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5725 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5726 break;
5727 case X86ISD::VPERM2X128:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5730 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5732 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5733 break;
5734 case X86ISD::SHUF128:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5737 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5738 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5739 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5740 break;
5741 case X86ISD::MOVSLDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVSLDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::MOVSHDUP:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 DecodeMOVSHDUPMask(NumElems, Mask);
5749 IsUnary = true;
5750 break;
5751 case X86ISD::MOVDDUP:
5752 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5753 DecodeMOVDDUPMask(NumElems, Mask);
5754 IsUnary = true;
5755 break;
5756 case X86ISD::VPERMIL2: {
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5759 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5760 SDValue MaskNode = N.getOperand(2);
5761 SDValue CtrlNode = N.getOperand(3);
5762 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5763 unsigned CtrlImm = CtrlOp->getZExtValue();
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5765 RawUndefs)) {
5766 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5767 Mask);
5768 break;
5769 }
5770 }
5771 return false;
5772 }
5773 case X86ISD::VPPERM: {
5774 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5777 SDValue MaskNode = N.getOperand(2);
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5779 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5780 break;
5781 }
5782 return false;
5783 }
5784 case X86ISD::VPERMV: {
5785 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5786 IsUnary = true;
5787 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5788 Ops.push_back(N.getOperand(1));
5789 SDValue MaskNode = N.getOperand(0);
5790 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5791 RawUndefs)) {
5792 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5793 break;
5794 }
5795 return false;
5796 }
5797 case X86ISD::VPERMV3: {
5798 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5800 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5801 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5802 Ops.push_back(N.getOperand(0));
5803 Ops.push_back(N.getOperand(2));
5804 SDValue MaskNode = N.getOperand(1);
5805 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5806 RawUndefs)) {
5807 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5808 break;
5809 }
5810 return false;
5811 }
5812 default:
5813 llvm_unreachable("unknown target shuffle node");
5814 }
5815
5816 // Empty mask indicates the decode failed.
5817 if (Mask.empty())
5818 return false;
5819
5820 // Check if we're getting a shuffle mask with zero'd elements.
5821 if (!AllowSentinelZero && isAnyZero(Mask))
5822 return false;
5823
5824 // If we have a fake unary shuffle, the shuffle mask is spread across two
5825 // inputs that are actually the same node. Re-map the mask to always point
5826 // into the first input.
5827 if (IsFakeUnary)
5828 for (int &M : Mask)
5829 if (M >= (int)Mask.size())
5830 M -= Mask.size();
5831
5832 // If we didn't already add operands in the opcode-specific code, default to
5833 // adding 1 or 2 operands starting at 0.
5834 if (Ops.empty()) {
5835 Ops.push_back(N.getOperand(0));
5836 if (!IsUnary || IsFakeUnary)
5837 Ops.push_back(N.getOperand(1));
5838 }
5839
5840 return true;
5841}
5842
5843 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5844 static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5845 SmallVectorImpl<SDValue> &Ops,
5846 SmallVectorImpl<int> &Mask) {
5847 bool IsUnary;
5848 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5849}
5850
5851/// Compute whether each element of a shuffle is zeroable.
5852///
5853/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5854/// Either it is an undef element in the shuffle mask, the element of the input
5855/// referenced is undef, or the element of the input referenced is known to be
5856/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5857/// as many lanes with this technique as possible to simplify the remaining
5858/// shuffle.
5859 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5860 SDValue V1, SDValue V2,
5861 APInt &KnownUndef, APInt &KnownZero) {
5862 int Size = Mask.size();
5863 KnownUndef = KnownZero = APInt::getZero(Size);
5864
5865 V1 = peekThroughBitcasts(V1);
5866 V2 = peekThroughBitcasts(V2);
5867
5868 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5869 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5870
5871 int VectorSizeInBits = V1.getValueSizeInBits();
5872 int ScalarSizeInBits = VectorSizeInBits / Size;
5873 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5874
5875 for (int i = 0; i < Size; ++i) {
5876 int M = Mask[i];
5877 // Handle the easy cases.
5878 if (M < 0) {
5879 KnownUndef.setBit(i);
5880 continue;
5881 }
5882 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5883 KnownZero.setBit(i);
5884 continue;
5885 }
5886
5887 // Determine shuffle input and normalize the mask.
5888 SDValue V = M < Size ? V1 : V2;
5889 M %= Size;
5890
5891 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5892 if (V.getOpcode() != ISD::BUILD_VECTOR)
5893 continue;
5894
5895 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5896 // the (larger) source element must be UNDEF/ZERO.
5897 if ((Size % V.getNumOperands()) == 0) {
5898 int Scale = Size / V->getNumOperands();
5899 SDValue Op = V.getOperand(M / Scale);
5900 if (Op.isUndef())
5901 KnownUndef.setBit(i);
5902 if (X86::isZeroNode(Op))
5903 KnownZero.setBit(i);
5904 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5905 APInt Val = Cst->getAPIntValue();
5906 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5907 if (Val == 0)
5908 KnownZero.setBit(i);
5909 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5910 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5911 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5912 if (Val == 0)
5913 KnownZero.setBit(i);
5914 }
5915 continue;
5916 }
5917
5918 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5919 // elements must be UNDEF or ZERO.
5920 if ((V.getNumOperands() % Size) == 0) {
5921 int Scale = V->getNumOperands() / Size;
5922 bool AllUndef = true;
5923 bool AllZero = true;
5924 for (int j = 0; j < Scale; ++j) {
5925 SDValue Op = V.getOperand((M * Scale) + j);
5926 AllUndef &= Op.isUndef();
5927 AllZero &= X86::isZeroNode(Op);
5928 }
5929 if (AllUndef)
5930 KnownUndef.setBit(i);
5931 if (AllZero)
5932 KnownZero.setBit(i);
5933 continue;
5934 }
5935 }
5936}
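// [Editorial sketch - not part of X86ISelLowering.cpp] A small model of the
// classification above: V2 is an all-zeros build vector, V1 holds known
// constants, and the mask mixes both inputs (the concrete values are made-up
// illustration data):
#include <bitset>
#include <cassert>
#include <cstdint>
int main() {
  const int Size = 4;
  int Mask[Size] = {0, -1, 5, 2};  // element 1 is undef, element 2 reads V2
  int64_t V1[Size] = {1, 0, 3, 4}; // V1[1] would be zeroable if referenced
  const bool V2IsZero = true;
  std::bitset<Size> KnownUndef, KnownZero;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) {
      KnownUndef.set(i); // undef mask element
    } else if (M >= Size) {
      if (V2IsZero)
        KnownZero.set(i); // reads the all-zeros input
    } else if (V1[M] == 0) {
      KnownZero.set(i); // reads a zero constant
    }
  }
  assert(KnownUndef.to_ulong() == 2 /*bit 1*/ && KnownZero.to_ulong() == 4 /*bit 2*/);
}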
5937
5938/// Decode a target shuffle mask and inputs and see if any values are
5939/// known to be undef or zero from their inputs.
5940/// Returns true if the target shuffle mask was decoded.
5941/// FIXME: Merge this with computeZeroableShuffleElements?
5942 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5943 SmallVectorImpl<SDValue> &Ops,
5944 APInt &KnownUndef, APInt &KnownZero) {
5945 bool IsUnary;
5946 if (!isTargetShuffle(N.getOpcode()))
5947 return false;
5948
5949 MVT VT = N.getSimpleValueType();
5950 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5951 return false;
5952
5953 int Size = Mask.size();
5954 SDValue V1 = Ops[0];
5955 SDValue V2 = IsUnary ? V1 : Ops[1];
5956 KnownUndef = KnownZero = APInt::getZero(Size);
5957
5958 V1 = peekThroughBitcasts(V1);
5959 V2 = peekThroughBitcasts(V2);
5960
5961 assert((VT.getSizeInBits() % Size) == 0 &&
5962 "Illegal split of shuffle value type");
5963 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5964
5965 // Extract known constant input data.
5966 APInt UndefSrcElts[2];
5967 SmallVector<APInt, 32> SrcEltBits[2];
5968 bool IsSrcConstant[2] = {
5969 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5970 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false),
5972 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5973 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5974 /*AllowPartialUndefs*/ false)};
5975
5976 for (int i = 0; i < Size; ++i) {
5977 int M = Mask[i];
5978
5979 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5980 if (M < 0) {
5981 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5982 if (SM_SentinelUndef == M)
5983 KnownUndef.setBit(i);
5984 if (SM_SentinelZero == M)
5985 KnownZero.setBit(i);
5986 continue;
5987 }
5988
5989 // Determine shuffle input and normalize the mask.
5990 unsigned SrcIdx = M / Size;
5991 SDValue V = M < Size ? V1 : V2;
5992 M %= Size;
5993
5994 // We are referencing an UNDEF input.
5995 if (V.isUndef()) {
5996 KnownUndef.setBit(i);
5997 continue;
5998 }
5999
6000 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6001 // TODO: We currently only set UNDEF for integer types - floats use the same
6002 // registers as vectors and many of the scalar folded loads rely on the
6003 // SCALAR_TO_VECTOR pattern.
6004 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6005 (Size % V.getValueType().getVectorNumElements()) == 0) {
6006 int Scale = Size / V.getValueType().getVectorNumElements();
6007 int Idx = M / Scale;
6008 if (Idx != 0 && !VT.isFloatingPoint())
6009 KnownUndef.setBit(i);
6010 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6011 KnownZero.setBit(i);
6012 continue;
6013 }
6014
6015 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6016 // base vectors.
6017 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6018 SDValue Vec = V.getOperand(0);
6019 int NumVecElts = Vec.getValueType().getVectorNumElements();
6020 if (Vec.isUndef() && Size == NumVecElts) {
6021 int Idx = V.getConstantOperandVal(2);
6022 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6023 if (M < Idx || (Idx + NumSubElts) <= M)
6024 KnownUndef.setBit(i);
6025 }
6026 continue;
6027 }
6028
6029 // Attempt to extract from the source's constant bits.
6030 if (IsSrcConstant[SrcIdx]) {
6031 if (UndefSrcElts[SrcIdx][M])
6032 KnownUndef.setBit(i);
6033 else if (SrcEltBits[SrcIdx][M] == 0)
6034 KnownZero.setBit(i);
6035 }
6036 }
6037
6038 assert(VT.getVectorNumElements() == (unsigned)Size &&
6039 "Different mask size from vector size!");
6040 return true;
6041}
6042
6043// Replace target shuffle mask elements with known undef/zero sentinels.
6044 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6045 const APInt &KnownUndef,
6046 const APInt &KnownZero,
6047 bool ResolveKnownZeros = true) {
6048 unsigned NumElts = Mask.size();
6049 assert(KnownUndef.getBitWidth() == NumElts &&
6050 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6051
6052 for (unsigned i = 0; i != NumElts; ++i) {
6053 if (KnownUndef[i])
6054 Mask[i] = SM_SentinelUndef;
6055 else if (ResolveKnownZeros && KnownZero[i])
6056 Mask[i] = SM_SentinelZero;
6057 }
6058}
6059
6060// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6061 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6062 APInt &KnownUndef,
6063 APInt &KnownZero) {
6064 unsigned NumElts = Mask.size();
6065 KnownUndef = KnownZero = APInt::getZero(NumElts);
6066
6067 for (unsigned i = 0; i != NumElts; ++i) {
6068 int M = Mask[i];
6069 if (SM_SentinelUndef == M)
6070 KnownUndef.setBit(i);
6071 if (SM_SentinelZero == M)
6072 KnownZero.setBit(i);
6073 }
6074}
6075
6076// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6077 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6078 SDValue Cond, bool IsBLENDV = false) {
6079 EVT CondVT = Cond.getValueType();
6080 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6081 unsigned NumElts = CondVT.getVectorNumElements();
6082
6083 APInt UndefElts;
6084 SmallVector<APInt, 32> EltBits;
6085 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6086 /*AllowWholeUndefs*/ true,
6087 /*AllowPartialUndefs*/ false))
6088 return false;
6089
6090 Mask.resize(NumElts, SM_SentinelUndef);
6091
6092 for (int i = 0; i != (int)NumElts; ++i) {
6093 Mask[i] = i;
6094 // Arbitrarily choose from the 2nd operand if the select condition element
6095 // is undef.
6096 // TODO: Can we do better by matching patterns such as even/odd?
6097 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6098 (IsBLENDV && EltBits[i].isNonNegative()))
6099 Mask[i] += NumElts;
6100 }
6101
6102 return true;
6103}
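// [Editorial sketch - not part of X86ISelLowering.cpp] The conversion above,
// for a 4-element constant condition: an all-ones condition element keeps
// index i (first operand), a zero element picks i + NumElts (second operand),
// so {-1, 0, -1, 0} becomes the blend mask {0, 5, 2, 7}:
#include <cassert>
#include <cstdint>
#include <vector>
int main() {
  const int NumElts = 4;
  int64_t Cond[NumElts] = {-1, 0, -1, 0};
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = (Cond[i] != 0) ? i : i + NumElts;
  assert(Mask == std::vector<int>({0, 5, 2, 7}));
}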
6104
6105// Forward declaration (for getFauxShuffleMask recursive check).
6106static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6107 SmallVectorImpl<SDValue> &Inputs,
6108 SmallVectorImpl<int> &Mask,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts);
6111
6112// Attempt to decode ops that could be represented as a shuffle mask.
6113// The decoded shuffle mask may contain a different number of elements to the
6114// destination value type.
6115// TODO: Merge into getTargetShuffleInputs()
6116static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6117 SmallVectorImpl<int> &Mask,
6118 SmallVectorImpl<SDValue> &Ops,
6119 const SelectionDAG &DAG, unsigned Depth,
6120 bool ResolveKnownElts) {
6121 Mask.clear();
6122 Ops.clear();
6123
6124 MVT VT = N.getSimpleValueType();
6125 unsigned NumElts = VT.getVectorNumElements();
6126 unsigned NumSizeInBits = VT.getSizeInBits();
6127 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6128 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6129 return false;
6130 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6131 unsigned NumSizeInBytes = NumSizeInBits / 8;
6132 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6133
6134 unsigned Opcode = N.getOpcode();
6135 switch (Opcode) {
6136 case ISD::VECTOR_SHUFFLE: {
6137 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6138 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6139 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6140 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6141 Ops.push_back(N.getOperand(0));
6142 Ops.push_back(N.getOperand(1));
6143 return true;
6144 }
6145 return false;
6146 }
6147 case ISD::AND:
6148 case X86ISD::ANDNP: {
6149 // Attempt to decode as a per-byte mask.
6150 APInt UndefElts;
6151 SmallVector<APInt, 32> EltBits;
6152 SDValue N0 = N.getOperand(0);
6153 SDValue N1 = N.getOperand(1);
6154 bool IsAndN = (X86ISD::ANDNP == Opcode);
6155 uint64_t ZeroMask = IsAndN ? 255 : 0;
6156 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6157 /*AllowWholeUndefs*/ false,
6158 /*AllowPartialUndefs*/ false))
6159 return false;
6160 // We can't assume an undef src element gives an undef dst - the other src
6161 // might be zero.
6162 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6163 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6164 const APInt &ByteBits = EltBits[i];
6165 if (ByteBits != 0 && ByteBits != 255)
6166 return false;
6167 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6168 }
6169 Ops.push_back(IsAndN ? N1 : N0);
6170 return true;
6171 }
6172 case ISD::OR: {
6173 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6174 // is a valid shuffle index.
6175 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6176 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6177 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6178 return false;
6179
6180 SmallVector<int, 64> SrcMask0, SrcMask1;
6181 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6182 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6183 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6184 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6185 Depth + 1, true) ||
6186 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6187 Depth + 1, true))
6188 return false;
6189
6190 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6191 SmallVector<int, 64> Mask0, Mask1;
6192 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6193 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6194 for (int i = 0; i != (int)MaskSize; ++i) {
6195 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6196 // loops converting between OR and BLEND shuffles due to
6197 // canWidenShuffleElements merging away undef elements, meaning we
6198 // fail to recognise the OR as the undef element isn't known zero.
6199 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6200 Mask.push_back(SM_SentinelZero);
6201 else if (Mask1[i] == SM_SentinelZero)
6202 Mask.push_back(i);
6203 else if (Mask0[i] == SM_SentinelZero)
6204 Mask.push_back(i + MaskSize);
6205 else
6206 return false;
6207 }
6208 Ops.push_back(N.getOperand(0));
6209 Ops.push_back(N.getOperand(1));
6210 return true;
6211 }
6212 case ISD::CONCAT_VECTORS: {
6213 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6214 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6215 if (NumBitsPerElt == 64) {
6216 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6217 for (unsigned M = 0; M != NumSubElts; ++M)
6218 Mask.push_back((I * NumElts) + M);
6219 Ops.push_back(N.getOperand(I));
6220 }
6221 return true;
6222 }
6223 return false;
6224 }
6225 case ISD::INSERT_SUBVECTOR: {
6226 SDValue Src = N.getOperand(0);
6227 SDValue Sub = N.getOperand(1);
6228 EVT SubVT = Sub.getValueType();
6229 unsigned NumSubElts = SubVT.getVectorNumElements();
6230 uint64_t InsertIdx = N.getConstantOperandVal(2);
6231 // Subvector isn't demanded - just return the base vector.
6232 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.end(), 0);
6235 Ops.push_back(Src);
6236 return true;
6237 }
6238 // Handle CONCAT(SUB0, SUB1).
6239 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6240 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6241 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6242 Src.getOperand(0).isUndef() &&
6243 Src.getOperand(1).getValueType() == SubVT &&
6244 Src.getConstantOperandVal(2) == 0 &&
6245 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6246 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6247 Mask.resize(NumElts);
6248 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6249 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6250 Ops.push_back(Src.getOperand(1));
6251 Ops.push_back(Sub);
6252 return true;
6253 }
6254 if (!N->isOnlyUserOf(Sub.getNode()))
6255 return false;
6256
6257 SmallVector<int, 64> SubMask;
6258 SmallVector<SDValue, 2> SubInputs;
6259 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6260 EVT SubSrcVT = SubSrc.getValueType();
6261 if (!SubSrcVT.isVector())
6262 return false;
6263
6264 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6265 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6266 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6267 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6268 SDValue SubSrcSrc = SubSrc.getOperand(0);
6269 unsigned NumSubSrcSrcElts =
6270 SubSrcSrc.getValueType().getVectorNumElements();
6271 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6272 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6273 "Subvector valuetype mismatch");
6274 InsertIdx *= (MaxElts / NumElts);
6275 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6276 NumSubElts *= (MaxElts / NumElts);
6277 bool SrcIsUndef = Src.isUndef();
6278 for (int i = 0; i != (int)MaxElts; ++i)
6279 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6280 for (int i = 0; i != (int)NumSubElts; ++i)
6281 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6282 if (!SrcIsUndef)
6283 Ops.push_back(Src);
6284 Ops.push_back(SubSrcSrc);
6285 return true;
6286 }
6287
6288 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6289 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6290 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6291 Depth + 1, ResolveKnownElts))
6292 return false;
6293
6294 // Subvector shuffle inputs must not be larger than the subvector.
6295 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6296 return SubVT.getFixedSizeInBits() <
6297 SubInput.getValueSizeInBits().getFixedValue();
6298 }))
6299 return false;
6300
6301 if (SubMask.size() != NumSubElts) {
6302 assert(((SubMask.size() % NumSubElts) == 0 ||
6303 (NumSubElts % SubMask.size()) == 0) &&
6304 "Illegal submask scale");
6305 if ((NumSubElts % SubMask.size()) == 0) {
6306 int Scale = NumSubElts / SubMask.size();
6307 SmallVector<int, 64> ScaledSubMask;
6308 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6309 SubMask = ScaledSubMask;
6310 } else {
6311 int Scale = SubMask.size() / NumSubElts;
6312 NumSubElts = SubMask.size();
6313 NumElts *= Scale;
6314 InsertIdx *= Scale;
6315 }
6316 }
6317 Ops.push_back(Src);
6318 Ops.append(SubInputs.begin(), SubInputs.end());
6319 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6320 Mask.append(NumElts, SM_SentinelZero);
6321 else
6322 for (int i = 0; i != (int)NumElts; ++i)
6323 Mask.push_back(i);
6324 for (int i = 0; i != (int)NumSubElts; ++i) {
6325 int M = SubMask[i];
6326 if (0 <= M) {
6327 int InputIdx = M / NumSubElts;
6328 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6329 }
6330 Mask[i + InsertIdx] = M;
6331 }
6332 return true;
6333 }
6334 case X86ISD::PINSRB:
6335 case X86ISD::PINSRW:
6336 case ISD::SCALAR_TO_VECTOR:
6337 case ISD::INSERT_VECTOR_ELT: {
6338 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6339 // vector, for matching src/dst vector types.
6340 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6341
6342 unsigned DstIdx = 0;
6343 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6344 // Check we have an in-range constant insertion index.
6345 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6346 N.getConstantOperandAPInt(2).uge(NumElts))
6347 return false;
6348 DstIdx = N.getConstantOperandVal(2);
6349
6350 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6351 if (X86::isZeroNode(Scl)) {
6352 Ops.push_back(N.getOperand(0));
6353 for (unsigned i = 0; i != NumElts; ++i)
6354 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6355 return true;
6356 }
6357 }
6358
6359 // Peek through trunc/aext/zext/bitcast.
6360 // TODO: aext shouldn't require SM_SentinelZero padding.
6361 // TODO: handle shift of scalars.
6362 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6363 while (Scl.getOpcode() == ISD::TRUNCATE ||
6364 Scl.getOpcode() == ISD::ANY_EXTEND ||
6365 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6366 (Scl.getOpcode() == ISD::BITCAST &&
6369 Scl = Scl.getOperand(0);
6370 MinBitsPerElt =
6371 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6372 }
6373 if ((MinBitsPerElt % 8) != 0)
6374 return false;
6375
6376 // Attempt to find the source vector the scalar was extracted from.
6377 SDValue SrcExtract;
6378 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6379 Scl.getOpcode() == X86ISD::PEXTRW ||
6380 Scl.getOpcode() == X86ISD::PEXTRB) &&
6381 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6382 SrcExtract = Scl;
6383 }
6384 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6385 return false;
6386
6387 SDValue SrcVec = SrcExtract.getOperand(0);
6388 EVT SrcVT = SrcVec.getValueType();
6389 if (!SrcVT.getScalarType().isByteSized())
6390 return false;
6391 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6392 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6393 unsigned DstByte = DstIdx * NumBytesPerElt;
6394 MinBitsPerElt =
6395 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6396
6397 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6398 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6399 Ops.push_back(SrcVec);
6400 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6401 } else {
6402 Ops.push_back(SrcVec);
6403 Ops.push_back(N.getOperand(0));
6404 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6405 Mask.push_back(NumSizeInBytes + i);
6406 }
6407
6408 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6409 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6410 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6411 Mask[DstByte + i] = SrcByte + i;
6412 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6413 Mask[DstByte + i] = SM_SentinelZero;
6414 return true;
6415 }
6416 case X86ISD::PACKSS:
6417 case X86ISD::PACKUS: {
6418 SDValue N0 = N.getOperand(0);
6419 SDValue N1 = N.getOperand(1);
6420 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6422 "Unexpected input value type");
6423
6424 APInt EltsLHS, EltsRHS;
6425 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6426
6427 // If we know input saturation won't happen (or we don't care for particular
6428 // lanes), we can treat this as a truncation shuffle.
6429 bool Offset0 = false, Offset1 = false;
6430 if (Opcode == X86ISD::PACKSS) {
6431 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6432 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6433 (!(N1.isUndef() || EltsRHS.isZero()) &&
6434 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6435 return false;
6436 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6437 // PACKSS then it was likely being used for sign-extension for a
6438 // truncation, so just peek through and adjust the mask accordingly.
6439 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6440 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6441 Offset0 = true;
6442 N0 = N0.getOperand(0);
6443 }
6444 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6445 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6446 Offset1 = true;
6447 N1 = N1.getOperand(0);
6448 }
6449 } else {
6450 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6451 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6452 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6453 (!(N1.isUndef() || EltsRHS.isZero()) &&
6454 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6455 return false;
6456 }
6457
6458 bool IsUnary = (N0 == N1);
6459
6460 Ops.push_back(N0);
6461 if (!IsUnary)
6462 Ops.push_back(N1);
6463
6464 createPackShuffleMask(VT, Mask, IsUnary);
6465
6466 if (Offset0 || Offset1) {
6467 for (int &M : Mask)
6468 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6469 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6470 ++M;
6471 }
6472 return true;
6473 }
6474 case ISD::VSELECT:
6475 case X86ISD::BLENDV: {
6476 SDValue Cond = N.getOperand(0);
6477 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6478 Ops.push_back(N.getOperand(1));
6479 Ops.push_back(N.getOperand(2));
6480 return true;
6481 }
6482 return false;
6483 }
6484 case X86ISD::VTRUNC: {
6485 SDValue Src = N.getOperand(0);
6486 EVT SrcVT = Src.getValueType();
6487 if (SrcVT.getSizeInBits() != NumSizeInBits)
6488 return false;
6489 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6490 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6491 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6492 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6493 for (unsigned i = 0; i != NumSrcElts; ++i)
6494 Mask.push_back(i * Scale);
6495 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6496 Ops.push_back(Src);
6497 return true;
6498 }
6499 case ISD::SHL:
6500 case ISD::SRL: {
6501 APInt UndefElts;
6502 SmallVector<APInt, 32> EltBits;
6503 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6504 UndefElts, EltBits,
6505 /*AllowWholeUndefs*/ true,
6506 /*AllowPartialUndefs*/ false))
6507 return false;
6508
6509 // We can only decode 'whole byte' bit shifts as shuffles.
6510 for (unsigned I = 0; I != NumElts; ++I)
6511 if (DemandedElts[I] && !UndefElts[I] &&
6512 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6513 return false;
6514
6515 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6516 Ops.push_back(N.getOperand(0));
6517
6518 for (unsigned I = 0; I != NumElts; ++I) {
6519 if (!DemandedElts[I] || UndefElts[I])
6520 continue;
6521 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6522 unsigned Lo = I * NumBytesPerElt;
6523 unsigned Hi = Lo + NumBytesPerElt;
6524 // Clear mask to all zeros and insert the shifted byte indices.
6525 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6526 if (ISD::SHL == Opcode)
6527 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6528 else
6529 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6530 Lo + ByteShift);
6531 }
6532 return true;
6533 }
6534 case X86ISD::VSHLI:
6535 case X86ISD::VSRLI: {
6536 uint64_t ShiftVal = N.getConstantOperandVal(1);
6537 // Out of range bit shifts are guaranteed to be zero.
6538 if (NumBitsPerElt <= ShiftVal) {
6539 Mask.append(NumElts, SM_SentinelZero);
6540 return true;
6541 }
6542
6543 // We can only decode 'whole byte' bit shifts as shuffles.
6544 if ((ShiftVal % 8) != 0)
6545 break;
6546
6547 uint64_t ByteShift = ShiftVal / 8;
6548 Ops.push_back(N.getOperand(0));
6549
6550 // Clear mask to all zeros and insert the shifted byte indices.
6551 Mask.append(NumSizeInBytes, SM_SentinelZero);
6552
6553 if (X86ISD::VSHLI == Opcode) {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j] = i + j - ByteShift;
6557 } else {
6558 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6559 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6560 Mask[i + j - ByteShift] = i + j;
6561 }
6562 return true;
6563 }
6564 case X86ISD::VROTLI:
6565 case X86ISD::VROTRI: {
6566 // We can only decode 'whole byte' bit rotates as shuffles.
6567 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6568 if ((RotateVal % 8) != 0)
6569 return false;
6570 Ops.push_back(N.getOperand(0));
6571 int Offset = RotateVal / 8;
6572 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6573 for (int i = 0; i != (int)NumElts; ++i) {
6574 int BaseIdx = i * NumBytesPerElt;
6575 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6576 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6577 }
6578 }
6579 return true;
6580 }
6581 case X86ISD::VBROADCAST: {
6582 SDValue Src = N.getOperand(0);
6583 if (!Src.getSimpleValueType().isVector()) {
6584 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6585 !isNullConstant(Src.getOperand(1)) ||
6586 Src.getOperand(0).getValueType().getScalarType() !=
6587 VT.getScalarType())
6588 return false;
6589 Src = Src.getOperand(0);
6590 }
6591 Ops.push_back(Src);
6592 Mask.append(NumElts, 0);
6593 return true;
6594 }
6595 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6596 SDValue Src = N.getOperand(0);
6597 EVT SrcVT = Src.getValueType();
6598 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6599
6600 // Extended source must be a simple vector.
6601 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6602 (NumBitsPerSrcElt % 8) != 0)
6603 return false;
6604
6605 // We can only handle all-signbits extensions.
6606 APInt DemandedSrcElts =
6607 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6608 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6609 return false;
6610
6611 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6612 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6613 for (unsigned I = 0; I != NumElts; ++I)
6614 Mask.append(Scale, I);
6615 Ops.push_back(Src);
6616 return true;
6617 }
6618 case ISD::ZERO_EXTEND:
6619 case ISD::ANY_EXTEND:
6620 case ISD::ZERO_EXTEND_VECTOR_INREG:
6621 case ISD::ANY_EXTEND_VECTOR_INREG: {
6622 SDValue Src = N.getOperand(0);
6623 EVT SrcVT = Src.getValueType();
6624
6625 // Extended source must be a simple vector.
6626 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6627 (SrcVT.getScalarSizeInBits() % 8) != 0)
6628 return false;
6629
6630 bool IsAnyExtend =
6631 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6632 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6633 IsAnyExtend, Mask);
6634 Ops.push_back(Src);
6635 return true;
6636 }
6637 }
6638
6639 return false;
6640}
6641
6642/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6643 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6644 SmallVectorImpl<int> &Mask) {
6645 int MaskWidth = Mask.size();
6646 SmallVector<SDValue, 16> UsedInputs;
6647 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6648 int lo = UsedInputs.size() * MaskWidth;
6649 int hi = lo + MaskWidth;
6650
6651 // Strip UNDEF input usage.
6652 if (Inputs[i].isUndef())
6653 for (int &M : Mask)
6654 if ((lo <= M) && (M < hi))
6655 M = SM_SentinelUndef;
6656
6657 // Check for unused inputs.
6658 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6659 for (int &M : Mask)
6660 if (lo <= M)
6661 M -= MaskWidth;
6662 continue;
6663 }
6664
6665 // Check for repeated inputs.
6666 bool IsRepeat = false;
6667 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6668 if (UsedInputs[j] != Inputs[i])
6669 continue;
6670 for (int &M : Mask)
6671 if (lo <= M)
6672 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6673 IsRepeat = true;
6674 break;
6675 }
6676 if (IsRepeat)
6677 continue;
6678
6679 UsedInputs.push_back(Inputs[i]);
6680 }
6681 Inputs = UsedInputs;
6682}
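// [Editorial sketch - not part of X86ISelLowering.cpp] A simplified model of
// the unused-input elimination above (undef and repeated-input handling
// omitted): a two-input mask that only references the second input ends up
// with a single input and indices rebased to zero:
#include <cassert>
#include <string>
#include <vector>
int main() {
  const int MaskWidth = 4;
  std::vector<std::string> Inputs = {"A", "B"};
  std::vector<int> Mask = {4, 5, 6, 7}; // only "B" is referenced
  std::vector<std::string> UsedInputs;
  for (size_t i = 0; i < Inputs.size(); ++i) {
    int lo = (int)UsedInputs.size() * MaskWidth, hi = lo + MaskWidth;
    bool Used = false;
    for (int M : Mask)
      Used |= (lo <= M && M < hi);
    if (!Used) { // drop the input and shift later indices down
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }
    UsedInputs.push_back(Inputs[i]);
  }
  assert(UsedInputs == std::vector<std::string>({"B"}));
  assert(Mask == std::vector<int>({0, 1, 2, 3}));
}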
6683
6684/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6685/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6686/// Returns true if the target shuffle mask was decoded.
6687static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6688 SmallVectorImpl<SDValue> &Inputs,
6689 SmallVectorImpl<int> &Mask,
6690 APInt &KnownUndef, APInt &KnownZero,
6691 const SelectionDAG &DAG, unsigned Depth,
6692 bool ResolveKnownElts) {
6693 if (Depth >= SelectionDAG::MaxRecursionDepth)
6694 return false; // Limit search depth.
6695
6696 EVT VT = Op.getValueType();
6697 if (!VT.isSimple() || !VT.isVector())
6698 return false;
6699
6700 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6701 if (ResolveKnownElts)
6702 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6703 return true;
6704 }
6705 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6706 ResolveKnownElts)) {
6707 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6708 return true;
6709 }
6710 return false;
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6714 SmallVectorImpl<SDValue> &Inputs,
6715 SmallVectorImpl<int> &Mask,
6716 const SelectionDAG &DAG, unsigned Depth,
6717 bool ResolveKnownElts) {
6718 APInt KnownUndef, KnownZero;
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6720 KnownZero, DAG, Depth, ResolveKnownElts);
6721}
6722
6723 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6724 SmallVectorImpl<int> &Mask,
6725 const SelectionDAG &DAG, unsigned Depth = 0,
6726 bool ResolveKnownElts = true) {
6727 EVT VT = Op.getValueType();
6728 if (!VT.isSimple() || !VT.isVector())
6729 return false;
6730
6731 unsigned NumElts = Op.getValueType().getVectorNumElements();
6732 APInt DemandedElts = APInt::getAllOnes(NumElts);
6733 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6734 ResolveKnownElts);
6735}
6736
6737// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6738static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6739 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6740 SelectionDAG &DAG) {
6741 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6742 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6743 "Unknown broadcast load type");
6744
6745 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6746 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6747 return SDValue();
6748
6749 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6750 TypeSize::getFixed(Offset), DL);
6751 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6752 SDValue Ops[] = {Mem->getChain(), Ptr};
6753 SDValue BcstLd = DAG.getMemIntrinsicNode(
6754 Opcode, DL, Tys, Ops, MemVT,
6755 DAG.getMachineFunction().getMachineMemOperand(
6756 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6757 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6758 return BcstLd;
6759}
6760
6761/// Returns the scalar element that will make up the i'th
6762/// element of the result of the vector shuffle.
6763static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6764 SelectionDAG &DAG, unsigned Depth) {
6765 if (Depth >= SelectionDAG::MaxRecursionDepth)
6766 return SDValue(); // Limit search depth.
6767
6768 EVT VT = Op.getValueType();
6769 unsigned Opcode = Op.getOpcode();
6770 unsigned NumElems = VT.getVectorNumElements();
6771
6772 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6773 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6774 int Elt = SV->getMaskElt(Index);
6775
6776 if (Elt < 0)
6777 return DAG.getUNDEF(VT.getVectorElementType());
6778
6779 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6780 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6781 }
6782
6783 // Recurse into target specific vector shuffles to find scalars.
6784 if (isTargetShuffle(Opcode)) {
6785 MVT ShufVT = VT.getSimpleVT();
6786 MVT ShufSVT = ShufVT.getVectorElementType();
6787 int NumElems = (int)ShufVT.getVectorNumElements();
6788 SmallVector<int, 16> ShuffleMask;
6789 SmallVector<SDValue, 16> ShuffleOps;
6790 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6791 return SDValue();
6792
6793 int Elt = ShuffleMask[Index];
6794 if (Elt == SM_SentinelZero)
6795 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6796 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6797 if (Elt == SM_SentinelUndef)
6798 return DAG.getUNDEF(ShufSVT);
6799
6800 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6801 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6802 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6803 }
6804
6805 // Recurse into insert_subvector base/sub vector to find scalars.
6806 if (Opcode == ISD::INSERT_SUBVECTOR) {
6807 SDValue Vec = Op.getOperand(0);
6808 SDValue Sub = Op.getOperand(1);
6809 uint64_t SubIdx = Op.getConstantOperandVal(2);
6810 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6811
6812 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6813 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6814 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6815 }
6816
6817 // Recurse into concat_vectors sub vector to find scalars.
6818 if (Opcode == ISD::CONCAT_VECTORS) {
6819 EVT SubVT = Op.getOperand(0).getValueType();
6820 unsigned NumSubElts = SubVT.getVectorNumElements();
6821 uint64_t SubIdx = Index / NumSubElts;
6822 uint64_t SubElt = Index % NumSubElts;
6823 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6824 }
6825
6826 // Recurse into extract_subvector src vector to find scalars.
6827 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6828 SDValue Src = Op.getOperand(0);
6829 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6830 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6831 }
6832
6833 // We only peek through bitcasts of the same vector width.
6834 if (Opcode == ISD::BITCAST) {
6835 SDValue Src = Op.getOperand(0);
6836 EVT SrcVT = Src.getValueType();
6837 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6838 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6839 return SDValue();
6840 }
6841
6842 // Actual nodes that may contain scalar elements
6843
6844 // For insert_vector_elt - either return the index matching scalar or recurse
6845 // into the base vector.
6846 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6847 isa<ConstantSDNode>(Op.getOperand(2))) {
6848 if (Op.getConstantOperandAPInt(2) == Index)
6849 return Op.getOperand(1);
6850 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6851 }
6852
6853 if (Opcode == ISD::SCALAR_TO_VECTOR)
6854 return (Index == 0) ? Op.getOperand(0)
6855 : DAG.getUNDEF(VT.getVectorElementType());
6856
6857 if (Opcode == ISD::BUILD_VECTOR)
6858 return Op.getOperand(Index);
6859
6860 return SDValue();
6861}
6862
6863// Use PINSRB/PINSRW/PINSRD to create a build vector.
6864 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6865 const APInt &NonZeroMask,
6866 unsigned NumNonZero, unsigned NumZero,
6867 SelectionDAG &DAG,
6868 const X86Subtarget &Subtarget) {
6869 MVT VT = Op.getSimpleValueType();
6870 unsigned NumElts = VT.getVectorNumElements();
6871 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6872 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6873 "Illegal vector insertion");
6874
6875 SDValue V;
6876 bool First = true;
6877
6878 for (unsigned i = 0; i < NumElts; ++i) {
6879 bool IsNonZero = NonZeroMask[i];
6880 if (!IsNonZero)
6881 continue;
6882
6883 // If the build vector contains zeros or our first insertion is not the
6884 // first index, then insert into a zero vector to break any register
6885 // dependency; else use SCALAR_TO_VECTOR.
6886 if (First) {
6887 First = false;
6888 if (NumZero || 0 != i)
6889 V = getZeroVector(VT, Subtarget, DAG, DL);
6890 else {
6891 assert(0 == i && "Expected insertion into zero-index");
6892 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6893 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6894 V = DAG.getBitcast(VT, V);
6895 continue;
6896 }
6897 }
6898 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6899 DAG.getVectorIdxConstant(i, DL));
6900 }
6901
6902 return V;
6903}
6904
6905/// Custom lower build_vector of v16i8.
6906 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6907 const APInt &NonZeroMask,
6908 unsigned NumNonZero, unsigned NumZero,
6909 SelectionDAG &DAG,
6910 const X86Subtarget &Subtarget) {
6911 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 return SDValue();
6913
6914 // SSE4.1 - use PINSRB to insert each byte directly.
6915 if (Subtarget.hasSSE41())
6916 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6917 DAG, Subtarget);
6918
6919 SDValue V;
6920
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6923 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6924 !NonZeroMask.extractBits(2, 2).isZero()) {
6925 for (unsigned I = 0; I != 4; ++I) {
6926 if (!NonZeroMask[I])
6927 continue;
6928 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6929 if (I != 0)
6930 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6931 DAG.getConstant(I * 8, DL, MVT::i8));
6932 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6933 }
6934 assert(V && "Failed to fold v16i8 vector to zero");
6935 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6936 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6937 V = DAG.getBitcast(MVT::v8i16, V);
6938 }
6939 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6940 bool ThisIsNonZero = NonZeroMask[i];
6941 bool NextIsNonZero = NonZeroMask[i + 1];
6942 if (!ThisIsNonZero && !NextIsNonZero)
6943 continue;
6944
6945 SDValue Elt;
6946 if (ThisIsNonZero) {
6947 if (NumZero || NextIsNonZero)
6948 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6949 else
6950 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6951 }
6952
6953 if (NextIsNonZero) {
6954 SDValue NextElt = Op.getOperand(i + 1);
6955 if (i == 0 && NumZero)
6956 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6957 else
6958 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6959 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6960 DAG.getConstant(8, DL, MVT::i8));
6961 if (ThisIsNonZero)
6962 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6963 else
6964 Elt = NextElt;
6965 }
6966
6967 // If our first insertion is not the first index or zeros are needed, then
6968 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6969 // elements undefined).
6970 if (!V) {
6971 if (i != 0 || NumZero)
6972 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6973 else {
6974 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6975 V = DAG.getBitcast(MVT::v8i16, V);
6976 continue;
6977 }
6978 }
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6981 DAG.getVectorIdxConstant(i / 2, DL));
6982 }
6983
6984 return DAG.getBitcast(MVT::v16i8, V);
6985}
6986
6987/// Custom lower build_vector of v8i16.
6988static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6989 const APInt &NonZeroMask,
6990 unsigned NumNonZero, unsigned NumZero,
6991 SelectionDAG &DAG,
6992 const X86Subtarget &Subtarget) {
6993 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6994 return SDValue();
6995
6996 // Use PINSRW to insert each 16-bit element directly.
6997 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6998 Subtarget);
6999}
7000
7001/// Custom lower build_vector of v4i32 or v4f32.
7002static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7003 SelectionDAG &DAG,
7004 const X86Subtarget &Subtarget) {
7005 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7006 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7007 // Because we're creating a less complicated build vector here, we may enable
7008 // further folding of the MOVDDUP via shuffle transforms.
7009 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7010 Op.getOperand(0) == Op.getOperand(2) &&
7011 Op.getOperand(1) == Op.getOperand(3) &&
7012 Op.getOperand(0) != Op.getOperand(1)) {
7013 MVT VT = Op.getSimpleValueType();
7014 MVT EltVT = VT.getVectorElementType();
7015 // Create a new build vector with the first 2 elements followed by undef
7016 // padding, bitcast to v2f64, duplicate, and bitcast back.
7017 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7018 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7019 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7020 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7021 return DAG.getBitcast(VT, Dup);
7022 }
7023
7024 // Find all zeroable elements.
7025 std::bitset<4> Zeroable, Undefs;
7026 for (int i = 0; i < 4; ++i) {
7027 SDValue Elt = Op.getOperand(i);
7028 Undefs[i] = Elt.isUndef();
7029 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7030 }
7031 assert(Zeroable.size() - Zeroable.count() > 1 &&
7032 "We expect at least two non-zero elements!");
7033
7034 // We only know how to deal with build_vector nodes where elements are either
7035 // zeroable or extract_vector_elt with constant index.
7036 SDValue FirstNonZero;
7037 unsigned FirstNonZeroIdx;
7038 for (unsigned i = 0; i < 4; ++i) {
7039 if (Zeroable[i])
7040 continue;
7041 SDValue Elt = Op.getOperand(i);
7042 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7043 !isa<ConstantSDNode>(Elt.getOperand(1)))
7044 return SDValue();
7045 // Make sure that this node is extracting from a 128-bit vector.
7046 MVT VT = Elt.getOperand(0).getSimpleValueType();
7047 if (!VT.is128BitVector())
7048 return SDValue();
7049 if (!FirstNonZero.getNode()) {
7050 FirstNonZero = Elt;
7051 FirstNonZeroIdx = i;
7052 }
7053 }
7054
7055 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7056 SDValue V1 = FirstNonZero.getOperand(0);
7057 MVT VT = V1.getSimpleValueType();
7058
7059 // See if this build_vector can be lowered as a blend with zero.
7060 SDValue Elt;
7061 unsigned EltMaskIdx, EltIdx;
7062 int Mask[4];
7063 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7064 if (Zeroable[EltIdx]) {
7065 // The zero vector will be on the right hand side.
7066 Mask[EltIdx] = EltIdx+4;
7067 continue;
7068 }
7069
7070 Elt = Op->getOperand(EltIdx);
7071 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7072 EltMaskIdx = Elt.getConstantOperandVal(1);
7073 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7074 break;
7075 Mask[EltIdx] = EltIdx;
7076 }
7077
7078 if (EltIdx == 4) {
7079 // Let the shuffle legalizer deal with blend operations.
7080 SDValue VZeroOrUndef = (Zeroable == Undefs)
7081 ? DAG.getUNDEF(VT)
7082 : getZeroVector(VT, Subtarget, DAG, DL);
7083 if (V1.getSimpleValueType() != VT)
7084 V1 = DAG.getBitcast(VT, V1);
7085 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7086 }
7087
7088 // See if we can lower this build_vector to a INSERTPS.
7089 if (!Subtarget.hasSSE41())
7090 return SDValue();
7091
7092 SDValue V2 = Elt.getOperand(0);
7093 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7094 V1 = SDValue();
7095
7096 bool CanFold = true;
7097 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7098 if (Zeroable[i])
7099 continue;
7100
7101 SDValue Current = Op->getOperand(i);
7102 SDValue SrcVector = Current->getOperand(0);
7103 if (!V1.getNode())
7104 V1 = SrcVector;
7105 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7106 }
7107
7108 if (!CanFold)
7109 return SDValue();
7110
7111 assert(V1.getNode() && "Expected at least two non-zero elements!");
7112 if (V1.getSimpleValueType() != MVT::v4f32)
7113 V1 = DAG.getBitcast(MVT::v4f32, V1);
7114 if (V2.getSimpleValueType() != MVT::v4f32)
7115 V2 = DAG.getBitcast(MVT::v4f32, V2);
7116
7117 // Ok, we can emit an INSERTPS instruction.
7118 unsigned ZMask = Zeroable.to_ulong();
7119
7120 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7121 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7122 SDValue Result =
7123 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7124 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7125 return DAG.getBitcast(VT, Result);
7126}
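// Illustrative sketch (editorial; not from this file, helper name is made
// up): how the INSERTPS immediate assembled above encodes its fields -
// bits [7:6] pick the source element, bits [5:4] the destination lane, and
// bits [3:0] zero out destination lanes.
#include <cassert>
#include <cstdint>

static uint8_t makeInsertPSImm(unsigned SrcElt, unsigned DstElt,
                               unsigned ZeroMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "field out of range");
  return static_cast<uint8_t>(SrcElt << 6 | DstElt << 4 | ZeroMask);
}
// Example: makeInsertPSImm(2, 1, 0b1000) == 0x98 - insert source element 2
// into destination lane 1 and zero lane 3.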
7127
7128/// Return a vector logical shift node.
7129static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7130 SelectionDAG &DAG, const TargetLowering &TLI,
7131 const SDLoc &dl) {
7132 assert(VT.is128BitVector() && "Unknown type for VShift");
7133 MVT ShVT = MVT::v16i8;
7134 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7135 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7136 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7137 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7138 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7139}
7140
7141static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7142 SelectionDAG &DAG) {
7143
7144 // Check if the scalar load can be widened into a vector load. And if
7145 // the address is "base + cst" see if the cst can be "absorbed" into
7146 // the shuffle mask.
7147 if (auto *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7148 SDValue Ptr = LD->getBasePtr();
7149 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7150 return SDValue();
7151 EVT PVT = LD->getValueType(0);
7152 if (PVT != MVT::i32 && PVT != MVT::f32)
7153 return SDValue();
7154
7155 int FI = -1;
7156 int64_t Offset = 0;
7157 if (auto *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7158 FI = FINode->getIndex();
7159 Offset = 0;
7160 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7161 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7162 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7163 Offset = Ptr.getConstantOperandVal(1);
7164 Ptr = Ptr.getOperand(0);
7165 } else {
7166 return SDValue();
7167 }
7168
7169 // FIXME: 256-bit vector instructions don't require a strict alignment,
7170 // improve this code to support it better.
7171 Align RequiredAlign(VT.getSizeInBits() / 8);
7172 SDValue Chain = LD->getChain();
7173 // Make sure the stack object alignment is at least 16 or 32.
7174 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7175 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7176 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7177 if (MFI.isFixedObjectIndex(FI)) {
7178 // Can't change the alignment. FIXME: It's possible to compute
7179 // the exact stack offset and reference FI + adjust offset instead.
7180 // If someone *really* cares about this. That's the way to implement it.
7181 return SDValue();
7182 } else {
7183 MFI.setObjectAlignment(FI, RequiredAlign);
7184 }
7185 }
7186
7187 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7188 // Ptr + (Offset & ~15).
7189 if (Offset < 0)
7190 return SDValue();
7191 if ((Offset % RequiredAlign.value()) & 3)
7192 return SDValue();
7193 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7194 if (StartOffset) {
7195 SDLoc DL(Ptr);
7196 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7197 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7198 }
7199
7200 int EltNo = (Offset - StartOffset) >> 2;
7201 unsigned NumElems = VT.getVectorNumElements();
7202
7203 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7204 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7205 LD->getPointerInfo().getWithOffset(StartOffset));
7206
7207 SmallVector<int, 8> Mask(NumElems, EltNo);
7208
7209 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7210 }
7211
7212 return SDValue();
7213}
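// Illustrative sketch (editorial; not from this file, names are made up):
// the offset arithmetic used above, modelled on plain integers. The load is
// re-based to the aligned chunk start and the splatted element index is the
// 4-byte slot within that chunk.
#include <cstdint>

struct SplatLoadSplit {
  int64_t StartOffset; // aligned base of the widened vector load
  int64_t EltNo;       // index of the 32-bit element to splat
};

static SplatLoadSplit splitSplatLoadOffset(int64_t Offset,
                                           uint64_t RequiredAlign) {
  // Mirrors: StartOffset = Offset & ~(RequiredAlign - 1);
  //          EltNo       = (Offset - StartOffset) >> 2;
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  return {StartOffset, (Offset - StartOffset) >> 2};
}
// e.g. splitSplatLoadOffset(20, 16) yields {16, 1}: load 16 bytes at +16 and
// splat element 1.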
7214
7215// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
7216static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7217 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7218 auto *BaseLd = cast<LoadSDNode>(Elt);
7219 if (!BaseLd->isSimple())
7220 return false;
7221 Ld = BaseLd;
7222 ByteOffset = 0;
7223 return true;
7224 }
7225
7226 switch (Elt.getOpcode()) {
7227 case ISD::BITCAST:
7228 case ISD::TRUNCATE:
7229 case ISD::SCALAR_TO_VECTOR:
7230 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7231 case ISD::SRL:
7232 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7233 uint64_t Amt = AmtC->getZExtValue();
7234 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7235 ByteOffset += Amt / 8;
7236 return true;
7237 }
7238 }
7239 break;
7240 case ISD::EXTRACT_VECTOR_ELT:
7241 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7242 SDValue Src = Elt.getOperand(0);
7243 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7244 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7245 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7246 findEltLoadSrc(Src, Ld, ByteOffset)) {
7247 uint64_t Idx = IdxC->getZExtValue();
7248 ByteOffset += Idx * (SrcSizeInBits / 8);
7249 return true;
7250 }
7251 }
7252 break;
7253 }
7254
7255 return false;
7256}
7257
7258/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7259/// elements can be replaced by a single large load which has the same value as
7260/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7261///
7262/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7263static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7264 const SDLoc &DL, SelectionDAG &DAG,
7265 const X86Subtarget &Subtarget,
7266 bool IsAfterLegalize) {
7267 if ((VT.getScalarSizeInBits() % 8) != 0)
7268 return SDValue();
7269
7270 unsigned NumElems = Elts.size();
7271
7272 int LastLoadedElt = -1;
7273 APInt LoadMask = APInt::getZero(NumElems);
7274 APInt ZeroMask = APInt::getZero(NumElems);
7275 APInt UndefMask = APInt::getZero(NumElems);
7276
7277 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7278 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7279
7280 // For each element in the initializer, see if we've found a load, zero or an
7281 // undef.
7282 for (unsigned i = 0; i < NumElems; ++i) {
7283 SDValue Elt = peekThroughBitcasts(Elts[i]);
7284 if (!Elt.getNode())
7285 return SDValue();
7286 if (Elt.isUndef()) {
7287 UndefMask.setBit(i);
7288 continue;
7289 }
7290 if (X86::isZeroNode(Elt)) {
7291 ZeroMask.setBit(i);
7292 continue;
7293 }
7294
7295 // Each loaded element must be the correct fractional portion of the
7296 // requested vector load.
7297 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7298 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7299 return SDValue();
7300
7301 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7302 return SDValue();
7303 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7304 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7305 return SDValue();
7306
7307 LoadMask.setBit(i);
7308 LastLoadedElt = i;
7309 }
7310 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7311 NumElems &&
7312 "Incomplete element masks");
7313
7314 // Handle Special Cases - all undef or undef/zero.
7315 if (UndefMask.popcount() == NumElems)
7316 return DAG.getUNDEF(VT);
7317 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7318 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7319 : DAG.getConstantFP(0.0, DL, VT);
7320
7321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7322 int FirstLoadedElt = LoadMask.countr_zero();
7323 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7324 EVT EltBaseVT = EltBase.getValueType();
7325 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7326 "Register/Memory size mismatch");
7327 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7328 assert(LDBase && "Did not find base load for merging consecutive loads");
7329 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7330 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7331 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7332 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7333 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7334
7335 // TODO: Support offsetting the base load.
7336 if (ByteOffsets[FirstLoadedElt] != 0)
7337 return SDValue();
7338
7339 // Check to see if the element's load is consecutive to the base load
7340 // or offset from a previous (already checked) load.
7341 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7342 LoadSDNode *Ld = Loads[EltIdx];
7343 int64_t ByteOffset = ByteOffsets[EltIdx];
7344 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7345 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7346 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7347 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7348 }
7349 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7350 EltIdx - FirstLoadedElt);
7351 };
7352
7353 // Consecutive loads can contain UNDEFS but not ZERO elements.
7354 // Consecutive loads with UNDEF and ZERO elements require an additional
7355 // shuffle stage to clear the ZERO elements.
7356 bool IsConsecutiveLoad = true;
7357 bool IsConsecutiveLoadWithZeros = true;
7358 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7359 if (LoadMask[i]) {
7360 if (!CheckConsecutiveLoad(LDBase, i)) {
7361 IsConsecutiveLoad = false;
7362 IsConsecutiveLoadWithZeros = false;
7363 break;
7364 }
7365 } else if (ZeroMask[i]) {
7366 IsConsecutiveLoad = false;
7367 }
7368 }
7369
7370 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7371 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7372 assert(LDBase->isSimple() &&
7373 "Cannot merge volatile or atomic loads.");
7374 SDValue NewLd =
7375 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7376 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7377 for (auto *LD : Loads)
7378 if (LD)
7379 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7380 return NewLd;
7381 };
7382
7383 // Check if the base load is entirely dereferenceable.
7384 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7385 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7386
7387 // LOAD - all consecutive load/undefs (must start/end with a load or be
7388 // entirely dereferenceable). If we have found an entire vector of loads and
7389 // undefs, then return a large load of the entire vector width starting at the
7390 // base pointer. If the vector contains zeros, then attempt to shuffle those
7391 // elements.
7392 if (FirstLoadedElt == 0 &&
7393 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7394 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7395 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7396 return SDValue();
7397
7398 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7399 // will lower to regular temporal loads and use the cache.
7400 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7401 VT.is256BitVector() && !Subtarget.hasInt256())
7402 return SDValue();
7403
7404 if (NumElems == 1)
7405 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7406
7407 if (!ZeroMask)
7408 return CreateLoad(VT, LDBase);
7409
7410 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7411 // vector and a zero vector to clear out the zero elements.
7412 if (!IsAfterLegalize && VT.isVector()) {
7413 unsigned NumMaskElts = VT.getVectorNumElements();
7414 if ((NumMaskElts % NumElems) == 0) {
7415 unsigned Scale = NumMaskElts / NumElems;
7416 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7417 for (unsigned i = 0; i < NumElems; ++i) {
7418 if (UndefMask[i])
7419 continue;
7420 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7421 for (unsigned j = 0; j != Scale; ++j)
7422 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7423 }
7424 SDValue V = CreateLoad(VT, LDBase);
7425 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7426 : DAG.getConstantFP(0.0, DL, VT);
7427 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7428 }
7429 }
7430 }
7431
7432 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7433 if (VT.is256BitVector() || VT.is512BitVector()) {
7434 unsigned HalfNumElems = NumElems / 2;
7435 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7436 EVT HalfVT =
7437 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7438 SDValue HalfLD =
7439 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7440 DAG, Subtarget, IsAfterLegalize);
7441 if (HalfLD)
7442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7443 HalfLD, DAG.getVectorIdxConstant(0, DL));
7444 }
7445 }
7446
7447 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7448 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7449 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7450 LoadSizeInBits == 64) &&
7451 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7452 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7453 : MVT::getIntegerVT(LoadSizeInBits);
7454 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7455 // Allow v4f32 on SSE1 only targets.
7456 // FIXME: Add more isel patterns so we can just use VT directly.
7457 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7458 VecVT = MVT::v4f32;
7459 if (TLI.isTypeLegal(VecVT)) {
7460 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7461 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7462 SDValue ResNode = DAG.getMemIntrinsicNode(
7463 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7464 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7465 for (auto *LD : Loads)
7466 if (LD)
7467 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7468 return DAG.getBitcast(VT, ResNode);
7469 }
7470 }
7471
7472 // BROADCAST - match the smallest possible repetition pattern, load that
7473 // scalar/subvector element and then broadcast to the entire vector.
7474 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7475 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7476 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7477 unsigned RepeatSize = SubElems * BaseSizeInBits;
7478 unsigned ScalarSize = std::min(RepeatSize, 64u);
7479 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7480 continue;
7481
7482 // Don't attempt a 1:N subvector broadcast - it should be caught by
7483 // combineConcatVectorOps, else will cause infinite loops.
7484 if (RepeatSize > ScalarSize && SubElems == 1)
7485 continue;
7486
7487 bool Match = true;
7488 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7489 for (unsigned i = 0; i != NumElems && Match; ++i) {
7490 if (!LoadMask[i])
7491 continue;
7492 SDValue Elt = peekThroughBitcasts(Elts[i]);
7493 if (RepeatedLoads[i % SubElems].isUndef())
7494 RepeatedLoads[i % SubElems] = Elt;
7495 else
7496 Match &= (RepeatedLoads[i % SubElems] == Elt);
7497 }
7498
7499 // We must have loads at both ends of the repetition.
7500 Match &= !RepeatedLoads.front().isUndef();
7501 Match &= !RepeatedLoads.back().isUndef();
7502 if (!Match)
7503 continue;
7504
7505 EVT RepeatVT =
7506 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7507 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7508 : EVT::getFloatingPointVT(ScalarSize);
7509 if (RepeatSize > ScalarSize)
7510 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7511 RepeatSize / ScalarSize);
7512 EVT BroadcastVT =
7513 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7514 VT.getSizeInBits() / ScalarSize);
7515 if (TLI.isTypeLegal(BroadcastVT)) {
7516 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7517 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7518 SDValue Broadcast = RepeatLoad;
7519 if (RepeatSize > ScalarSize) {
7520 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7521 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7522 } else {
7523 if (!Subtarget.hasAVX2() &&
7524 !X86::mayFoldLoadIntoBroadcastFromMem(
7525 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7526 Subtarget,
7527 /*AssumeSingleUse=*/true))
7528 return SDValue();
7529 Broadcast =
7530 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7531 }
7532 return DAG.getBitcast(VT, Broadcast);
7533 }
7534 }
7535 }
7536 }
7537
7538 return SDValue();
7539}
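// Illustrative sketch (editorial; not from this file): a scalar model of the
// broadcast repetition search above. Elements are integer ids, -1 means
// undef; the function returns the smallest power-of-two prefix length whose
// pattern repeats across the whole vector, or 0 if none does.
#include <cstddef>
#include <vector>

static size_t smallestRepeatLength(const std::vector<int> &Elts) {
  for (size_t Sub = 1; Sub < Elts.size(); Sub *= 2) {
    std::vector<int> Rep(Sub, -1);
    bool Match = true;
    for (size_t I = 0; I < Elts.size() && Match; ++I) {
      if (Elts[I] < 0)
        continue; // undef matches anything
      if (Rep[I % Sub] < 0)
        Rep[I % Sub] = Elts[I];
      else
        Match = Rep[I % Sub] == Elts[I];
    }
    // Like the DAG code, require defined values at both ends of the pattern.
    if (Match && Rep.front() >= 0 && Rep.back() >= 0)
      return Sub;
  }
  return 0;
}
// e.g. {7, 9, 7, 9} -> 2, while {7, 9, 9, 7} -> 0 (no power-of-two repeat
// shorter than the full vector).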
7540
7541// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7542// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7543// are consecutive, non-overlapping, and in the right order.
7544static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7545 SelectionDAG &DAG,
7546 const X86Subtarget &Subtarget,
7547 bool IsAfterLegalize) {
7548 SmallVector<SDValue, 64> Elts;
7549 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7550 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7551 Elts.push_back(Elt);
7552 continue;
7553 }
7554 return SDValue();
7555 }
7556 assert(Elts.size() == VT.getVectorNumElements());
7557 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7558 IsAfterLegalize);
7559}
7560
7561static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7562 const APInt &Undefs, LLVMContext &C) {
7563 unsigned ScalarSize = VT.getScalarSizeInBits();
7564 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7565
7566 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7567 if (VT.isFloatingPoint()) {
7568 if (ScalarSize == 16)
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7570 if (ScalarSize == 32)
7571 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7572 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7573 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7574 }
7575 return Constant::getIntegerValue(Ty, Val);
7576 };
7577
7578 SmallVector<Constant *, 32> ConstantVec;
7579 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7580 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7581 : getConstantScalar(Bits[I]));
7582
7583 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7584}
7585
7586static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7587 unsigned SplatBitSize, LLVMContext &C) {
7588 unsigned ScalarSize = VT.getScalarSizeInBits();
7589
7590 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7591 if (VT.isFloatingPoint()) {
7592 if (ScalarSize == 16)
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7594 if (ScalarSize == 32)
7595 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7596 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7597 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7598 }
7599 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7600 };
7601
7602 if (ScalarSize == SplatBitSize)
7603 return getConstantScalar(SplatValue);
7604
7605 unsigned NumElm = SplatBitSize / ScalarSize;
7606 SmallVector<Constant *, 32> ConstantVec;
7607 for (unsigned I = 0; I != NumElm; ++I) {
7608 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7609 ConstantVec.push_back(getConstantScalar(Val));
7610 }
7611 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7612}
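// Illustrative sketch (editorial; not from this file): how a wide splat
// constant is split into ScalarBits-sized pieces, mirroring the extractBits
// loop above, using plain 64-bit arithmetic for splats of up to 64 bits.
#include <cstdint>
#include <vector>

static std::vector<uint64_t> splitSplat(uint64_t SplatValue,
                                        unsigned SplatBits,
                                        unsigned ScalarBits) {
  std::vector<uint64_t> Pieces;
  uint64_t Mask = ScalarBits == 64 ? ~0ULL : ((1ULL << ScalarBits) - 1);
  for (unsigned Off = 0; Off < SplatBits; Off += ScalarBits)
    Pieces.push_back((SplatValue >> Off) & Mask);
  return Pieces; // e.g. splitSplat(0x0000000100000002, 64, 32) -> {2, 1}
}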
7613
7614static bool isFoldableUseOfShuffle(SDNode *N) {
7615 for (auto *U : N->users()) {
7616 unsigned Opc = U->getOpcode();
7617 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7618 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7619 return false;
7620 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7621 return false;
7622 if (isTargetShuffle(Opc))
7623 return true;
7624 if (Opc == ISD::BITCAST) // Ignore bitcasts
7625 return isFoldableUseOfShuffle(U);
7626 if (N->hasOneUse()) {
7627 // TODO, there may be some general way to know if a SDNode can
7628 // be folded. We now only know whether an MI is foldable.
7629 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7630 return false;
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637// If the node has a single use by a VSELECT then AVX512 targets may be able to
7638// fold as a predicated instruction.
7639static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7640 unsigned SizeInBits = V.getValueSizeInBits();
7641 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7642 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7643 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7644 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7645 return true;
7646 }
7647 }
7648 return false;
7649}
7650
7651/// Attempt to use the vbroadcast instruction to generate a splat value
7652/// from a splat BUILD_VECTOR which uses:
7653/// a. A single scalar load, or a constant.
7654/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7655///
7656/// The VBROADCAST node is returned when a pattern is found,
7657/// or SDValue() otherwise.
7658static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7659 const SDLoc &dl,
7660 const X86Subtarget &Subtarget,
7661 SelectionDAG &DAG) {
7662 // VBROADCAST requires AVX.
7663 // TODO: Splats could be generated for non-AVX CPUs using SSE
7664 // instructions, but there's less potential gain for only 128-bit vectors.
7665 if (!Subtarget.hasAVX())
7666 return SDValue();
7667
7668 MVT VT = BVOp->getSimpleValueType(0);
7669 unsigned NumElts = VT.getVectorNumElements();
7670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7672 "Unsupported vector type for broadcast.");
7673
7674 // See if the build vector is a repeating sequence of scalars (inc. splat).
7675 SDValue Ld;
7676 BitVector UndefElements;
7677 SmallVector<SDValue, 16> Sequence;
7678 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7679 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7680 if (Sequence.size() == 1)
7681 Ld = Sequence[0];
7682 }
7683
7684 // Attempt to use VBROADCASTM
7685 // From this pattern:
7686 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7687 // b. t1 = (build_vector t0 t0)
7688 //
7689 // Create (VBROADCASTM v2i1 X)
7690 if (!Sequence.empty() && Subtarget.hasCDI()) {
7691 // If not a splat, are the upper sequence values zeroable?
7692 unsigned SeqLen = Sequence.size();
7693 bool UpperZeroOrUndef =
7694 SeqLen == 1 ||
7695 llvm::all_of(ArrayRef(Sequence).drop_front(),
7696 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7697 SDValue Op0 = Sequence[0];
7698 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7699 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7700 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7701 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7702 ? Op0.getOperand(0)
7703 : Op0.getOperand(0).getOperand(0);
7704 MVT MaskVT = BOperand.getSimpleValueType();
7705 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7706 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7707 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7708 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7709 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7710 unsigned Scale = 512 / VT.getSizeInBits();
7711 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7712 }
7713 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7714 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7715 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7716 return DAG.getBitcast(VT, Bcst);
7717 }
7718 }
7719 }
7720
7721 unsigned NumUndefElts = UndefElements.count();
7722 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7723 APInt SplatValue, Undef;
7724 unsigned SplatBitSize;
7725 bool HasUndef;
7726 // Check if this is a repeated constant pattern suitable for broadcasting.
7727 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7728 SplatBitSize > VT.getScalarSizeInBits() &&
7729 SplatBitSize < VT.getSizeInBits()) {
7730 // Avoid replacing with broadcast when it's a use of a shuffle
7731 // instruction to preserve the present custom lowering of shuffles.
7732 if (isFoldableUseOfShuffle(BVOp))
7733 return SDValue();
7734 // replace BUILD_VECTOR with broadcast of the repeated constants.
7735 LLVMContext *Ctx = DAG.getContext();
7736 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7737 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7738 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7739 // Load the constant scalar/subvector and broadcast it.
7740 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7741 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7742 SDValue CP = DAG.getConstantPool(C, PVT);
7743 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7744
7745 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7746 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7747 SDValue Ops[] = {DAG.getEntryNode(), CP};
7748 MachinePointerInfo MPI =
7749 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7750 SDValue Brdcst =
7751 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7752 MPI, Alignment, MachineMemOperand::MOLoad);
7753 return DAG.getBitcast(VT, Brdcst);
7754 }
7755 if (SplatBitSize > 64) {
7756 // Load the vector of constants and broadcast it.
7757 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7758 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7759 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7760 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7761 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7762 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7763 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7764 MachinePointerInfo MPI =
7765 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7766 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7767 Ops, VVT, MPI, Alignment,
7768 MachineMemOperand::MOLoad);
7769 }
7770 }
7771
7772 // If we are moving a scalar into a vector (Ld must be set and all elements
7773 // but 1 are undef) and that operation is not obviously supported by
7774 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7775 // That's better than general shuffling and may eliminate a load to GPR and
7776 // move from scalar to vector register.
7777 if (!Ld || NumElts - NumUndefElts != 1)
7778 return SDValue();
7779 unsigned ScalarSize = Ld.getValueSizeInBits();
7780 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7781 return SDValue();
7782 }
7783
7784 bool ConstSplatVal =
7785 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7786 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7787
7788 // TODO: Handle broadcasts of non-constant sequences.
7789
7790 // Make sure that all of the users of a non-constant load are from the
7791 // BUILD_VECTOR node.
7792 // FIXME: Is the use count needed for non-constant, non-load case?
7793 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7794 return SDValue();
7795
7796 unsigned ScalarSize = Ld.getValueSizeInBits();
7797 bool IsGE256 = (VT.getSizeInBits() >= 256);
7798
7799 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7800 // instruction to save 8 or more bytes of constant pool data.
7801 // TODO: If multiple splats are generated to load the same constant,
7802 // it may be detrimental to overall size. There needs to be a way to detect
7803 // that condition to know if this is truly a size win.
7804 bool OptForSize = DAG.shouldOptForSize();
7805
7806 // Handle broadcasting a single constant scalar from the constant pool
7807 // into a vector.
7808 // On Sandybridge (no AVX2), it is still better to load a constant vector
7809 // from the constant pool and not to broadcast it from a scalar.
7810 // But override that restriction when optimizing for size.
7811 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7812 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7813 EVT CVT = Ld.getValueType();
7814 assert(!CVT.isVector() && "Must not broadcast a vector type");
7815
7816 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7817 // For size optimization, also splat v2f64 and v2i64, and for size opt
7818 // with AVX2, also splat i8 and i16.
7819 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7820 if (ScalarSize == 32 ||
7821 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7822 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7823 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7824 const Constant *C = nullptr;
7825 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7826 C = CI->getConstantIntValue();
7827 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7828 C = CF->getConstantFPValue();
7829
7830 assert(C && "Invalid constant type");
7831
7832 SDValue CP =
7833 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7834 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7835
7836 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7837 SDValue Ops[] = {DAG.getEntryNode(), CP};
7838 MachinePointerInfo MPI =
7839 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7840 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7841 MPI, Alignment, MachineMemOperand::MOLoad);
7842 }
7843 }
7844
7845 // Handle AVX2 in-register broadcasts.
7846 if (!IsLoad && Subtarget.hasInt256() &&
7847 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7848 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7849
7850 // The scalar source must be a normal load.
7851 if (!IsLoad)
7852 return SDValue();
7853
7854 // Make sure the non-chain result is only used by this build vector.
7855 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7856 return SDValue();
7857
7858 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7859 (Subtarget.hasVLX() && ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 // The integer check is needed for the 64-bit element broadcast into a
7871 // 128-bit vector; it must not match double, since there is no vbroadcastsd xmm.
7872 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7873 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7874 auto *LN = cast<LoadSDNode>(Ld);
7875 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7876 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7877 SDValue BCast =
7878 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7879 LN->getMemoryVT(), LN->getMemOperand());
7880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7881 return BCast;
7882 }
7883
7884 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7886
7887 // Unsupported broadcast.
7888 return SDValue();
7889}
7890
7891/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7892/// underlying vector and index.
7893///
7894/// Modifies \p ExtractedFromVec to the real vector and returns the real
7895/// index.
7896static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7897 SDValue ExtIdx) {
7898 int Idx = ExtIdx->getAsZExtVal();
7899 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7900 return Idx;
7901
7902 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7903 // lowered this:
7904 // (extract_vector_elt (v8f32 %1), Constant<6>)
7905 // to:
7906 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7907 // (extract_subvector (v8f32 %0), Constant<4>),
7908 // undef)
7909 // Constant<0>)
7910 // In this case the vector is the extract_subvector expression and the index
7911 // is 2, as specified by the shuffle.
7912 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7913 SDValue ShuffleVec = SVOp->getOperand(0);
7914 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7915 assert(ShuffleVecVT.getVectorElementType() ==
7916 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7917
7918 int ShuffleIdx = SVOp->getMaskElt(Idx);
7919 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7920 ExtractedFromVec = ShuffleVec;
7921 return ShuffleIdx;
7922 }
7923 return Idx;
7924}
7925
7926static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7927 SelectionDAG &DAG) {
7928 MVT VT = Op.getSimpleValueType();
7929
7930 // Skip if insert_vec_elt is not supported.
7931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7933 return SDValue();
7934
7935 unsigned NumElems = Op.getNumOperands();
7936 SDValue VecIn1;
7937 SDValue VecIn2;
7938 SmallVector<unsigned, 4> InsertIndices;
7939 SmallVector<int, 8> Mask(NumElems, -1);
7940
7941 for (unsigned i = 0; i != NumElems; ++i) {
7942 unsigned Opc = Op.getOperand(i).getOpcode();
7943
7944 if (Opc == ISD::UNDEF)
7945 continue;
7946
7947 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7948 // Quit if more than 1 element needs inserting.
7949 if (InsertIndices.size() > 1)
7950 return SDValue();
7951
7952 InsertIndices.push_back(i);
7953 continue;
7954 }
7955
7956 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7957 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7958
7959 // Quit if non-constant index.
7960 if (!isa<ConstantSDNode>(ExtIdx))
7961 return SDValue();
7962 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7963
7964 // Quit if extracted from vector of different type.
7965 if (ExtractedFromVec.getValueType() != VT)
7966 return SDValue();
7967
7968 if (!VecIn1.getNode())
7969 VecIn1 = ExtractedFromVec;
7970 else if (VecIn1 != ExtractedFromVec) {
7971 if (!VecIn2.getNode())
7972 VecIn2 = ExtractedFromVec;
7973 else if (VecIn2 != ExtractedFromVec)
7974 // Quit if more than 2 vectors to shuffle
7975 return SDValue();
7976 }
7977
7978 if (ExtractedFromVec == VecIn1)
7979 Mask[i] = Idx;
7980 else if (ExtractedFromVec == VecIn2)
7981 Mask[i] = Idx + NumElems;
7982 }
7983
7984 if (!VecIn1.getNode())
7985 return SDValue();
7986
7987 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7988 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7989
7990 for (unsigned Idx : InsertIndices)
7991 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7992 DAG.getVectorIdxConstant(Idx, DL));
7993
7994 return NV;
7995}
7996
7997// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7998static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7999 const X86Subtarget &Subtarget) {
8000 MVT VT = Op.getSimpleValueType();
8001 MVT IVT =
8002 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8003 SmallVector<SDValue, 32> NewOps;
8004 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8005 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8006 Op.getOperand(I)));
8007 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8008 return DAG.getBitcast(VT, Res);
8009}
8010
8011// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8012static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8013 SelectionDAG &DAG,
8014 const X86Subtarget &Subtarget) {
8015
8016 MVT VT = Op.getSimpleValueType();
8017 assert((VT.getVectorElementType() == MVT::i1) &&
8018 "Unexpected type in LowerBUILD_VECTORvXi1!");
8019 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8020 ISD::isBuildVectorAllOnes(Op.getNode()))
8021 return Op;
8022
8023 uint64_t Immediate = 0;
8024 SmallVector<unsigned, 16> NonConstIdx;
8025 bool IsSplat = true;
8026 bool HasConstElts = false;
8027 int SplatIdx = -1;
8028 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8029 SDValue In = Op.getOperand(idx);
8030 if (In.isUndef())
8031 continue;
8032 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8033 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8034 HasConstElts = true;
8035 } else {
8036 NonConstIdx.push_back(idx);
8037 }
8038 if (SplatIdx < 0)
8039 SplatIdx = idx;
8040 else if (In != Op.getOperand(SplatIdx))
8041 IsSplat = false;
8042 }
8043
8044 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8045 if (IsSplat) {
8046 // The build_vector allows the scalar element to be larger than the vector
8047 // element type. We need to mask it to use as a condition unless we know
8048 // the upper bits are zero.
8049 // FIXME: Use computeKnownBits instead of checking specific opcode?
8050 SDValue Cond = Op.getOperand(SplatIdx);
8051 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8052 if (Cond.getOpcode() != ISD::SETCC)
8053 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8054 DAG.getConstant(1, dl, MVT::i8));
8055
8056 // Perform the select in the scalar domain so we can use cmov.
8057 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8058 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8059 DAG.getAllOnesConstant(dl, MVT::i32),
8060 DAG.getConstant(0, dl, MVT::i32));
8061 Select = DAG.getBitcast(MVT::v32i1, Select);
8062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8063 } else {
8064 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8065 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8066 DAG.getAllOnesConstant(dl, ImmVT),
8067 DAG.getConstant(0, dl, ImmVT));
8068 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8069 Select = DAG.getBitcast(VecVT, Select);
8070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8071 DAG.getVectorIdxConstant(0, dl));
8072 }
8073 }
8074
8075 // insert elements one by one
8076 SDValue DstVec;
8077 if (HasConstElts) {
8078 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8079 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8080 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8081 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8082 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8083 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8084 } else {
8085 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8086 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8087 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8088 DstVec = DAG.getBitcast(VecVT, Imm);
8089 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8090 DAG.getVectorIdxConstant(0, dl));
8091 }
8092 } else
8093 DstVec = DAG.getUNDEF(VT);
8094
8095 for (unsigned InsertIdx : NonConstIdx) {
8096 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8097 Op.getOperand(InsertIdx),
8098 DAG.getVectorIdxConstant(InsertIdx, dl));
8099 }
8100 return DstVec;
8101}
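// Illustrative sketch (editorial; not from this file, names are made up):
// packing constant i1 build_vector elements into an immediate mask as done
// above, with the 32-bit split used for v64i1 on 32-bit targets.
#include <cstdint>
#include <vector>

static uint64_t packMaskBits(const std::vector<int> &Bits) { // -1 == undef
  uint64_t Imm = 0;
  for (size_t Idx = 0; Idx < Bits.size(); ++Idx)
    if (Bits[Idx] >= 0)
      Imm |= (uint64_t(Bits[Idx]) & 0x1) << Idx;
  return Imm;
}
// For v64i1 without 64-bit GPRs the mask is then materialized in two halves:
//   Lo = uint32_t(Imm);  Hi = uint32_t(Imm >> 32);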
8102
8103LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8104 switch (Opcode) {
8105 case X86ISD::PACKSS:
8106 case X86ISD::PACKUS:
8107 case X86ISD::FHADD:
8108 case X86ISD::FHSUB:
8109 case X86ISD::HADD:
8110 case X86ISD::HSUB:
8111 return true;
8112 }
8113 return false;
8114}
8115
8116/// This is a helper function of LowerToHorizontalOp().
8117/// This function checks whether the input build_vector \p N implements a
8118/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8119/// may not match the layout of an x86 256-bit horizontal instruction.
8120/// In other words, if this returns true, then some extraction/insertion will
8121/// be required to produce a valid horizontal instruction.
8122///
8123/// Parameter \p Opcode defines the kind of horizontal operation to match.
8124/// For example, if \p Opcode is equal to ISD::ADD, then this function
8125/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8126/// is equal to ISD::SUB, then this function checks if this is a horizontal
8127/// arithmetic sub.
8128///
8129/// This function only analyzes elements of \p N whose indices are
8130/// in range [BaseIdx, LastIdx).
8131///
8132/// TODO: This function was originally used to match both real and fake partial
8133/// horizontal operations, but the index-matching logic is incorrect for that.
8134/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8135/// code because it is only used for partial h-op matching now?
8136static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8137 const SDLoc &DL, SelectionDAG &DAG,
8138 unsigned BaseIdx, unsigned LastIdx,
8139 SDValue &V0, SDValue &V1) {
8140 EVT VT = N->getValueType(0);
8141 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8142 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8143 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8144 "Invalid Vector in input!");
8145
8146 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8147 bool CanFold = true;
8148 unsigned ExpectedVExtractIdx = BaseIdx;
8149 unsigned NumElts = LastIdx - BaseIdx;
8150 V0 = DAG.getUNDEF(VT);
8151 V1 = DAG.getUNDEF(VT);
8152
8153 // Check if N implements a horizontal binop.
8154 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8155 SDValue Op = N->getOperand(i + BaseIdx);
8156
8157 // Skip UNDEFs.
8158 if (Op->isUndef()) {
8159 // Update the expected vector extract index.
8160 if (i * 2 == NumElts)
8161 ExpectedVExtractIdx = BaseIdx;
8162 ExpectedVExtractIdx += 2;
8163 continue;
8164 }
8165
8166 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8167
8168 if (!CanFold)
8169 break;
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173
8174 // Try to match the following pattern:
8175 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8176 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8178 Op0.getOperand(0) == Op1.getOperand(0) &&
8179 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8180 isa<ConstantSDNode>(Op1.getOperand(1)));
8181 if (!CanFold)
8182 break;
8183
8184 unsigned I0 = Op0.getConstantOperandVal(1);
8185 unsigned I1 = Op1.getConstantOperandVal(1);
8186
8187 if (i * 2 < NumElts) {
8188 if (V0.isUndef()) {
8189 V0 = Op0.getOperand(0);
8190 if (V0.getValueType() != VT)
8191 return false;
8192 }
8193 } else {
8194 if (V1.isUndef()) {
8195 V1 = Op0.getOperand(0);
8196 if (V1.getValueType() != VT)
8197 return false;
8198 }
8199 if (i * 2 == NumElts)
8200 ExpectedVExtractIdx = BaseIdx;
8201 }
8202
8203 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8204 if (I0 == ExpectedVExtractIdx)
8205 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8206 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8207 // Try to match the following dag sequence:
8208 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8209 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8210 } else
8211 CanFold = false;
8212
8213 ExpectedVExtractIdx += 2;
8214 }
8215
8216 return CanFold;
8217}
8218
8219/// Emit a sequence of two 128-bit horizontal add/sub followed by
8220/// a concat_vector.
8221///
8222/// This is a helper function of LowerToHorizontalOp().
8223/// This function expects two 256-bit vectors called V0 and V1.
8224/// At first, each vector is split into two separate 128-bit vectors.
8225/// Then, the resulting 128-bit vectors are used to implement two
8226/// horizontal binary operations.
8227///
8228/// The kind of horizontal binary operation is defined by \p X86Opcode.
8229///
8230/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
8231/// the two new horizontal binops.
8232/// When Mode is set, the first horizontal binop dag node would take as input
8233/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8234/// horizontal binop dag node would take as input the lower 128-bit of V1
8235/// and the upper 128-bit of V1.
8236/// Example:
8237/// HADD V0_LO, V0_HI
8238/// HADD V1_LO, V1_HI
8239///
8240/// Otherwise, the first horizontal binop dag node takes as input the lower
8241/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8242/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8243/// Example:
8244/// HADD V0_LO, V1_LO
8245/// HADD V0_HI, V1_HI
8246///
8247/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8248/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8249/// the upper 128-bits of the result.
8250static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8251 const SDLoc &DL, SelectionDAG &DAG,
8252 unsigned X86Opcode, bool Mode,
8253 bool isUndefLO, bool isUndefHI) {
8254 MVT VT = V0.getSimpleValueType();
8255 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8256 "Invalid nodes in input!");
8257
8258 unsigned NumElts = VT.getVectorNumElements();
8259 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8260 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8261 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8262 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8263 MVT NewVT = V0_LO.getSimpleValueType();
8264
8265 SDValue LO = DAG.getUNDEF(NewVT);
8266 SDValue HI = DAG.getUNDEF(NewVT);
8267
8268 if (Mode) {
8269 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8270 if (!isUndefLO && !V0->isUndef())
8271 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8272 if (!isUndefHI && !V1->isUndef())
8273 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8274 } else {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8278
8279 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8280 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8281 }
8282
8283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8284}
8285
8286/// Returns true iff \p BV builds a vector with the result equivalent to
8287/// the result of ADDSUB/SUBADD operation.
8288/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8289/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8290/// \p Opnd0 and \p Opnd1.
8291static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8292 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8293 SDValue &Opnd0, SDValue &Opnd1,
8294 unsigned &NumExtracts, bool &IsSubAdd,
8295 bool &HasAllowContract) {
8296 using namespace SDPatternMatch;
8297
8298 MVT VT = BV->getSimpleValueType(0);
8299 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8300 return false;
8301
8302 unsigned NumElts = VT.getVectorNumElements();
8303 SDValue InVec0 = DAG.getUNDEF(VT);
8304 SDValue InVec1 = DAG.getUNDEF(VT);
8305
8306 NumExtracts = 0;
8307 HasAllowContract = NumElts != 0;
8308
8309 // Odd-numbered elements in the input build vector are obtained from
8310 // adding/subtracting two integer/float elements.
8311 // Even-numbered elements in the input build vector are obtained from
8312 // subtracting/adding two integer/float elements.
8313 unsigned Opc[2] = {0, 0};
8314 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8315 SDValue Op = BV->getOperand(i);
8316
8317 // Skip 'undef' values.
8318 unsigned Opcode = Op.getOpcode();
8319 if (Opcode == ISD::UNDEF)
8320 continue;
8321
8322 // Early exit if we found an unexpected opcode.
8323 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8324 return false;
8325
8326 SDValue Op0 = Op.getOperand(0);
8327 SDValue Op1 = Op.getOperand(1);
8328
8329 // Try to match the following pattern:
8330 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8331 // Early exit if we cannot match that sequence.
8332 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8333 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8334 return false;
8335
8336 // We found a valid add/sub node, make sure its the same opcode as previous
8337 // elements for this parity.
8338 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8339 return false;
8340 Opc[i % 2] = Opcode;
8341
8342 // Update InVec0 and InVec1.
8343 if (InVec0.isUndef())
8344 InVec0 = Op0.getOperand(0);
8345 if (InVec1.isUndef())
8346 InVec1 = Op1.getOperand(0);
8347
8348 // Make sure that the operands of each add/sub node always
8349 // come from the same pair of vectors.
8350 if (InVec0 != Op0.getOperand(0)) {
8351 if (Opcode == ISD::FSUB)
8352 return false;
8353
8354 // FADD is commutable. Try to commute the operands
8355 // and then test again.
8356 std::swap(Op0, Op1);
8357 if (InVec0 != Op0.getOperand(0))
8358 return false;
8359 }
8360
8361 if (InVec1 != Op1.getOperand(0))
8362 return false;
8363
8364 // Increment the number of extractions done.
8365 ++NumExtracts;
8366 HasAllowContract &= Op->getFlags().hasAllowContract();
8367 }
8368
8369 // Ensure we have found an opcode for both parities and that they are
8370 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8371 // inputs are undef.
8372 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8373 InVec0.isUndef() || InVec1.isUndef())
8374 return false;
8375
8376 IsSubAdd = Opc[0] == ISD::FADD;
8377
8378 Opnd0 = InVec0;
8379 Opnd1 = InVec1;
8380 return true;
8381}
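// Illustrative sketch (editorial; not from this file, names are made up):
// the parity check above in scalar form. Element i of an addsub/subadd
// build_vector must use one opcode for even i and the other for odd i,
// consistently across the whole vector.
#include <vector>

enum class BinOp { Undef, FAdd, FSub };

// Returns true and sets IsSubAdd when the per-element opcodes form a valid
// ADDSUB (sub on even lanes) or SUBADD (add on even lanes) pattern.
static bool matchAddSubParity(const std::vector<BinOp> &Ops, bool &IsSubAdd) {
  BinOp Parity[2] = {BinOp::Undef, BinOp::Undef};
  for (size_t I = 0; I < Ops.size(); ++I) {
    if (Ops[I] == BinOp::Undef)
      continue;
    BinOp &Slot = Parity[I % 2];
    if (Slot != BinOp::Undef && Slot != Ops[I])
      return false; // mixed opcodes within one parity class
    Slot = Ops[I];
  }
  if (Parity[0] == BinOp::Undef || Parity[1] == BinOp::Undef ||
      Parity[0] == Parity[1])
    return false;
  IsSubAdd = Parity[0] == BinOp::FAdd; // add on even lanes -> SUBADD form
  return true;
}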
8382
8383/// Returns true if it is possible to fold MUL and an idiom that has already been
8384/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8385/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8386/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8387///
8388/// Prior to calling this function it should be known that there is some
8389/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8390/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8391/// before replacement of such SDNode with ADDSUB operation. Thus the number
8392/// of \p Opnd0 uses is expected to be equal to 2.
8393/// For example, this function may be called for the following IR:
8394/// %AB = fmul fast <2 x double> %A, %B
8395/// %Sub = fsub fast <2 x double> %AB, %C
8396/// %Add = fadd fast <2 x double> %AB, %C
8397/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8398/// <2 x i32> <i32 0, i32 3>
8399/// There is a def for %Addsub here, which potentially can be replaced by
8400/// X86ISD::ADDSUB operation:
8401/// %Addsub = X86ISD::ADDSUB %AB, %C
8402/// and such ADDSUB can further be replaced with FMADDSUB:
8403/// %Addsub = FMADDSUB %A, %B, %C.
8404///
8405/// The main reason why this method is called before the replacement of the
8406/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8407/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8408/// FMADDSUB is.
8409static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8410 SelectionDAG &DAG, SDValue &Opnd0,
8411 SDValue &Opnd1, SDValue &Opnd2,
8412 unsigned ExpectedUses,
8413 bool AllowSubAddOrAddSubContract) {
8414 if (Opnd0.getOpcode() != ISD::FMUL ||
8415 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8416 return false;
8417
8418 // FIXME: These checks must match the similar ones in
8419 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8420 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8421 // or MUL + ADDSUB to FMADDSUB.
8422 const TargetOptions &Options = DAG.getTarget().Options;
8423 bool AllowFusion =
8424 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8425 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8426 if (!AllowFusion)
8427 return false;
8428
8429 Opnd2 = Opnd1;
8430 Opnd1 = Opnd0.getOperand(1);
8431 Opnd0 = Opnd0.getOperand(0);
8432
8433 return true;
8434}
8435
8436/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8437 /// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB or
8438/// X86ISD::FMSUBADD node.
8439 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8440 const SDLoc &DL,
8441 const X86Subtarget &Subtarget,
8442 SelectionDAG &DAG) {
8443 SDValue Opnd0, Opnd1;
8444 unsigned NumExtracts;
8445 bool IsSubAdd;
8446 bool HasAllowContract;
8447 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8448 HasAllowContract))
8449 return SDValue();
8450
8451 MVT VT = BV->getSimpleValueType(0);
8452
8453 // Try to generate X86ISD::FMADDSUB node here.
8454 SDValue Opnd2;
8455 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8456 HasAllowContract)) {
8457 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8458 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8459 }
8460
8461 // We only support ADDSUB.
8462 if (IsSubAdd)
8463 return SDValue();
8464
8465 // There are no known X86 targets with 512-bit ADDSUB instructions!
8466 // Convert to blend(fsub,fadd).
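// e.g. for v16f32 this builds the blend mask <0,17,2,19,...,14,31>, taking the
// even result elements from the FSUB node and the odd ones from the FADD node.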
8467 if (VT.is512BitVector()) {
8468 SmallVector<int> Mask;
8469 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8470 Mask.push_back(I);
8471 Mask.push_back(I + E + 1);
8472 }
8473 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8474 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8475 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8476 }
8477
8478 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8479}
8480
8481 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8482 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8483 // Initialize outputs to known values.
8484 MVT VT = BV->getSimpleValueType(0);
8485 HOpcode = ISD::DELETED_NODE;
8486 V0 = DAG.getUNDEF(VT);
8487 V1 = DAG.getUNDEF(VT);
8488
8489 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8490 // half of the result is calculated independently from the 128-bit halves of
8491 // the inputs, so that makes the index-checking logic below more complicated.
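// e.g. a 256-bit HADD(A,B) of v8f32 produces
// { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 }.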
8492 unsigned NumElts = VT.getVectorNumElements();
8493 unsigned GenericOpcode = ISD::DELETED_NODE;
8494 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8495 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8496 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8497 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8498 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8499 // Ignore undef elements.
8500 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8501 if (Op.isUndef())
8502 continue;
8503
8504 // If there's an opcode mismatch, we're done.
8505 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8506 return false;
8507
8508 // Initialize horizontal opcode.
8509 if (HOpcode == ISD::DELETED_NODE) {
8510 GenericOpcode = Op.getOpcode();
8511 switch (GenericOpcode) {
8512 // clang-format off
8513 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8514 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8515 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8516 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8517 default: return false;
8518 // clang-format on
8519 }
8520 }
8521
8522 SDValue Op0 = Op.getOperand(0);
8523 SDValue Op1 = Op.getOperand(1);
8524 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8526 Op0.getOperand(0) != Op1.getOperand(0) ||
8527 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8528 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8529 return false;
8530
8531 // The source vector is chosen based on which 64-bit half of the
8532 // destination vector is being calculated.
8533 if (j < NumEltsIn64Bits) {
8534 if (V0.isUndef())
8535 V0 = Op0.getOperand(0);
8536 } else {
8537 if (V1.isUndef())
8538 V1 = Op0.getOperand(0);
8539 }
8540
8541 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8542 if (SourceVec != Op0.getOperand(0))
8543 return false;
8544
8545 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8546 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8547 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8548 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8549 (j % NumEltsIn64Bits) * 2;
8550 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8551 continue;
8552
8553 // If this is not a commutative op, this does not match.
8554 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8555 return false;
8556
8557 // Addition is commutative, so try swapping the extract indexes.
8558 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8559 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8560 continue;
8561
8562 // Extract indexes do not match horizontal requirement.
8563 return false;
8564 }
8565 }
8566 // We matched. Opcode and operands are returned by reference as arguments.
8567 return true;
8568}
8569
8570 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8571 const SDLoc &DL, SelectionDAG &DAG,
8572 unsigned HOpcode, SDValue V0, SDValue V1) {
8573 // If either input vector is not the same size as the build vector,
8574 // extract/insert the low bits to the correct size.
8575 // This is free (examples: zmm --> xmm, xmm --> ymm).
8576 MVT VT = BV->getSimpleValueType(0);
8577 unsigned Width = VT.getSizeInBits();
8578 if (V0.getValueSizeInBits() > Width)
8579 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8580 else if (V0.getValueSizeInBits() < Width)
8581 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8582
8583 if (V1.getValueSizeInBits() > Width)
8584 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8585 else if (V1.getValueSizeInBits() < Width)
8586 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8587
8588 unsigned NumElts = VT.getVectorNumElements();
8589 APInt DemandedElts = APInt::getAllOnes(NumElts);
8590 for (unsigned i = 0; i != NumElts; ++i)
8591 if (BV->getOperand(i).isUndef())
8592 DemandedElts.clearBit(i);
8593
8594 // If we don't need the upper xmm, then perform as a xmm hop.
8595 unsigned HalfNumElts = NumElts / 2;
8596 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8597 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8598 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8599 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8600 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8601 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8602 }
8603
8604 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8605}
8606
8607/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8608 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8609 const X86Subtarget &Subtarget,
8610 SelectionDAG &DAG) {
8611 // We need at least 2 non-undef elements to make this worthwhile by default.
8612 unsigned NumNonUndefs =
8613 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8614 if (NumNonUndefs < 2)
8615 return SDValue();
8616
8617 // There are 4 sets of horizontal math operations distinguished by type:
8618 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8619 // subtarget feature. Try to match those "native" patterns first.
8620 MVT VT = BV->getSimpleValueType(0);
8621 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8622 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8623 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8624 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8625 unsigned HOpcode;
8626 SDValue V0, V1;
8627 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8628 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8629 }
8630
8631 // Try harder to match 256-bit ops by using extract/concat.
8632 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8633 return SDValue();
8634
8635 // Count the number of UNDEF operands in the input build_vector.
8636 unsigned NumElts = VT.getVectorNumElements();
8637 unsigned Half = NumElts / 2;
8638 unsigned NumUndefsLO = 0;
8639 unsigned NumUndefsHI = 0;
8640 for (unsigned i = 0, e = Half; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsLO++;
8643
8644 for (unsigned i = Half, e = NumElts; i != e; ++i)
8645 if (BV->getOperand(i)->isUndef())
8646 NumUndefsHI++;
8647
8648 SDValue InVec0, InVec1;
8649 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8650 SDValue InVec2, InVec3;
8651 unsigned X86Opcode;
8652 bool CanFold = true;
8653
8654 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8655 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8656 InVec3) &&
8657 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8658 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8659 X86Opcode = X86ISD::HADD;
8660 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8661 InVec1) &&
8662 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8663 InVec3) &&
8664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8666 X86Opcode = X86ISD::HSUB;
8667 else
8668 CanFold = false;
8669
8670 if (CanFold) {
8671 // Do not try to expand this build_vector into a pair of horizontal
8672 // add/sub if we can emit a pair of scalar add/sub.
8673 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8674 return SDValue();
8675
8676 // Convert this build_vector into a pair of horizontal binops followed by
8677 // a concat vector. We must adjust the outputs from the partial horizontal
8678 // matching calls above to account for undefined vector halves.
8679 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8680 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8681 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8682 bool isUndefLO = NumUndefsLO == Half;
8683 bool isUndefHI = NumUndefsHI == Half;
8684 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8685 isUndefHI);
8686 }
8687 }
8688
8689 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8690 VT == MVT::v16i16) {
8691 unsigned X86Opcode;
8692 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HADD;
8695 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::HSUB;
8698 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::FHSUB;
8704 else
8705 return SDValue();
8706
8707 // Don't try to expand this build_vector into a pair of horizontal add/sub
8708 // if we can simply emit a pair of scalar add/sub.
8709 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8710 return SDValue();
8711
8712 // Convert this build_vector into two horizontal add/sub followed by
8713 // a concat vector.
8714 bool isUndefLO = NumUndefsLO == Half;
8715 bool isUndefHI = NumUndefsHI == Half;
8716 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8717 isUndefLO, isUndefHI);
8718 }
8719
8720 return SDValue();
8721}
8722
8723static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG);
8725
8726/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8727 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8728 /// just apply the bit operation to the vectors.
8729 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8730 /// from this, but enough scalar bit operations are created by the later
8731/// legalization + scalarization stages to need basic support.
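/// For example, (build_vector (and x0, 15), (and x1, 15), (and x2, 15), (and x3, 15))
/// becomes (and (build_vector x0, x1, x2, x3), (build_vector 15, 15, 15, 15)).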
8732 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8733 const X86Subtarget &Subtarget,
8734 SelectionDAG &DAG) {
8735 MVT VT = Op->getSimpleValueType(0);
8736 unsigned NumElems = VT.getVectorNumElements();
8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8738
8739 // Check that all elements have the same opcode.
8740 // TODO: Should we allow UNDEFS and if so how many?
8741 unsigned Opcode = Op->getOperand(0).getOpcode();
8742 for (unsigned i = 1; i < NumElems; ++i)
8743 if (Opcode != Op->getOperand(i).getOpcode())
8744 return SDValue();
8745
8746 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8747 bool IsShift = false;
8748 switch (Opcode) {
8749 default:
8750 return SDValue();
8751 case ISD::SHL:
8752 case ISD::SRL:
8753 case ISD::SRA:
8754 IsShift = true;
8755 break;
8756 case ISD::AND:
8757 case ISD::XOR:
8758 case ISD::OR:
8759 // Don't do this if the buildvector is a splat - we'd replace one
8760 // constant with an entire vector.
8761 if (Op->getSplatValue())
8762 return SDValue();
8763 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8764 return SDValue();
8765 break;
8766 }
8767
8768 SmallVector<SDValue, 4> LHSElts, RHSElts;
8769 for (SDValue Elt : Op->ops()) {
8770 SDValue LHS = Elt.getOperand(0);
8771 SDValue RHS = Elt.getOperand(1);
8772
8773 // We expect the canonicalized RHS operand to be the constant.
8774 if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8775 return SDValue();
8776
8777 // Extend shift amounts.
8778 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8779 if (!IsShift)
8780 return SDValue();
8781 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8782 }
8783
8784 LHSElts.push_back(LHS);
8785 RHSElts.push_back(RHS);
8786 }
8787
8788 // Limit to shifts by uniform immediates.
8789 // TODO: Only accept vXi8/vXi64 special cases?
8790 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8791 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8792 return SDValue();
8793
8794 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8795 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8796 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8797
8798 if (!IsShift)
8799 return Res;
8800
8801 // Immediately lower the shift to ensure the constant build vector doesn't
8802 // get converted to a constant pool before the shift is lowered.
8803 return LowerShift(Res, Subtarget, DAG);
8804}
8805
8806static bool isShuffleFoldableLoad(SDValue);
8807
8808/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8809/// representing a blend.
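/// For example, a v4f64 build_vector (a, b, a, b) becomes
/// shuffle (splat a), (splat b), <0, 5, 2, 7>.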
8810 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8811 X86Subtarget const &Subtarget,
8812 SelectionDAG &DAG) {
8813 MVT VT = BVOp->getSimpleValueType(0u);
8814
8815 if (VT != MVT::v4f64)
8816 return SDValue();
8817
8818 // Collect unique operands.
8819 auto UniqueOps = SmallSet<SDValue, 16u>();
8820 for (SDValue Op : BVOp->ops()) {
8821 if (isIntOrFPConstant(Op) || Op.isUndef())
8822 return SDValue();
8823 UniqueOps.insert(Op);
8824 }
8825
8826 // Candidate BUILD_VECTOR must have 2 unique operands.
8827 if (UniqueOps.size() != 2u)
8828 return SDValue();
8829
8830 SDValue Op0 = BVOp->getOperand(0u);
8831 UniqueOps.erase(Op0);
8832 SDValue Op1 = *UniqueOps.begin();
8833
8834 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8835 isShuffleFoldableLoad(Op1)) {
8836 // Create shuffle mask.
8837 auto const NumElems = VT.getVectorNumElements();
8838 SmallVector<int, 16u> Mask(NumElems);
8839 for (auto I = 0u; I < NumElems; ++I) {
8840 SDValue Op = BVOp->getOperand(I);
8841 Mask[I] = Op == Op0 ? I : I + NumElems;
8842 }
8843 // Create shuffle of splats.
8844 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8845 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8846 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8847 }
8848
8849 return SDValue();
8850}
8851
8852/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8853/// functionality to do this, so it's all zeros, all ones, or some derivation
8854/// that is cheap to calculate.
8855 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8856 SelectionDAG &DAG,
8857 const X86Subtarget &Subtarget) {
8858 MVT VT = Op.getSimpleValueType();
8859
8860 // Vectors containing all zeros can be matched by pxor and xorps.
8861 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8862 return Op;
8863
8864 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8865 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8866 // vpcmpeqd on 256-bit vectors.
8867 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8868 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8869 return Op;
8870
8871 return getOnesVector(VT, DAG, DL);
8872 }
8873
8874 return SDValue();
8875}
8876
8877/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8878/// from a vector of source values and a vector of extraction indices.
8879/// The vectors might be manipulated to match the type of the permute op.
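/// i.e. result element i is SrcVec[IndicesVec[i]].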
8880static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8881 const SDLoc &DL, SelectionDAG &DAG,
8882 const X86Subtarget &Subtarget) {
8883 MVT ShuffleVT = VT;
8884 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8885 unsigned NumElts = VT.getVectorNumElements();
8886 unsigned SizeInBits = VT.getSizeInBits();
8887
8888 // Adjust IndicesVec to match VT size.
8889 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8890 "Illegal variable permute mask size");
8891 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8892 // Narrow/widen the indices vector to the correct size.
8893 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8894 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8895 NumElts * VT.getScalarSizeInBits());
8896 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8897 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8898 SDLoc(IndicesVec), SizeInBits);
8899 // Zero-extend the index elements within the vector.
8900 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8901 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8902 IndicesVT, IndicesVec);
8903 }
8904 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8905
8906 // Handle a SrcVec whose size doesn't match VT.
8907 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8908 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8909 // Handle larger SrcVec by treating it as a larger permute.
8910 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8911 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8912 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8913 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8914 Subtarget, DAG, SDLoc(IndicesVec));
8915 SDValue NewSrcVec =
8916 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8917 if (NewSrcVec)
8918 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8919 return SDValue();
8920 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8921 // Widen smaller SrcVec to match VT.
8922 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8923 } else
8924 return SDValue();
8925 }
8926
8927 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8928 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8929 EVT SrcVT = Idx.getValueType();
8930 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8931 uint64_t IndexScale = 0;
8932 uint64_t IndexOffset = 0;
8933
8934 // If we're scaling a smaller permute op, then we need to repeat the
8935 // indices, scaling and offsetting them as well.
8936 // e.g. v4i32 -> v16i8 (Scale = 4)
8937 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8938 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8939 for (uint64_t i = 0; i != Scale; ++i) {
8940 IndexScale |= Scale << (i * NumDstBits);
8941 IndexOffset |= i << (i * NumDstBits);
8942 }
8943
8944 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8945 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8946 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8947 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8948 return Idx;
8949 };
8950
8951 unsigned Opcode = 0;
8952 switch (VT.SimpleTy) {
8953 default:
8954 break;
8955 case MVT::v16i8:
8956 if (Subtarget.hasSSSE3())
8957 Opcode = X86ISD::PSHUFB;
8958 break;
8959 case MVT::v8i16:
8960 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8961 Opcode = X86ISD::VPERMV;
8962 else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v4f32:
8968 case MVT::v4i32:
8969 if (Subtarget.hasAVX()) {
8970 Opcode = X86ISD::VPERMILPV;
8971 ShuffleVT = MVT::v4f32;
8972 } else if (Subtarget.hasSSSE3()) {
8973 Opcode = X86ISD::PSHUFB;
8974 ShuffleVT = MVT::v16i8;
8975 }
8976 break;
8977 case MVT::v2f64:
8978 case MVT::v2i64:
8979 if (Subtarget.hasAVX()) {
8980 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
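// Adding IndicesVec to itself doubles each index, moving the 0/1 selector from
// bit 0 into bit 1.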
8981 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8982 Opcode = X86ISD::VPERMILPV;
8983 ShuffleVT = MVT::v2f64;
8984 } else if (Subtarget.hasSSE41()) {
8985 // SSE41 can compare v2i64 - select between indices 0 and 1.
8986 return DAG.getSelectCC(
8987 DL, IndicesVec,
8988 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8990 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8991 ISD::CondCode::SETEQ);
8992 }
8993 break;
8994 case MVT::v32i8:
8995 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8996 Opcode = X86ISD::VPERMV;
8997 else if (Subtarget.hasXOP()) {
8998 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8999 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9000 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9001 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9002 return DAG.getNode(
9003 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9004 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9005 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9006 } else if (Subtarget.hasAVX()) {
9007 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9008 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9009 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9010 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9011 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9012 ArrayRef<SDValue> Ops) {
9013 // Permute Lo and Hi and then select based on index range.
9014 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9015 // care about bit[7] as it's just an index vector.
9016 SDValue Idx = Ops[2];
9017 EVT VT = Idx.getValueType();
9018 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9020 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9021 ISD::CondCode::SETGT);
9022 };
9023 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9024 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9025 PSHUFBBuilder);
9026 }
9027 break;
9028 case MVT::v16i16:
9029 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9030 Opcode = X86ISD::VPERMV;
9031 else if (Subtarget.hasAVX()) {
9032 // Scale to v32i8 and perform as v32i8.
9033 IndicesVec = ScaleIndices(IndicesVec, 2);
9034 return DAG.getBitcast(
9035 VT, createVariablePermute(
9036 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9037 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9038 }
9039 break;
9040 case MVT::v8f32:
9041 case MVT::v8i32:
9042 if (Subtarget.hasAVX2())
9043 Opcode = X86ISD::VPERMV;
9044 else if (Subtarget.hasAVX()) {
9045 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9046 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9047 {0, 1, 2, 3, 0, 1, 2, 3});
9048 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9049 {4, 5, 6, 7, 4, 5, 6, 7});
9050 if (Subtarget.hasXOP())
9051 return DAG.getBitcast(
9052 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9053 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9054 // Permute Lo and Hi and then select based on index range.
9055 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9056 SDValue Res = DAG.getSelectCC(
9057 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9059 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9060 ISD::CondCode::SETGT);
9061 return DAG.getBitcast(VT, Res);
9062 }
9063 break;
9064 case MVT::v4i64:
9065 case MVT::v4f64:
9066 if (Subtarget.hasAVX512()) {
9067 if (!Subtarget.hasVLX()) {
9068 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9069 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9070 SDLoc(SrcVec));
9071 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9072 DAG, SDLoc(IndicesVec));
9073 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9074 DAG, Subtarget);
9075 return extract256BitVector(Res, 0, DAG, DL);
9076 }
9077 Opcode = X86ISD::VPERMV;
9078 } else if (Subtarget.hasAVX()) {
9079 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9080 SDValue LoLo =
9081 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9082 SDValue HiHi =
9083 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9084 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9085 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9086 if (Subtarget.hasXOP())
9087 return DAG.getBitcast(
9088 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9089 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9090 // Permute Lo and Hi and then select based on index range.
9091 // This works as VPERMILPD only uses index bit[1] to permute elements.
9092 SDValue Res = DAG.getSelectCC(
9093 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9095 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9096 ISD::CondCode::SETGT);
9097 return DAG.getBitcast(VT, Res);
9098 }
9099 break;
9100 case MVT::v64i8:
9101 if (Subtarget.hasVBMI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v32i16:
9105 if (Subtarget.hasBWI())
9106 Opcode = X86ISD::VPERMV;
9107 break;
9108 case MVT::v16f32:
9109 case MVT::v16i32:
9110 case MVT::v8f64:
9111 case MVT::v8i64:
9112 if (Subtarget.hasAVX512())
9113 Opcode = X86ISD::VPERMV;
9114 break;
9115 }
9116 if (!Opcode)
9117 return SDValue();
9118
9119 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9120 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9121 "Illegal variable permute shuffle type");
9122
9123 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9124 if (Scale > 1)
9125 IndicesVec = ScaleIndices(IndicesVec, Scale);
9126
9127 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9128 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9129
9130 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9131 SDValue Res = Opcode == X86ISD::VPERMV
9132 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9133 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9134 return DAG.getBitcast(VT, Res);
9135}
9136
9137// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9138// reasoned to be a permutation of a vector by indices in a non-constant vector.
9139// (build_vector (extract_elt V, (extract_elt I, 0)),
9140// (extract_elt V, (extract_elt I, 1)),
9141// ...
9142// ->
9143// (vpermv I, V)
9144//
9145// TODO: Handle undefs
9146// TODO: Utilize pshufb and zero mask blending to support more efficient
9147// construction of vectors with constant-0 elements.
9148static SDValue
9149 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9150 SelectionDAG &DAG,
9151 const X86Subtarget &Subtarget) {
9152 SDValue SrcVec, IndicesVec;
9153
9154 auto PeekThroughFreeze = [](SDValue N) {
9155 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9156 return N->getOperand(0);
9157 return N;
9158 };
9159 // Check for a match of the permute source vector and permute index elements.
9160 // This is done by checking that the i-th build_vector operand is of the form:
9161 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9162 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9163 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9164 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9165 return SDValue();
9166
9167 // If this is the first extract encountered in V, set the source vector,
9168 // otherwise verify the extract is from the previously defined source
9169 // vector.
9170 if (!SrcVec)
9171 SrcVec = Op.getOperand(0);
9172 else if (SrcVec != Op.getOperand(0))
9173 return SDValue();
9174 SDValue ExtractedIndex = Op->getOperand(1);
9175 // Peek through extends.
9176 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9177 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9178 ExtractedIndex = ExtractedIndex.getOperand(0);
9179 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9180 return SDValue();
9181
9182 // If this is the first extract from the index vector candidate, set the
9183 // indices vector, otherwise verify the extract is from the previously
9184 // defined indices vector.
9185 if (!IndicesVec)
9186 IndicesVec = ExtractedIndex.getOperand(0);
9187 else if (IndicesVec != ExtractedIndex.getOperand(0))
9188 return SDValue();
9189
9190 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9191 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9192 return SDValue();
9193 }
9194
9195 MVT VT = V.getSimpleValueType();
9196 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9197}
9198
9199SDValue
9200X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9201 SDLoc dl(Op);
9202
9203 MVT VT = Op.getSimpleValueType();
9204 MVT EltVT = VT.getVectorElementType();
9205 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9206 unsigned NumElems = Op.getNumOperands();
9207
9208 // Generate vectors for predicate vectors.
9209 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9210 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9211
9212 if (VT.getVectorElementType() == MVT::bf16 &&
9213 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9214 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9215
9216 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9217 return VectorCst;
9218
9219 unsigned EVTBits = EltVT.getSizeInBits();
9220 APInt UndefMask = APInt::getZero(NumElems);
9221 APInt FrozenUndefMask = APInt::getZero(NumElems);
9222 APInt ZeroMask = APInt::getZero(NumElems);
9223 APInt NonZeroMask = APInt::getZero(NumElems);
9224 bool IsAllConstants = true;
9225 bool OneUseFrozenUndefs = true;
9226 SmallSet<SDValue, 8> Values;
9227 unsigned NumConstants = NumElems;
9228 for (unsigned i = 0; i < NumElems; ++i) {
9229 SDValue Elt = Op.getOperand(i);
9230 if (Elt.isUndef()) {
9231 UndefMask.setBit(i);
9232 continue;
9233 }
9234 if (ISD::isFreezeUndef(Elt.getNode())) {
9235 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9236 FrozenUndefMask.setBit(i);
9237 continue;
9238 }
9239 Values.insert(Elt);
9240 if (!isIntOrFPConstant(Elt)) {
9241 IsAllConstants = false;
9242 NumConstants--;
9243 }
9244 if (X86::isZeroNode(Elt)) {
9245 ZeroMask.setBit(i);
9246 } else {
9247 NonZeroMask.setBit(i);
9248 }
9249 }
9250
9251 // All undef vector. Return an UNDEF.
9252 if (UndefMask.isAllOnes())
9253 return DAG.getUNDEF(VT);
9254
9255 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9256 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9257 return DAG.getFreeze(DAG.getUNDEF(VT));
9258
9259 // All undef/freeze(undef)/zero vector. Return a zero vector.
9260 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9261 return getZeroVector(VT, Subtarget, DAG, dl);
9262
9263 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9264 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9265 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9266 // and blend the FREEZE-UNDEF operands back in.
9267 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9268 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9269 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9270 SmallVector<int, 16> BlendMask(NumElems, -1);
9271 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9272 for (unsigned i = 0; i < NumElems; ++i) {
9273 if (UndefMask[i]) {
9274 BlendMask[i] = -1;
9275 continue;
9276 }
9277 BlendMask[i] = i;
9278 if (!FrozenUndefMask[i])
9279 Elts[i] = Op.getOperand(i);
9280 else
9281 BlendMask[i] += NumElems;
9282 }
9283 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9284 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9285 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9286 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9287 }
9288
9289 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9290
9291 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9292 // be better off lowering to a smaller build vector and padding with
9293 // undef/zero.
9294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9295 !isFoldableUseOfShuffle(BV)) {
9296 unsigned UpperElems = NumElems / 2;
9297 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9299 if (NumUpperUndefsOrZeros >= UpperElems) {
9300 if (VT.is512BitVector() &&
9301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9302 UpperElems = NumElems - (NumElems / 4);
9303 // If freeze(undef) is in any upper elements, force to zero.
9304 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9305 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9306 SDValue NewBV =
9307 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9308 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9309 }
9310 }
9311
9312 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9313 return AddSub;
9314 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9315 return HorizontalOp;
9316 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9317 return Broadcast;
9318 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9319 return BitOp;
9320 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9321 return Blend;
9322
9323 unsigned NumZero = ZeroMask.popcount();
9324 unsigned NumNonZero = NonZeroMask.popcount();
9325
9326 // If we are inserting one variable into a vector of non-zero constants, try
9327 // to avoid loading each constant element as a scalar. Load the constants as a
9328 // vector and then insert the variable scalar element. If insertion is not
9329 // supported, fall back to a shuffle to get the scalar blended with the
9330 // constants. Insertion into a zero vector is handled as a special-case
9331 // somewhere below here.
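// For example, (build_vector 1, 2, x, 4) becomes a constant-pool load of
// <1, 2, undef, 4> followed by an insertelement of 'x' at index 2.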
9332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9333 FrozenUndefMask.isZero() &&
9334 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9335 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9336 // Create an all-constant vector. The variable element in the old
9337 // build vector is replaced by undef in the constant vector. Save the
9338 // variable scalar element and its index for use in the insertelement.
9339 LLVMContext &Context = *DAG.getContext();
9340 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9341 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9342 SDValue VarElt;
9343 SDValue InsIndex;
9344 for (unsigned i = 0; i != NumElems; ++i) {
9345 SDValue Elt = Op.getOperand(i);
9346 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9347 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9348 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9349 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9350 else if (!Elt.isUndef()) {
9351 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9352 "Expected one variable element in this vector");
9353 VarElt = Elt;
9354 InsIndex = DAG.getVectorIdxConstant(i, dl);
9355 }
9356 }
9357 Constant *CV = ConstantVector::get(ConstVecOps);
9358 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9359
9360 // The constants we just created may not be legal (eg, floating point). We
9361 // must lower the vector right here because we can not guarantee that we'll
9362 // legalize it before loading it. This is also why we could not just create
9363 // a new build vector here. If the build vector contains illegal constants,
9364 // it could get split back up into a series of insert elements.
9365 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9366 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9367 MachineFunction &MF = DAG.getMachineFunction();
9368 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9369 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9370 unsigned InsertC = InsIndex->getAsZExtVal();
9371 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9372 if (InsertC < NumEltsInLow128Bits)
9373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9374
9375 // There's no good way to insert into the high elements of a >128-bit
9376 // vector, so use shuffles to avoid an extract/insert sequence.
9377 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9378 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9379 SmallVector<int, 8> ShuffleMask;
9380 unsigned NumElts = VT.getVectorNumElements();
9381 for (unsigned i = 0; i != NumElts; ++i)
9382 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9383 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9384 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9385 }
9386
9387 // Special case for single non-zero, non-undef, element.
9388 if (NumNonZero == 1) {
9389 unsigned Idx = NonZeroMask.countr_zero();
9390 SDValue Item = Op.getOperand(Idx);
9391
9392 // If we have a constant or non-constant insertion into the low element of
9393 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9394 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9395 // depending on what the source datatype is.
9396 if (Idx == 0) {
9397 if (NumZero == 0)
9398 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9399
9400 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9401 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9402 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9403 assert((VT.is128BitVector() || VT.is256BitVector() ||
9404 VT.is512BitVector()) &&
9405 "Expected an SSE value type!");
9406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9407 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9408 // zero vector.
9409 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9410 }
9411
9412 // We can't directly insert an i8 or i16 into a vector, so zero extend
9413 // it to i32 first.
9414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9415 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9419 return DAG.getBitcast(VT, Item);
9420 }
9421 }
9422
9423 // Is it a vector logical left shift?
9424 if (NumElems == 2 && Idx == 1 &&
9425 X86::isZeroNode(Op.getOperand(0)) &&
9426 !X86::isZeroNode(Op.getOperand(1))) {
9427 unsigned NumBits = VT.getSizeInBits();
9428 return getVShift(true, VT,
9429 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9430 VT, Op.getOperand(1)),
9431 NumBits/2, DAG, *this, dl);
9432 }
9433
9434 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9435 return SDValue();
9436
9437 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9438 // is a non-constant being inserted into an element other than the low one,
9439 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9440 // movd/movss) to move this into the low element, then shuffle it into
9441 // place.
9442 if (EVTBits == 32) {
9443 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9444 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9445 }
9446 }
9447
9448 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9449 if (Values.size() == 1) {
9450 if (EVTBits == 32) {
9451 // Instead of a shuffle like this:
9452 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9453 // Check if it's possible to issue this instead.
9454 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9455 unsigned Idx = NonZeroMask.countr_zero();
9456 SDValue Item = Op.getOperand(Idx);
9457 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9458 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9459 }
9460 return SDValue();
9461 }
9462
9463 // A vector full of immediates; various special cases are already
9464 // handled, so this is best done with a single constant-pool load.
9465 if (IsAllConstants)
9466 return SDValue();
9467
9468 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9469 return V;
9470
9471 // See if we can use a vector load to get all of the elements.
9472 {
9473 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9474 if (SDValue LD =
9475 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9476 return LD;
9477 }
9478
9479 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9480 // build_vector and broadcast it.
9481 // TODO: We could probably generalize this more.
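// e.g. a v8i32 (a, b, a, b, a, b, a, b) build_vector becomes a v4i32
// (a, b, undef, undef) build_vector, which is bitcast to v2i64, broadcast to
// v4i64 and bitcast back to v8i32.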
9482 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9483 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9484 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9485 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9486 // Make sure all the even/odd operands match.
9487 for (unsigned i = 2; i != NumElems; ++i)
9488 if (Ops[i % 2] != Op.getOperand(i))
9489 return false;
9490 return true;
9491 };
9492 if (CanSplat(Op, NumElems, Ops)) {
9493 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9494 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9495 // Create a new build vector and cast to v2i64/v2f64.
9496 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9497 DAG.getBuildVector(NarrowVT, dl, Ops));
9498 // Broadcast from v2i64/v2f64 and cast to final VT.
9499 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9500 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9501 NewBV));
9502 }
9503 }
9504
9505 // For AVX-length vectors, build the individual 128-bit pieces and use
9506 // shuffles to put them in place.
9507 if (VT.getSizeInBits() > 128) {
9508 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9509
9510 // Build both the lower and upper subvector.
9511 SDValue Lower =
9512 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9513 SDValue Upper = DAG.getBuildVector(
9514 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9515
9516 // Recreate the wider vector with the lower and upper part.
9517 return concatSubVectors(Lower, Upper, DAG, dl);
9518 }
9519
9520 // Let legalizer expand 2-wide build_vectors.
9521 if (EVTBits == 64) {
9522 if (NumNonZero == 1) {
9523 // One half is zero or undef.
9524 unsigned Idx = NonZeroMask.countr_zero();
9525 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9526 Op.getOperand(Idx));
9527 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9528 }
9529 return SDValue();
9530 }
9531
9532 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9533 if (EVTBits == 8 && NumElems == 16)
9534 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9535 NumZero, DAG, Subtarget))
9536 return V;
9537
9538 if (EltVT == MVT::i16 && NumElems == 8)
9539 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9540 NumZero, DAG, Subtarget))
9541 return V;
9542
9543 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9544 if (EVTBits == 32 && NumElems == 4)
9545 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9546 return V;
9547
9548 // If element VT is == 32 bits, turn it into a number of shuffles.
9549 if (NumElems == 4 && NumZero > 0) {
9550 SmallVector<SDValue, 8> Ops(NumElems);
9551 for (unsigned i = 0; i < 4; ++i) {
9552 bool isZero = !NonZeroMask[i];
9553 if (isZero)
9554 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9555 else
9556 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9557 }
9558
9559 for (unsigned i = 0; i < 2; ++i) {
9560 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9561 default: llvm_unreachable("Unexpected NonZero count");
9562 case 0:
9563 Ops[i] = Ops[i*2]; // Must be a zero vector.
9564 break;
9565 case 1:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9567 break;
9568 case 2:
9569 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 case 3:
9572 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9573 break;
9574 }
9575 }
9576
9577 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9578 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9579 int MaskVec[] = {
9580 Reverse1 ? 1 : 0,
9581 Reverse1 ? 0 : 1,
9582 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9583 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9584 };
9585 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9586 }
9587
9588 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9589
9590 // Check for a build vector from mostly shuffle plus few inserting.
9591 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9592 return Sh;
9593
9594 // For SSE 4.1, use insertps to put the high elements into the low element.
9595 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9596 SDValue Result;
9597 if (!Op.getOperand(0).isUndef())
9598 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9599 else
9600 Result = DAG.getUNDEF(VT);
9601
9602 for (unsigned i = 1; i < NumElems; ++i) {
9603 if (Op.getOperand(i).isUndef()) continue;
9604 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9605 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9606 }
9607 return Result;
9608 }
9609
9610 // Otherwise, expand into a number of unpckl*, start by extending each of
9611 // our (non-undef) elements to the full vector width with the element in the
9612 // bottom slot of the vector (which generates no code for SSE).
9613 SmallVector<SDValue, 8> Ops(NumElems);
9614 for (unsigned i = 0; i < NumElems; ++i) {
9615 if (!Op.getOperand(i).isUndef())
9616 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9617 else
9618 Ops[i] = DAG.getUNDEF(VT);
9619 }
9620
9621 // Next, we iteratively mix elements, e.g. for v4f32:
9622 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9623 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9624 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9625 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9626 // Generate scaled UNPCKL shuffle mask.
9627 SmallVector<int, 16> Mask;
9628 for(unsigned i = 0; i != Scale; ++i)
9629 Mask.push_back(i);
9630 for (unsigned i = 0; i != Scale; ++i)
9631 Mask.push_back(NumElems+i);
9632 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9633
9634 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9635 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9636 }
9637 return Ops[0];
9638}
9639
9640// 256-bit AVX can use the vinsertf128 instruction
9641// to create 256-bit vectors from two other 128-bit ones.
9642// TODO: Detect subvector broadcast here instead of DAG combine?
9643 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9644 SelectionDAG &DAG,
9645 const X86Subtarget &Subtarget) {
9646 MVT ResVT = Op.getSimpleValueType();
9647 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9648 "Value type must be 256-/512-bit wide");
9649
9650 unsigned NumOperands = Op.getNumOperands();
9651 unsigned NumFreezeUndef = 0;
9652 unsigned NumZero = 0;
9653 unsigned NumNonZero = 0;
9654 unsigned NonZeros = 0;
9655 SmallSet<SDValue, 4> Undefs;
9656 for (unsigned i = 0; i != NumOperands; ++i) {
9657 SDValue SubVec = Op.getOperand(i);
9658 if (SubVec.isUndef())
9659 continue;
9660 if (ISD::isFreezeUndef(SubVec.getNode())) {
9661 // If the freeze(undef) has multiple uses then we must fold to zero.
9662 if (SubVec.hasOneUse()) {
9663 ++NumFreezeUndef;
9664 } else {
9665 ++NumZero;
9666 Undefs.insert(SubVec);
9667 }
9668 }
9669 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9670 ++NumZero;
9671 else {
9672 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9673 NonZeros |= 1 << i;
9674 ++NumNonZero;
9675 }
9676 }
9677
9678 // If we have more than 2 non-zeros, build each half separately.
9679 if (NumNonZero > 2) {
9680 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9681 ArrayRef<SDUse> Ops = Op->ops();
9682 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9683 Ops.slice(0, NumOperands/2));
9684 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9685 Ops.slice(NumOperands/2));
9686 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9687 }
9688
9689 // Otherwise, build it up through insert_subvectors.
9690 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9691 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9692 : DAG.getUNDEF(ResVT));
9693
9694 // Replace Undef operands with ZeroVector.
9695 for (SDValue U : Undefs)
9696 DAG.ReplaceAllUsesWith(
9697 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9698
9699 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9700 unsigned NumSubElems = SubVT.getVectorNumElements();
9701 for (unsigned i = 0; i != NumOperands; ++i) {
9702 if ((NonZeros & (1 << i)) == 0)
9703 continue;
9704
9705 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9706 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9707 }
9708
9709 return Vec;
9710}
9711
9712// Returns true if the given node is a type promotion (by concatenating i1
9713// zeros) of the result of a node that already zeros all upper bits of
9714// k-register.
9715// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9716 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9717 const X86Subtarget &Subtarget,
9718 SelectionDAG & DAG) {
9719 MVT ResVT = Op.getSimpleValueType();
9720 unsigned NumOperands = Op.getNumOperands();
9721 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9722 "Unexpected number of operands in CONCAT_VECTORS");
9723
9724 uint64_t Zeros = 0;
9725 uint64_t NonZeros = 0;
9726 for (unsigned i = 0; i != NumOperands; ++i) {
9727 SDValue SubVec = Op.getOperand(i);
9728 if (SubVec.isUndef())
9729 continue;
9730 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9731 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9732 Zeros |= (uint64_t)1 << i;
9733 else
9734 NonZeros |= (uint64_t)1 << i;
9735 }
9736
9737 unsigned NumElems = ResVT.getVectorNumElements();
9738
9739 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9740 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9741 // insert_subvector will give us two kshifts.
9742 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9743 Log2_64(NonZeros) != NumOperands - 1) {
9744 unsigned Idx = Log2_64(NonZeros);
9745 SDValue SubVec = Op.getOperand(Idx);
9746 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9747 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9748 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9749 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9750 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9751 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9752 DAG.getVectorIdxConstant(0, dl));
9753 }
9754
9755 // If there are zero or one non-zeros we can handle this very simply.
9756 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9757 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9758 if (!NonZeros)
9759 return Vec;
9760 unsigned Idx = Log2_64(NonZeros);
9761 SDValue SubVec = Op.getOperand(Idx);
9762 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9763 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9764 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9765 }
9766
9767 if (NumOperands > 2) {
9768 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9769 ArrayRef<SDUse> Ops = Op->ops();
9770 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9771 Ops.slice(0, NumOperands / 2));
9772 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9773 Ops.slice(NumOperands / 2));
9774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9775 }
9776
9777 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9778
9779 if (ResVT.getVectorNumElements() >= 16)
9780 return Op; // The operation is legal with KUNPCK
9781
9782 SDValue Vec =
9783 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9784 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9785 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9786 DAG.getVectorIdxConstant(NumElems / 2, dl));
9787}
9788
9789 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9790 const X86Subtarget &Subtarget,
9791 SelectionDAG &DAG) {
9792 SDLoc DL(Op);
9793 MVT VT = Op.getSimpleValueType();
9794 if (VT.getVectorElementType() == MVT::i1)
9795 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9796
9797 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9798 // from two other 128-bit ones.
9799 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9800 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9801 (VT.is512BitVector() &&
9802 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9803 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9804}
9805
9806//===----------------------------------------------------------------------===//
9807// Vector shuffle lowering
9808//
9809// This is an experimental code path for lowering vector shuffles on x86. It is
9810// designed to handle arbitrary vector shuffles and blends, gracefully
9811// degrading performance as necessary. It works hard to recognize idiomatic
9812// shuffles and lower them to optimal instruction patterns without leaving
9813// a framework that allows reasonably efficient handling of all vector shuffle
9814// patterns.
9815//===----------------------------------------------------------------------===//
9816
9817/// Checks whether the vector elements referenced by two shuffle masks are
9818/// equivalent.
9819static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9820 int Idx, int ExpectedIdx) {
9821 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9822 ExpectedIdx < MaskSize && "Out of range element index");
9823 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9824 return false;
9825
9826 EVT VT = Op.getValueType();
9827 EVT ExpectedVT = ExpectedOp.getValueType();
9828
9829 // Sources must be vectors and match the mask's element count.
9830 if (!VT.isVector() || !ExpectedVT.isVector() ||
9831 (int)VT.getVectorNumElements() != MaskSize ||
9832 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9833 return false;
9834
9835 // Exact match.
9836 if (Idx == ExpectedIdx && Op == ExpectedOp)
9837 return true;
9838
9839 switch (Op.getOpcode()) {
9840 case ISD::BUILD_VECTOR:
9841 // If the values are build vectors, we can look through them to find
9842 // equivalent inputs that make the shuffles equivalent.
9843 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9844 case ISD::BITCAST: {
9845 SDValue Src = Op.getOperand(0);
9846 EVT SrcVT = Src.getValueType();
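// e.g. if Op bitcasts a v2i64 source to v4i32, two v4i32 element indices are
// equivalent when they have the same parity and refer to equivalent i64
// source elements.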
9847 if (Op == ExpectedOp && SrcVT.isVector()) {
9848 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9849 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9850 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9851 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9852 Idx / Scale, ExpectedIdx / Scale);
9853 }
9854 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9856 for (unsigned I = 0; I != Scale; ++I)
9857 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 (Idx * Scale) + I,
9859 (ExpectedIdx * Scale) + I))
9860 return false;
9861 return true;
9862 }
9863 }
9864 break;
9865 }
9866 case ISD::VECTOR_SHUFFLE: {
9867 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9868 return Op == ExpectedOp &&
9869 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9870 }
9871 case X86ISD::VBROADCAST:
9873 return Op == ExpectedOp;
9874 case X86ISD::SUBV_BROADCAST_LOAD:
9875 if (Op == ExpectedOp) {
9876 auto *MemOp = cast<MemSDNode>(Op);
9877 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9878 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9879 }
9880 break;
9881 case X86ISD::VPERMI: {
9882 if (Op == ExpectedOp) {
9883 SmallVector<int, 8> Mask;
9884 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9885 SDValue Src = Op.getOperand(0);
9886 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9887 Mask[ExpectedIdx]);
9888 }
9889 break;
9890 }
9891 case X86ISD::HADD:
9892 case X86ISD::HSUB:
9893 case X86ISD::FHADD:
9894 case X86ISD::FHSUB:
9895 case X86ISD::PACKSS:
9896 case X86ISD::PACKUS:
9897 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9898 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9899 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9900 int NumElts = VT.getVectorNumElements();
9901 int NumLanes = VT.getSizeInBits() / 128;
9902 int NumEltsPerLane = NumElts / NumLanes;
9903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9904 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9905 bool SameElt =
9906 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9907 return SameLane && SameElt;
9908 }
9909 break;
9910 }
9911
9912 return false;
9913}
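// Illustrative example (not part of the original source): for a bitcast such
// as v2i64 -> v4i32, Scale = 64 / 32 = 2, so i32 elements 0 and 1 both alias
// i64 element 0. IsElementEquivalent(4, Op, Op, 0, 2) therefore requires
// (0 % 2) == (2 % 2) and that the underlying i64 elements 0 and 1 are
// themselves equivalent (e.g. a splatted BUILD_VECTOR source).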
9914
9915/// Tiny helper function to identify a no-op mask.
9916///
9917/// This is a somewhat boring predicate function. It checks whether the mask
9918/// array input, which is assumed to be a single-input shuffle mask of the kind
9919/// used by the X86 shuffle instructions (not a fully general
9920/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9921/// in-place shuffle are 'no-op's.
9922static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9923 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9924 assert(Mask[i] >= -1 && "Out of bound mask element!");
9925 if (Mask[i] >= 0 && Mask[i] != i)
9926 return false;
9927 }
9928 return true;
9929}
9930
9931/// Test whether there are elements crossing LaneSizeInBits lanes in this
9932/// shuffle mask.
9933///
9934/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9935/// and we routinely test for these.
9936static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9937 unsigned ScalarSizeInBits,
9938 ArrayRef<int> Mask) {
9939 assert(LaneSizeInBits && ScalarSizeInBits &&
9940 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9941 "Illegal shuffle lane size");
9942 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9943 int Size = Mask.size();
9944 for (int i = 0; i < Size; ++i)
9945 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9946 return true;
9947 return false;
9948}
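// Illustrative example (not part of the original source): for v8f32 the lane
// size is 128 / 32 = 4 elements, so mask <0,1,2,3,4,5,6,7> is not
// lane-crossing, while <4,5,6,7,0,1,2,3> moves elements across the 128-bit
// boundary and is reported as lane-crossing.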
9949
9950/// Test whether there are elements crossing 128-bit lanes in this
9951/// shuffle mask.
9952static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9953 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9954}
9955
9956/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9957/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9958/// better support 'repeated mask + lane permute' style shuffles.
9959static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9960 unsigned ScalarSizeInBits,
9961 ArrayRef<int> Mask) {
9962 assert(LaneSizeInBits && ScalarSizeInBits &&
9963 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9964 "Illegal shuffle lane size");
9965 int NumElts = Mask.size();
9966 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9967 int NumLanes = NumElts / NumEltsPerLane;
9968 if (NumLanes > 1) {
9969 for (int i = 0; i != NumLanes; ++i) {
9970 int SrcLane = -1;
9971 for (int j = 0; j != NumEltsPerLane; ++j) {
9972 int M = Mask[(i * NumEltsPerLane) + j];
9973 if (M < 0)
9974 continue;
9975 int Lane = (M % NumElts) / NumEltsPerLane;
9976 if (SrcLane >= 0 && SrcLane != Lane)
9977 return true;
9978 SrcLane = Lane;
9979 }
9980 }
9981 }
9982 return false;
9983}
9984
9985/// Test whether a shuffle mask is equivalent within each sub-lane.
9986///
9987/// This checks a shuffle mask to see if it is performing the same
9988/// lane-relative shuffle in each sub-lane. This trivially implies
9989/// that it is also not lane-crossing. It may however involve a blend from the
9990/// same lane of a second vector.
9991///
9992/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9993/// non-trivial to compute in the face of undef lanes. The representation is
9994/// suitable for use with existing 128-bit shuffles as entries from the second
9995/// vector have been remapped to [LaneSize, 2*LaneSize).
9996static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9997 ArrayRef<int> Mask,
9998 SmallVectorImpl<int> &RepeatedMask) {
9999 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10000 RepeatedMask.assign(LaneSize, -1);
10001 int Size = Mask.size();
10002 for (int i = 0; i < Size; ++i) {
10003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10004 if (Mask[i] < 0)
10005 continue;
10006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10007 // This entry crosses lanes, so there is no way to model this shuffle.
10008 return false;
10009
10010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10011 // Adjust second vector indices to start at LaneSize instead of Size.
10012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10013 : Mask[i] % LaneSize + LaneSize;
10014 if (RepeatedMask[i % LaneSize] < 0)
10015 // This is the first non-undef entry in this slot of a 128-bit lane.
10016 RepeatedMask[i % LaneSize] = LocalM;
10017 else if (RepeatedMask[i % LaneSize] != LocalM)
10018 // Found a mismatch with the repeated mask.
10019 return false;
10020 }
10021 return true;
10022}
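// Illustrative example (not part of the original source): for v8f32 with
// 128-bit lanes (LaneSize = 4), the mask <0,8,1,9, 4,12,5,13> performs the
// same per-lane pattern in both lanes; second-vector indices are remapped
// into [4, 8), giving RepeatedMask = {0, 4, 1, 5}.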
10023
10024/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10025static bool
10026is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10027 SmallVectorImpl<int> &RepeatedMask) {
10028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10029}
10030
10031static bool
10032is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10033 SmallVector<int, 32> RepeatedMask;
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10038static bool
10039is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10040 SmallVectorImpl<int> &RepeatedMask) {
10041 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10042}
10043
10044/// Test whether a target shuffle mask is equivalent within each sub-lane.
10045/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10046static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10047 unsigned EltSizeInBits,
10048 ArrayRef<int> Mask,
10049 SmallVectorImpl<int> &RepeatedMask) {
10050 int LaneSize = LaneSizeInBits / EltSizeInBits;
10051 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10052 int Size = Mask.size();
10053 for (int i = 0; i < Size; ++i) {
10054 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10055 if (Mask[i] == SM_SentinelUndef)
10056 continue;
10057 if (Mask[i] == SM_SentinelZero) {
10058 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10059 return false;
10060 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10061 continue;
10062 }
10063 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10064 // This entry crosses lanes, so there is no way to model this shuffle.
10065 return false;
10066
10067 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10068 // later vector indices to start at multiples of LaneSize instead of Size.
10069 int LaneM = Mask[i] / Size;
10070 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10071 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10072 // This is the first non-undef entry in this slot of a 128-bit lane.
10073 RepeatedMask[i % LaneSize] = LocalM;
10074 else if (RepeatedMask[i % LaneSize] != LocalM)
10075 // Found a mismatch with the repeated mask.
10076 return false;
10077 }
10078 return true;
10079}
10080
10081/// Test whether a target shuffle mask is equivalent within each sub-lane.
10082/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10083static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10084 ArrayRef<int> Mask,
10085 SmallVectorImpl<int> &RepeatedMask) {
10086 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10087 Mask, RepeatedMask);
10088}
10089
10090/// Checks whether a shuffle mask is equivalent to an explicit list of
10091/// arguments.
10092///
10093/// This is a fast way to test a shuffle mask against a fixed pattern:
10094///
10095///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10096///
10097/// It returns true if the mask is exactly as wide as the argument list, and
10098/// each element of the mask is either -1 (signifying undef) or the value given
10099/// in the argument.
10100static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10101 SDValue V1 = SDValue(),
10102 SDValue V2 = SDValue()) {
10103 int Size = Mask.size();
10104 if (Size != (int)ExpectedMask.size())
10105 return false;
10106
10107 for (int i = 0; i < Size; ++i) {
10108 assert(Mask[i] >= -1 && "Out of bound mask element!");
10109 int MaskIdx = Mask[i];
10110 int ExpectedIdx = ExpectedMask[i];
10111 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10112 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10113 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10114 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10115 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10116 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10117 return false;
10118 }
10119 }
10120 return true;
10121}
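// Usage sketch (not part of the original source): a v4f32 reversal can be
// matched with
//   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
// and passing V1/V2 lets IsElementEquivalent look through BUILD_VECTOR,
// bitcast, etc. when the raw indices differ but refer to equal elements.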
10122
10123/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10124///
10125/// The masks must be exactly the same width.
10126///
10127/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10128/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10129///
10130/// SM_SentinelZero is accepted as a valid negative index but must match in
10131/// both, or via a known bits test.
10132static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10133 ArrayRef<int> ExpectedMask,
10134 const SelectionDAG &DAG,
10135 SDValue V1 = SDValue(),
10136 SDValue V2 = SDValue()) {
10137 int Size = Mask.size();
10138 if (Size != (int)ExpectedMask.size())
10139 return false;
10140 assert(llvm::all_of(ExpectedMask,
10141 [Size](int M) {
10142 return M == SM_SentinelZero ||
10143 isInRange(M, 0, 2 * Size);
10144 }) &&
10145 "Illegal target shuffle mask");
10146
10147 // Check for out-of-range target shuffle mask indices.
10148 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10149 return false;
10150
10151 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10152 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V1.getValueType().isVector()))
10154 V1 = SDValue();
10155 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10156 !V2.getValueType().isVector()))
10157 V2 = SDValue();
10158
10159 APInt ZeroV1 = APInt::getZero(Size);
10160 APInt ZeroV2 = APInt::getZero(Size);
10161
10162 for (int i = 0; i < Size; ++i) {
10163 int MaskIdx = Mask[i];
10164 int ExpectedIdx = ExpectedMask[i];
10165 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10166 continue;
10167 // If we failed to match an expected SM_SentinelZero then early out.
10168 if (ExpectedIdx < 0)
10169 return false;
10170 if (MaskIdx == SM_SentinelZero) {
10171 // If we need this expected index to be a zero element, then update the
10172 // relevant zero mask and perform the known bits at the end to minimize
10173 // repeated computes.
10174 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10175 if (ExpectedV &&
10176 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10177 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10179 ZeroMask.setBit(BitIdx);
10180 continue;
10181 }
10182 }
10183 if (MaskIdx >= 0) {
10184 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10185 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10186 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10187 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10188 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10189 continue;
10190 }
10191 return false;
10192 }
10193 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10194 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10195}
10196
10197// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10198// instructions.
10199static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10200 const SelectionDAG &DAG) {
10201 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10202 return false;
10203
10204 SmallVector<int, 8> Unpcklwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10206 /* Unary = */ false);
10207 SmallVector<int, 8> Unpckhwd;
10208 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10209 /* Unary = */ false);
10210 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10211 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10212 return IsUnpackwdMask;
10213}
10214
10215static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10216 const SelectionDAG &DAG) {
10217 // Create 128-bit vector type based on mask size.
10218 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10219 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10220
10221 // We can't assume a canonical shuffle mask, so try the commuted version too.
10222 SmallVector<int, 4> CommutedMask(Mask);
10223  ShuffleVectorSDNode::commuteMask(CommutedMask);
10224
10225 // Match any of unary/binary or low/high.
10226 for (unsigned i = 0; i != 4; ++i) {
10227 SmallVector<int, 16> UnpackMask;
10228 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10229 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10230 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10231 return true;
10232 }
10233 return false;
10234}
10235
10236/// Return true if a shuffle mask chooses elements identically in its top and
10237/// bottom halves. For example, any splat mask has the same top and bottom
10238/// halves. If an element is undefined in only one half of the mask, the halves
10239/// are not considered identical.
10240static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10241 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10242 unsigned HalfSize = Mask.size() / 2;
10243 for (unsigned i = 0; i != HalfSize; ++i) {
10244 if (Mask[i] != Mask[i + HalfSize])
10245 return false;
10246 }
10247 return true;
10248}
10249
10250/// Get a 4-lane 8-bit shuffle immediate for a mask.
10251///
10252/// This helper function produces an 8-bit shuffle immediate corresponding to
10253/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10254/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10255/// example.
10256///
10257/// NB: We rely heavily on "undef" masks preserving the input lane.
10258static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10259 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10260 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10261 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10262 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10263 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10264
10265 // If the mask only uses one non-undef element, then fully 'splat' it to
10266 // improve later broadcast matching.
10267 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10268 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10269
10270 int FirstElt = Mask[FirstIndex];
10271 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10272 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10273
10274 unsigned Imm = 0;
10275 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10276 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10277 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10278 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10279 return Imm;
10280}
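// Worked example (not part of the original source): Mask = {3, 1, 2, 0}
// encodes as 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27, while a mask with a
// single defined element such as {-1, 2, -1, -1} is splatted to
// 2 | (2 << 2) | (2 << 4) | (2 << 6) = 0xAA to help broadcast matching.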
10281
10282static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10283 SelectionDAG &DAG) {
10284 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10285}
10286
10287// Canonicalize SHUFPD mask to improve chances of further folding.
10288// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10289static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10290 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10291 "Unexpected SHUFPD mask size");
10292 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10293 "Unexpected SHUFPD mask elements");
10294
10295 // If the mask only uses one non-undef element, then fully 'splat' it to
10296 // improve later broadcast matching.
10297 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10298 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10299 "All undef shuffle mask");
10300
10301 int FirstElt = Mask[FirstIndex];
10302 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10303 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10304 unsigned Imm = 0;
10305 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10306 Imm |= FirstElt << I;
10307 return Imm;
10308 }
10309
10310 // Attempt to keep any undef elements in place to improve chances of the
10311 // shuffle becoming a (commutative) blend.
10312 unsigned Imm = 0;
10313 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10314 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10315
10316 return Imm;
10317}
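// Worked example (not part of the original source): for the v4f64 SHUFPD mask
// {1, -1, 0, -1}, the undef elements keep their natural lo/hi bit (I & 1),
// giving 1 | (1 << 1) | (0 << 2) | (1 << 3) = 0xB; a splat such as
// {1, 1, -1, -1} takes the splat path and becomes 0xF.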
10318
10319static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10320 SelectionDAG &DAG) {
10321 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10322}
10323
10324// The shuffle result is as follows:
10325//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in
10326//   ascending order. Each Zeroable element corresponds to a particular Mask
10327//   element, as described in the computeZeroableShuffleElements function.
10328//
10329// The function looks for a sub-mask whose nonzero elements are in
10330// increasing order; if such a sub-mask exists, the function returns true.
10331static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10332 ArrayRef<int> Mask, const EVT &VectorType,
10333 bool &IsZeroSideLeft) {
10334 int NextElement = -1;
10335 // Check if the Mask's nonzero elements are in increasing order.
10336 for (int i = 0, e = Mask.size(); i < e; i++) {
10337    // Check that the mask's zeroed elements are built only from zeros.
10338 assert(Mask[i] >= -1 && "Out of bound mask element!");
10339 if (Mask[i] < 0)
10340 return false;
10341 if (Zeroable[i])
10342 continue;
10343    // Find the lowest non-zero element.
10344 if (NextElement < 0) {
10345 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10346 IsZeroSideLeft = NextElement != 0;
10347 }
10348    // Exit if the mask's non-zero elements are not in increasing order.
10349 if (NextElement != Mask[i])
10350 return false;
10351 NextElement++;
10352 }
10353 return true;
10354}
10355
10356static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10357                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10358 const X86Subtarget &Subtarget,
10359 unsigned Depth = 0);
10360
10361/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10362static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10363 ArrayRef<int> Mask, SDValue V1,
10364 SDValue V2, const APInt &Zeroable,
10365 const X86Subtarget &Subtarget,
10366 SelectionDAG &DAG) {
10367 int Size = Mask.size();
10368 int LaneSize = 128 / VT.getScalarSizeInBits();
10369 const int NumBytes = VT.getSizeInBits() / 8;
10370 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10371
10372 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10373 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10374 (Subtarget.hasBWI() && VT.is512BitVector()));
10375
10376 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10377 // Sign bit set in i8 mask means zero element.
10378 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10379
10380 SDValue V;
10381 for (int i = 0; i < NumBytes; ++i) {
10382 int M = Mask[i / NumEltBytes];
10383 if (M < 0) {
10384 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10385 continue;
10386 }
10387 if (Zeroable[i / NumEltBytes]) {
10388 PSHUFBMask[i] = ZeroMask;
10389 continue;
10390 }
10391
10392 // We can only use a single input of V1 or V2.
10393 SDValue SrcV = (M >= Size ? V2 : V1);
10394 if (V && V != SrcV)
10395 return SDValue();
10396 V = SrcV;
10397 M %= Size;
10398
10399 // PSHUFB can't cross lanes, ensure this doesn't happen.
10400 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10401 return SDValue();
10402
10403 M = M % LaneSize;
10404 M = M * NumEltBytes + (i % NumEltBytes);
10405 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10406 }
10407 assert(V && "Failed to find a source input");
10408
10409 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10410 return DAG.getBitcast(
10411 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10412 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10413}
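// Illustrative example (not part of the original source): a v4i32 shuffle
// <1,1,2,3> with nothing zeroable expands to the byte-level PSHUFB control
//   {4,5,6,7, 4,5,6,7, 8,9,10,11, 12,13,14,15},
// i.e. each i32 index M becomes bytes M*4 .. M*4+3 within its 128-bit lane.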
10414
10415static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10416 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10417 const SDLoc &dl);
10418
10419// X86 has dedicated shuffle that can be lowered to VEXPAND
10420static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10421 SDValue V2, ArrayRef<int> Mask,
10422 const APInt &Zeroable,
10423 const X86Subtarget &Subtarget,
10424 SelectionDAG &DAG) {
10425 bool IsLeftZeroSide = true;
10426 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10427 IsLeftZeroSide))
10428 return SDValue();
10429 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10430  MVT IntegerType =
10431 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10432 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10433 unsigned NumElts = VT.getVectorNumElements();
10434 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10435 "Unexpected number of vector elements");
10436 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10437 Subtarget, DAG, DL);
10438 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10439 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10440 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10441}
10442
10443static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10444 unsigned &UnpackOpcode, bool IsUnary,
10445 ArrayRef<int> TargetMask, const SDLoc &DL,
10446 SelectionDAG &DAG,
10447 const X86Subtarget &Subtarget) {
10448 int NumElts = VT.getVectorNumElements();
10449
10450 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10451 for (int i = 0; i != NumElts; i += 2) {
10452 int M1 = TargetMask[i + 0];
10453 int M2 = TargetMask[i + 1];
10454 Undef1 &= (SM_SentinelUndef == M1);
10455 Undef2 &= (SM_SentinelUndef == M2);
10456 Zero1 &= isUndefOrZero(M1);
10457 Zero2 &= isUndefOrZero(M2);
10458 }
10459 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10460 "Zeroable shuffle detected");
10461
10462 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10463 SmallVector<int, 64> Unpckl, Unpckh;
10464 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10465 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10466 (IsUnary ? V1 : V2))) {
10467 UnpackOpcode = X86ISD::UNPCKL;
10468 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10469 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10470 return true;
10471 }
10472
10473 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10474 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10475 (IsUnary ? V1 : V2))) {
10476 UnpackOpcode = X86ISD::UNPCKH;
10477 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10478 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10479 return true;
10480 }
10481
10482 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10483 if (IsUnary && (Zero1 || Zero2)) {
10484 // Don't bother if we can blend instead.
10485 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10486 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10487 return false;
10488
10489 bool MatchLo = true, MatchHi = true;
10490 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10491 int M = TargetMask[i];
10492
10493 // Ignore if the input is known to be zero or the index is undef.
10494 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10495 (M == SM_SentinelUndef))
10496 continue;
10497
10498 MatchLo &= (M == Unpckl[i]);
10499 MatchHi &= (M == Unpckh[i]);
10500 }
10501
10502 if (MatchLo || MatchHi) {
10503 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10504 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10506 return true;
10507 }
10508 }
10509
10510 // If a binary shuffle, commute and try again.
10511 if (!IsUnary) {
10512    ShuffleVectorSDNode::commuteMask(Unpckl);
10513 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10514 UnpackOpcode = X86ISD::UNPCKL;
10515 std::swap(V1, V2);
10516 return true;
10517 }
10518
10519    ShuffleVectorSDNode::commuteMask(Unpckh);
10520 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10521 UnpackOpcode = X86ISD::UNPCKH;
10522 std::swap(V1, V2);
10523 return true;
10524 }
10525 }
10526
10527 return false;
10528}
10529
10530// X86 has dedicated unpack instructions that can handle specific blend
10531// operations: UNPCKH and UNPCKL.
10532static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 SmallVector<int, 8> Unpckl;
10536 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10539
10540 SmallVector<int, 8> Unpckh;
10541 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10542 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10544
10545 // Commute and try again.
10546  ShuffleVectorSDNode::commuteMask(Unpckl);
10547 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10549
10550  ShuffleVectorSDNode::commuteMask(Unpckh);
10551 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10553
10554 return SDValue();
10555}
10556
10557/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10558/// followed by unpack 256-bit.
10559static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10560 SDValue V2, ArrayRef<int> Mask,
10561 SelectionDAG &DAG) {
10562 SmallVector<int, 32> Unpckl, Unpckh;
10563 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10564 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10565
10566 unsigned UnpackOpcode;
10567 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10568 UnpackOpcode = X86ISD::UNPCKL;
10569 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10570 UnpackOpcode = X86ISD::UNPCKH;
10571 else
10572 return SDValue();
10573
10574 // This is a "natural" unpack operation (rather than the 128-bit sectored
10575 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10576 // input in order to use the x86 instruction.
10577 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10578 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10579 V1 = DAG.getBitcast(VT, V1);
10580 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10581}
10582
10583// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10584// source into the lower elements and zeroing the upper elements.
10585static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10586 ArrayRef<int> Mask, const APInt &Zeroable,
10587 const X86Subtarget &Subtarget) {
10588 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10589 return false;
10590
10591 unsigned NumElts = Mask.size();
10592 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10593 unsigned MaxScale = 64 / EltSizeInBits;
10594
10595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10596 unsigned SrcEltBits = EltSizeInBits * Scale;
10597 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10598 continue;
10599 unsigned NumSrcElts = NumElts / Scale;
10600 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10601 continue;
10602 unsigned UpperElts = NumElts - NumSrcElts;
10603 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10604 continue;
10605 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10606 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10607 DstVT = MVT::getIntegerVT(EltSizeInBits);
10608 if ((NumSrcElts * EltSizeInBits) >= 128) {
10609 // ISD::TRUNCATE
10610 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10611 } else {
10612 // X86ISD::VTRUNC
10613 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10614 }
10615 return true;
10616 }
10617
10618 return false;
10619}
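// Illustrative example (not part of the original source), assuming a subtarget
// with AVX512VL+AVX512BW: for v16i8 and Scale = 2, the mask
//   <0,2,4,6,8,10,12,14, z,z,z,z,z,z,z,z>   (z = known zero)
// matches with SrcVT = v8i16; since 8 x i8 is only 64 bits, DstVT is set to
// the 128-bit v16i8 form, i.e. the X86ISD::VTRUNC path.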
10620
10621// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10622// element padding to the final DstVT.
10623static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10624 const X86Subtarget &Subtarget,
10625 SelectionDAG &DAG, bool ZeroUppers) {
10626 MVT SrcVT = Src.getSimpleValueType();
10627 MVT DstSVT = DstVT.getScalarType();
10628 unsigned NumDstElts = DstVT.getVectorNumElements();
10629 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10630 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10631
10632 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10633 return SDValue();
10634
10635 // Perform a direct ISD::TRUNCATE if possible.
10636 if (NumSrcElts == NumDstElts)
10637 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10638
10639 if (NumSrcElts > NumDstElts) {
10640 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10641 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10642 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10643 }
10644
10645 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10649 DstVT.getSizeInBits());
10650 }
10651
10652 // Non-VLX targets must truncate from a 512-bit type, so we need to
10653 // widen, truncate and then possibly extract the original subvector.
10654 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10655 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10656 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10657 }
10658
10659 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10660 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10661 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10662 if (DstVT != TruncVT)
10663 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10664 DstVT.getSizeInBits());
10665 return Trunc;
10666}
10667
10668// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10669//
10670// An example is the following:
10671//
10672// t0: ch = EntryToken
10673// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10674// t25: v4i32 = truncate t2
10675// t41: v8i16 = bitcast t25
10676// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10677// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10678// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10679// t18: v2i64 = bitcast t51
10680//
10681// One can just use a single vpmovdw instruction; without avx512vl we need to
10682// use the zmm variant and extract the lower subvector, padding with zeroes.
10683// TODO: Merge with lowerShuffleAsVTRUNC.
10684static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10685 SDValue V2, ArrayRef<int> Mask,
10686 const APInt &Zeroable,
10687 const X86Subtarget &Subtarget,
10688 SelectionDAG &DAG) {
10689 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10690 if (!Subtarget.hasAVX512())
10691 return SDValue();
10692
10693 unsigned NumElts = VT.getVectorNumElements();
10694 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10695 unsigned MaxScale = 64 / EltSizeInBits;
10696 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10697 unsigned SrcEltBits = EltSizeInBits * Scale;
10698 unsigned NumSrcElts = NumElts / Scale;
10699 unsigned UpperElts = NumElts - NumSrcElts;
10700 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10701 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10702 continue;
10703
10704 // Attempt to find a matching source truncation, but as a fall back VLX
10705 // cases can use the VPMOV directly.
10706 SDValue Src = peekThroughBitcasts(V1);
10707 if (Src.getOpcode() == ISD::TRUNCATE &&
10708 Src.getScalarValueSizeInBits() == SrcEltBits) {
10709 Src = Src.getOperand(0);
10710 } else if (Subtarget.hasVLX()) {
10711 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10713 Src = DAG.getBitcast(SrcVT, Src);
10714 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10715 if (Scale == 2 &&
10716 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10717 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10718 return SDValue();
10719 } else
10720 return SDValue();
10721
10722 // VPMOVWB is only available with avx512bw.
10723 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10724 return SDValue();
10725
10726 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10727 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10728 }
10729
10730 return SDValue();
10731}
10732
10733// Attempt to match binary shuffle patterns as a truncate.
10734static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10735 SDValue V2, ArrayRef<int> Mask,
10736 const APInt &Zeroable,
10737 const X86Subtarget &Subtarget,
10738 SelectionDAG &DAG) {
10739 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10740 "Unexpected VTRUNC type");
10741 if (!Subtarget.hasAVX512() ||
10742 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10743 return SDValue();
10744
10745 unsigned NumElts = VT.getVectorNumElements();
10746 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10747 unsigned MaxScale = 64 / EltSizeInBits;
10748 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10749 // TODO: Support non-BWI VPMOVWB truncations?
10750 unsigned SrcEltBits = EltSizeInBits * Scale;
10751 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10752 continue;
10753
10754 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10755 // Bail if the V2 elements are undef.
10756 unsigned NumHalfSrcElts = NumElts / Scale;
10757 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10758 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10759 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10760 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10761 continue;
10762
10763 // The elements beyond the truncation must be undef/zero.
10764 unsigned UpperElts = NumElts - NumSrcElts;
10765 if (UpperElts > 0 &&
10766 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10767 continue;
10768 bool UndefUppers =
10769 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10770
10771 // As we're using both sources then we need to concat them together
10772 // and truncate from the double-sized src.
10773 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10774
10775 // For offset truncations, ensure that the concat is cheap.
10776 SDValue Src =
10777 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10778 if (!Src) {
10779 if (Offset)
10780 continue;
10781 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10782 }
10783
10784 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10785 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10786 Src = DAG.getBitcast(SrcVT, Src);
10787
10788 // Shift the offset'd elements into place for the truncation.
10789 // TODO: Use getTargetVShiftByConstNode.
10790 if (Offset)
10791 Src = DAG.getNode(
10792 X86ISD::VSRLI, DL, SrcVT, Src,
10793 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10794
10795 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10796 }
10797 }
10798
10799 return SDValue();
10800}
10801
10802/// Check whether a compaction lowering can be done by dropping even/odd
10803/// elements and compute how many times even/odd elements must be dropped.
10804///
10805/// This handles shuffles which take every Nth element where N is a power of
10806/// two. Example shuffle masks:
10807///
10808/// (even)
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10810/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10811/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10812/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10813/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10814/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10815///
10816/// (odd)
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10818/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10819///
10820/// Any of these lanes can of course be undef.
10821///
10822/// This routine only supports N <= 3.
10823/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10824/// for larger N.
10825///
10826/// \returns N above, or the number of times even/odd elements must be dropped
10827/// if there is such a number. Otherwise returns zero.
10828static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10829 bool IsSingleInput) {
10830 // The modulus for the shuffle vector entries is based on whether this is
10831 // a single input or not.
10832 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10833 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10834 "We should only be called with masks with a power-of-2 size!");
10835
10836 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10837 int Offset = MatchEven ? 0 : 1;
10838
10839 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10840 // and 2^3 simultaneously. This is because we may have ambiguity with
10841 // partially undef inputs.
10842 bool ViableForN[3] = {true, true, true};
10843
10844 for (int i = 0, e = Mask.size(); i < e; ++i) {
10845 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10846 // want.
10847 if (Mask[i] < 0)
10848 continue;
10849
10850 bool IsAnyViable = false;
10851 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10852 if (ViableForN[j]) {
10853 uint64_t N = j + 1;
10854
10855 // The shuffle mask must be equal to (i * 2^N) % M.
10856 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10857 IsAnyViable = true;
10858 else
10859 ViableForN[j] = false;
10860 }
10861 // Early exit if we exhaust the possible powers of two.
10862 if (!IsAnyViable)
10863 break;
10864 }
10865
10866 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10867 if (ViableForN[j])
10868 return j + 1;
10869
10870 // Return 0 as there is no viable power of two.
10871 return 0;
10872}
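// Illustrative example (not part of the original source): for a two-input
// v16i8 shuffle, the mask <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30> keeps
// every 2nd element and returns N = 1; a mask keeping every 4th element in the
// same wrap-around fashion returns N = 2, and anything else returns 0.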
10873
10874// X86 has dedicated pack instructions that can handle specific truncation
10875// operations: PACKSS and PACKUS.
10876// Checks for compaction shuffle masks if MaxStages > 1.
10877// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10878static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10879 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10880 const SelectionDAG &DAG,
10881 const X86Subtarget &Subtarget,
10882 unsigned MaxStages = 1) {
10883 unsigned NumElts = VT.getVectorNumElements();
10884 unsigned BitSize = VT.getScalarSizeInBits();
10885 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10886 "Illegal maximum compaction");
10887
10888 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10889 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10890 unsigned NumPackedBits = NumSrcBits - BitSize;
10891 N1 = peekThroughBitcasts(N1);
10892 N2 = peekThroughBitcasts(N2);
10893 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10894 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10895 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10896 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10897 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10898 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10899 return false;
10900 if (Subtarget.hasSSE41() || BitSize == 8) {
10901 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10902 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10903 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10904 V1 = N1;
10905 V2 = N2;
10906 SrcVT = PackVT;
10907 PackOpcode = X86ISD::PACKUS;
10908 return true;
10909 }
10910 }
10911 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10912 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10913 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10914 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10915 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10916 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10917 V1 = N1;
10918 V2 = N2;
10919 SrcVT = PackVT;
10920 PackOpcode = X86ISD::PACKSS;
10921 return true;
10922 }
10923 return false;
10924 };
10925
10926 // Attempt to match against wider and wider compaction patterns.
10927 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10928 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10929 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10930
10931 // Try binary shuffle.
10932 SmallVector<int, 32> BinaryMask;
10933 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10934 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10935 if (MatchPACK(V1, V2, PackVT))
10936 return true;
10937
10938 // Try unary shuffle.
10939 SmallVector<int, 32> UnaryMask;
10940 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10941 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10942 if (MatchPACK(V1, V1, PackVT))
10943 return true;
10944 }
10945
10946 return false;
10947}
10948
10949static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10950 SDValue V2, ArrayRef<int> Mask,
10951 const X86Subtarget &Subtarget,
10952 SelectionDAG &DAG) {
10953 MVT PackVT;
10954 unsigned PackOpcode;
10955 unsigned SizeBits = VT.getSizeInBits();
10956 unsigned EltBits = VT.getScalarSizeInBits();
10957 unsigned MaxStages = Log2_32(64 / EltBits);
10958 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10959 Subtarget, MaxStages))
10960 return SDValue();
10961
10962 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10963 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10964
10965 // Don't lower multi-stage packs on AVX512, truncation is better.
10966 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10967 return SDValue();
10968
10969 // Pack to the largest type possible:
10970 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10971 unsigned MaxPackBits = 16;
10972 if (CurrentEltBits > 16 &&
10973 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10974 MaxPackBits = 32;
10975
10976 // Repeatedly pack down to the target size.
10977 SDValue Res;
10978 for (unsigned i = 0; i != NumStages; ++i) {
10979 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10980 unsigned NumSrcElts = SizeBits / SrcEltBits;
10981 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10982 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10983 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10984 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10985 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10986 DAG.getBitcast(SrcVT, V2));
10987 V1 = V2 = Res;
10988 CurrentEltBits /= 2;
10989 }
10990 assert(Res && Res.getValueType() == VT &&
10991 "Failed to lower compaction shuffle");
10992 return Res;
10993}
10994
10995/// Try to emit a bitmask instruction for a shuffle.
10996///
10997/// This handles cases where we can model a blend exactly as a bitmask due to
10998/// one of the inputs being zeroable.
10999static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11000 SDValue V2, ArrayRef<int> Mask,
11001 const APInt &Zeroable,
11002 const X86Subtarget &Subtarget,
11003 SelectionDAG &DAG) {
11004 MVT MaskVT = VT;
11005 MVT EltVT = VT.getVectorElementType();
11006 SDValue Zero, AllOnes;
11007 // Use f64 if i64 isn't legal.
11008 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11009 EltVT = MVT::f64;
11010 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11011 }
11012
11013 MVT LogicVT = VT;
11014 if (EltVT.isFloatingPoint()) {
11015 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11016 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11017 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11018 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11019 } else {
11020 Zero = DAG.getConstant(0, DL, EltVT);
11021 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11022 }
11023
11024 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11025 SDValue V;
11026 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11027 if (Zeroable[i])
11028 continue;
11029 if (Mask[i] % Size != i)
11030 return SDValue(); // Not a blend.
11031 if (!V)
11032 V = Mask[i] < Size ? V1 : V2;
11033 else if (V != (Mask[i] < Size ? V1 : V2))
11034 return SDValue(); // Can only let one input through the mask.
11035
11036 VMaskOps[i] = AllOnes;
11037 }
11038 if (!V)
11039 return SDValue(); // No non-zeroable elements!
11040
11041 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11042 VMask = DAG.getBitcast(LogicVT, VMask);
11043 V = DAG.getBitcast(LogicVT, V);
11044 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11045 return DAG.getBitcast(VT, And);
11046}
11047
11048/// Try to emit a blend instruction for a shuffle using bit math.
11049///
11050/// This is used as a fallback approach when first class blend instructions are
11051/// unavailable. Currently it is only suitable for integer vectors, but could
11052/// be generalized for floating point vectors if desirable.
11053static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11054 SDValue V2, ArrayRef<int> Mask,
11055 SelectionDAG &DAG) {
11056 assert(VT.isInteger() && "Only supports integer vector types!");
11057 MVT EltVT = VT.getVectorElementType();
11058 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11059 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11060  SmallVector<SDValue, 16> MaskOps;
11061 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11062 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11063 return SDValue(); // Shuffled input!
11064 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11065 }
11066
11067 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11068 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11069}
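// Illustrative example (not part of the original source): the v4i32 mask
// <0,5,2,7> yields V1Mask = <-1, 0, -1, 0>, and getBitSelect then computes
// roughly (V1 & V1Mask) | (V2 & ~V1Mask), which can fold to AND/ANDN/OR or a
// single VPTERNLOG where available.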
11070
11071static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11072 SDValue PreservedSrc,
11073 const X86Subtarget &Subtarget,
11074 SelectionDAG &DAG);
11075
11076static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11077                                MutableArrayRef<int> Mask,
11078 const APInt &Zeroable, bool &ForceV1Zero,
11079 bool &ForceV2Zero, uint64_t &BlendMask) {
11080 bool V1IsZeroOrUndef =
11081      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11082 bool V2IsZeroOrUndef =
11083      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11084
11085 BlendMask = 0;
11086 ForceV1Zero = false, ForceV2Zero = false;
11087 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11088
11089 int NumElts = Mask.size();
11090 int NumLanes = VT.getSizeInBits() / 128;
11091 int NumEltsPerLane = NumElts / NumLanes;
11092 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11093
11094 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11095 // then ensure the blend mask part for that lane just references that input.
11096 bool ForceWholeLaneMasks =
11097 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11098
11099 // Attempt to generate the binary blend mask. If an input is zero then
11100 // we can use any lane.
11101 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11102 // Keep track of the inputs used per lane.
11103 bool LaneV1InUse = false;
11104 bool LaneV2InUse = false;
11105 uint64_t LaneBlendMask = 0;
11106 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11107 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11108 int M = Mask[Elt];
11109 if (M == SM_SentinelUndef)
11110 continue;
11111 if (M == Elt || (0 <= M && M < NumElts &&
11112 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11113 Mask[Elt] = Elt;
11114 LaneV1InUse = true;
11115 continue;
11116 }
11117 if (M == (Elt + NumElts) ||
11118 (NumElts <= M &&
11119 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 if (Zeroable[Elt]) {
11126 if (V1IsZeroOrUndef) {
11127 ForceV1Zero = true;
11128 Mask[Elt] = Elt;
11129 LaneV1InUse = true;
11130 continue;
11131 }
11132 if (V2IsZeroOrUndef) {
11133 ForceV2Zero = true;
11134 LaneBlendMask |= 1ull << LaneElt;
11135 Mask[Elt] = Elt + NumElts;
11136 LaneV2InUse = true;
11137 continue;
11138 }
11139 }
11140 return false;
11141 }
11142
11143 // If we only used V2 then splat the lane blend mask to avoid any demanded
11144 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11145 // blend mask bit).
11146 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11147 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11148
11149 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11150 }
11151 return true;
11152}
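// Illustrative example (not part of the original source): for v4i32, the mask
// <0,5,2,7> is a legal blend of V1/V2; elements 1 and 3 come from V2, so
// BlendMask = 0b1010. A zeroable element is taken from whichever input is
// already zero/undef, setting ForceV1Zero/ForceV2Zero so the caller
// materializes a true zero vector for that side.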
11153
11154/// Try to emit a blend instruction for a shuffle.
11155///
11156/// This doesn't do any checks for the availability of instructions for blending
11157/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11158/// be matched in the backend with the type given. What it does check for is
11159/// that the shuffle mask is a blend, or convertible into a blend with zero.
11160static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11161 SDValue V2, ArrayRef<int> Original,
11162 const APInt &Zeroable,
11163 const X86Subtarget &Subtarget,
11164 SelectionDAG &DAG) {
11165 uint64_t BlendMask = 0;
11166 bool ForceV1Zero = false, ForceV2Zero = false;
11167 SmallVector<int, 64> Mask(Original);
11168 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11169 BlendMask))
11170 return SDValue();
11171
11172 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11173 if (ForceV1Zero)
11174 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11175 if (ForceV2Zero)
11176 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11177
11178 unsigned NumElts = VT.getVectorNumElements();
11179
11180 switch (VT.SimpleTy) {
11181 case MVT::v4i64:
11182 case MVT::v8i32:
11183 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11184 [[fallthrough]];
11185 case MVT::v4f64:
11186 case MVT::v8f32:
11187 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11188 [[fallthrough]];
11189 case MVT::v2f64:
11190 case MVT::v2i64:
11191 case MVT::v4f32:
11192 case MVT::v4i32:
11193 case MVT::v8i16:
11194 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11195 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11196 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11197 case MVT::v16i16: {
11198 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11199 SmallVector<int, 8> RepeatedMask;
11200 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11201 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11202 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11203 BlendMask = 0;
11204 for (int i = 0; i < 8; ++i)
11205 if (RepeatedMask[i] >= 8)
11206 BlendMask |= 1ull << i;
11207 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11208 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11209 }
11210 // Use PBLENDW for lower/upper lanes and then blend lanes.
11211 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11212 // merge to VSELECT where useful.
11213 uint64_t LoMask = BlendMask & 0xFF;
11214 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11215 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11216 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11217 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11218 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11219 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11220 return DAG.getVectorShuffle(
11221 MVT::v16i16, DL, Lo, Hi,
11222 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11223 }
11224 [[fallthrough]];
11225 }
11226 case MVT::v32i8:
11227 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11228 [[fallthrough]];
11229 case MVT::v16i8: {
11230 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11231
11232 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11233 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11234 Subtarget, DAG))
11235 return Masked;
11236
11237 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11238 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11239 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11240 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11241 }
11242
11243 // If we have VPTERNLOG, we can use that as a bit blend.
11244 if (Subtarget.hasVLX())
11245 if (SDValue BitBlend =
11246 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11247 return BitBlend;
11248
11249 // Scale the blend by the number of bytes per element.
11250 int Scale = VT.getScalarSizeInBits() / 8;
11251
11252 // This form of blend is always done on bytes. Compute the byte vector
11253 // type.
11254 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11255
11256 // x86 allows load folding with blendvb from the 2nd source operand. But
11257 // we are still using LLVM select here (see comment below), so that's V1.
11258 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11259 // allow that load-folding possibility.
11260 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11261      ShuffleVectorSDNode::commuteMask(Mask);
11262 std::swap(V1, V2);
11263 }
11264
11265 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11266 // mix of LLVM's code generator and the x86 backend. We tell the code
11267 // generator that boolean values in the elements of an x86 vector register
11268 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11269 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11270 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11271 // of the element (the remaining are ignored) and 0 in that high bit would
11272 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11273 // the LLVM model for boolean values in vector elements gets the relevant
11274 // bit set, it is set backwards and over constrained relative to x86's
11275 // actual model.
11276 SmallVector<SDValue, 32> VSELECTMask;
11277 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11278 for (int j = 0; j < Scale; ++j)
11279 VSELECTMask.push_back(
11280 Mask[i] < 0
11281 ? DAG.getUNDEF(MVT::i8)
11282 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11283
11284 V1 = DAG.getBitcast(BlendVT, V1);
11285 V2 = DAG.getBitcast(BlendVT, V2);
11286 return DAG.getBitcast(
11287 VT,
11288 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11289 V1, V2));
11290 }
11291 case MVT::v16f32:
11292 case MVT::v8f64:
11293 case MVT::v8i64:
11294 case MVT::v16i32:
11295 case MVT::v32i16:
11296 case MVT::v64i8: {
11297 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11298 bool OptForSize = DAG.shouldOptForSize();
11299 if (!OptForSize) {
11300 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11301 Subtarget, DAG))
11302 return Masked;
11303 }
11304
11305 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11306 // masked move.
11307 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11308 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11309 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11310 }
11311 default:
11312 llvm_unreachable("Not a supported integer vector type!");
11313 }
11314}
11315
11316/// Try to lower as a blend of elements from two inputs followed by
11317/// a single-input permutation.
11318///
11319/// This matches the pattern where we can blend elements from two inputs and
11320/// then reduce the shuffle to a single-input permutation.
11321static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11322 SDValue V1, SDValue V2,
11323 ArrayRef<int> Mask,
11324 SelectionDAG &DAG,
11325 bool ImmBlends = false) {
11326 // We build up the blend mask while checking whether a blend is a viable way
11327 // to reduce the shuffle.
11328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11330
11331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11332 if (Mask[i] < 0)
11333 continue;
11334
11335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11336
11337 if (BlendMask[Mask[i] % Size] < 0)
11338 BlendMask[Mask[i] % Size] = Mask[i];
11339 else if (BlendMask[Mask[i] % Size] != Mask[i])
11340 return SDValue(); // Can't blend in the needed input!
11341
11342 PermuteMask[i] = Mask[i] % Size;
11343 }
11344
11345 // If only immediate blends, then bail if the blend mask can't be widened to
11346 // i16.
11347 unsigned EltSize = VT.getScalarSizeInBits();
11348 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11349 return SDValue();
11350
11351 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11352 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11353}
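// Illustrative example (not part of the original source): the v4i32 mask
// <2,5,0,7> first blends element-wise as <0,5,2,7> (each slot keeps whichever
// input it will ultimately need) and is then repaired with the single-input
// permute <2,1,0,3>.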
11354
11355/// Try to lower as an unpack of elements from two inputs followed by
11356/// a single-input permutation.
11357///
11358/// This matches the pattern where we can unpack elements from two inputs and
11359/// then reduce the shuffle to a single-input (wider) permutation.
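/// As a worked example (illustrative): a v4i32 shuffle with
/// Mask = <0, 5, 1, 4> is matched as UNPCKL(V1, V2), which produces
/// <V1[0], V2[0], V1[1], V2[1]>, followed by the single-input permute
/// <0, 3, 2, 1>.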
11360 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11361  SDValue V1, SDValue V2,
11362 ArrayRef<int> Mask,
11363 SelectionDAG &DAG) {
11364 int NumElts = Mask.size();
11365 int NumLanes = VT.getSizeInBits() / 128;
11366 int NumLaneElts = NumElts / NumLanes;
11367 int NumHalfLaneElts = NumLaneElts / 2;
11368
11369 bool MatchLo = true, MatchHi = true;
11370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11371
11372 // Determine UNPCKL/UNPCKH type and operand order.
11373 for (int Elt = 0; Elt != NumElts; ++Elt) {
11374 int M = Mask[Elt];
11375 if (M < 0)
11376 continue;
11377
11378 // Normalize the mask value depending on whether it's V1 or V2.
11379 int NormM = M;
11380 SDValue &Op = Ops[Elt & 1];
11381 if (M < NumElts && (Op.isUndef() || Op == V1))
11382 Op = V1;
11383 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11384 Op = V2;
11385 NormM -= NumElts;
11386 } else
11387 return SDValue();
11388
11389 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11390 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11391 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11392 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11393 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11394 if (MatchLoAnyLane || MatchHiAnyLane) {
11395 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11396 "Failed to match UNPCKLO/UNPCKHI");
11397 break;
11398 }
11399 }
11400 MatchLo &= MatchLoAnyLane;
11401 MatchHi &= MatchHiAnyLane;
11402 if (!MatchLo && !MatchHi)
11403 return SDValue();
11404 }
11405 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11406
11407 // Element indices have changed after unpacking. Calculate permute mask
11408 // so that they will be put back to the position as dictated by the
11409 // original shuffle mask indices.
11410 SmallVector<int, 32> PermuteMask(NumElts, -1);
11411 for (int Elt = 0; Elt != NumElts; ++Elt) {
11412 int M = Mask[Elt];
11413 if (M < 0)
11414 continue;
11415 int NormM = M;
11416 if (NumElts <= M)
11417 NormM -= NumElts;
11418 bool IsFirstOp = M < NumElts;
11419 int BaseMaskElt =
11420 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11421 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11422 PermuteMask[Elt] = BaseMaskElt;
11423 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11424 PermuteMask[Elt] = BaseMaskElt + 1;
11425 assert(PermuteMask[Elt] != -1 &&
11426 "Input mask element is defined but failed to assign permute mask");
11427 }
11428
11429 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11430 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11431 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11432}
11433
11434/// Try to lower a shuffle as a permute of the inputs followed by an
11435/// UNPCK instruction.
11436///
11437/// This specifically targets cases where we end up with alternating between
11438/// the two inputs, and so can permute them into something that feeds a single
11439/// UNPCK instruction. Note that this routine only targets integer vectors
11440/// because for floating point vectors we have a generalized SHUFPS lowering
11441/// strategy that handles everything that doesn't *exactly* match an unpack,
11442/// making this clever lowering unnecessary.
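/// As a worked example (illustrative): a v4i32 shuffle with
/// Mask = <1, 4, 0, 5> first permutes V1 with <1, 0, -1, -1> (V2's mask is a
/// no-op), after which a single UNPCKL of the two inputs yields the result.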
11443 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11444  SDValue V1, SDValue V2,
11445 ArrayRef<int> Mask,
11446 const X86Subtarget &Subtarget,
11447 SelectionDAG &DAG) {
11448 int Size = Mask.size();
11449 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11450
11451 // This routine only supports 128-bit integer dual input vectors.
11452 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11453 return SDValue();
11454
11455 int NumLoInputs =
11456 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11457 int NumHiInputs =
11458 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11459
11460 bool UnpackLo = NumLoInputs >= NumHiInputs;
11461
11462 auto TryUnpack = [&](int ScalarSize, int Scale) {
11463 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11464 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11465
11466 for (int i = 0; i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 // Each element of the unpack contains Scale elements from this mask.
11471 int UnpackIdx = i / Scale;
11472
11473 // We only handle the case where V1 feeds the first slots of the unpack.
11474 // We rely on canonicalization to ensure this is the case.
11475 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11476 return SDValue();
11477
11478 // Setup the mask for this input. The indexing is tricky as we have to
11479 // handle the unpack stride.
11480 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11481 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11482 Mask[i] % Size;
11483 }
11484
11485 // If we will have to shuffle both inputs to use the unpack, check whether
11486 // we can just unpack first and shuffle the result. If so, skip this unpack.
11487 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11488 !isNoopShuffleMask(V2Mask))
11489 return SDValue();
11490
11491 // Shuffle the inputs into place.
11492 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11493 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11494
11495 // Cast the inputs to the type we will use to unpack them.
11496 MVT UnpackVT =
11497 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11498 V1 = DAG.getBitcast(UnpackVT, V1);
11499 V2 = DAG.getBitcast(UnpackVT, V2);
11500
11501 // Unpack the inputs and cast the result back to the desired type.
11502 return DAG.getBitcast(
11503 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11504 UnpackVT, V1, V2));
11505 };
11506
11507 // We try each unpack from the largest to the smallest to try and find one
11508 // that fits this mask.
11509 int OrigScalarSize = VT.getScalarSizeInBits();
11510 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11511 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11512 return Unpack;
11513
11514 // If we're shuffling with a zero vector then we're better off not doing
11515 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11516  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11517      ISD::isBuildVectorAllZeros(V2.getNode()))
11518    return SDValue();
11519
11520 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11521 // initial unpack.
11522 if (NumLoInputs == 0 || NumHiInputs == 0) {
11523 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11524 "We have to have *some* inputs!");
11525 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11526
11527 // FIXME: We could consider the total complexity of the permute of each
11528 // possible unpacking. Or at the least we should consider how many
11529 // half-crossings are created.
11530 // FIXME: We could consider commuting the unpacks.
11531
11532 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11533 for (int i = 0; i < Size; ++i) {
11534 if (Mask[i] < 0)
11535 continue;
11536
11537 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11538
11539 PermMask[i] =
11540 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11541 }
11542 return DAG.getVectorShuffle(
11543 VT, DL,
11544 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11545 V1, V2),
11546 DAG.getUNDEF(VT), PermMask);
11547 }
11548
11549 return SDValue();
11550}
11551
11552/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11553/// permuting the elements of the result in place.
11554 static SDValue lowerShuffleAsByteRotateAndPermute(
11555  const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11557 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11558 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11559 (VT.is512BitVector() && !Subtarget.hasBWI()))
11560 return SDValue();
11561
11562 // We don't currently support lane crossing permutes.
11563 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11564 return SDValue();
11565
11566 int Scale = VT.getScalarSizeInBits() / 8;
11567 int NumLanes = VT.getSizeInBits() / 128;
11568 int NumElts = VT.getVectorNumElements();
11569 int NumEltsPerLane = NumElts / NumLanes;
11570
11571 // Determine range of mask elts.
11572 bool Blend1 = true;
11573 bool Blend2 = true;
11574 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11575 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11576 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11577 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11578 int M = Mask[Lane + Elt];
11579 if (M < 0)
11580 continue;
11581 if (M < NumElts) {
11582 Blend1 &= (M == (Lane + Elt));
11583 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11584 M = M % NumEltsPerLane;
11585 Range1.first = std::min(Range1.first, M);
11586 Range1.second = std::max(Range1.second, M);
11587 } else {
11588 M -= NumElts;
11589 Blend2 &= (M == (Lane + Elt));
11590 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11591 M = M % NumEltsPerLane;
11592 Range2.first = std::min(Range2.first, M);
11593 Range2.second = std::max(Range2.second, M);
11594 }
11595 }
11596 }
11597
11598 // Bail if we don't need both elements.
11599 // TODO - it might be worth doing this for unary shuffles if the permute
11600 // can be widened.
11601 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11602 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11603 return SDValue();
11604
11605 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11606 return SDValue();
11607
11608 // Rotate the 2 ops so we can access both ranges, then permute the result.
11609 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11610 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11611 SDValue Rotate = DAG.getBitcast(
11612 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11613 DAG.getBitcast(ByteVT, Lo),
11614 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11615 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11616 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11617 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11618 int M = Mask[Lane + Elt];
11619 if (M < 0)
11620 continue;
11621 if (M < NumElts)
11622 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11623 else
11624 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11625 }
11626 }
11627 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11628 };
11629
11630 // Check if the ranges are small enough to rotate from either direction.
11631 if (Range2.second < Range1.first)
11632 return RotateAndPermute(V1, V2, Range1.first, 0);
11633 if (Range1.second < Range2.first)
11634 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11635 return SDValue();
11636}
11637
11638 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11639  return isUndefOrEqual(Mask, 0);
11640}
11641
11642 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11643  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11644}
11645
11646/// Check if the Mask consists of the same element repeated multiple times.
11647 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11648  size_t NumUndefs = 0;
11649 std::optional<int> UniqueElt;
11650 for (int Elt : Mask) {
11651 if (Elt == SM_SentinelUndef) {
11652 NumUndefs++;
11653 continue;
11654 }
11655 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11656 return false;
11657 UniqueElt = Elt;
11658 }
11659  // Make sure the element is repeated enough times by checking that the
11660  // number of undefs is small.
11661 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11662}
11663
11664/// Generic routine to decompose a shuffle and blend into independent
11665/// blends and permutes.
11666///
11667/// This matches the extremely common pattern for handling combined
11668/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11669/// operations. It will try to pick the best arrangement of shuffles and
11670/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
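/// As a worked example (illustrative): a v4i32 shuffle with
/// Mask = <0, 4, 2, 6> decomposes into V1Mask = <0, -1, 2, -1>,
/// V2Mask = <-1, 0, -1, 2> and a final blend mask of <0, 5, 2, 7>, i.e. two
/// single-input permutes feeding a blend.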
11671 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11672  const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11673 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11674 int NumElts = Mask.size();
11675 int NumLanes = VT.getSizeInBits() / 128;
11676 int NumEltsPerLane = NumElts / NumLanes;
11677
11678 // Shuffle the input elements into the desired positions in V1 and V2 and
11679 // unpack/blend them together.
11680 bool IsAlternating = true;
11681 bool V1Zero = true, V2Zero = true;
11682 SmallVector<int, 32> V1Mask(NumElts, -1);
11683 SmallVector<int, 32> V2Mask(NumElts, -1);
11684 SmallVector<int, 32> FinalMask(NumElts, -1);
11685 for (int i = 0; i < NumElts; ++i) {
11686 int M = Mask[i];
11687 if (M >= 0 && M < NumElts) {
11688 V1Mask[i] = M;
11689 FinalMask[i] = i;
11690 V1Zero &= Zeroable[i];
11691 IsAlternating &= (i & 1) == 0;
11692 } else if (M >= NumElts) {
11693 V2Mask[i] = M - NumElts;
11694 FinalMask[i] = i + NumElts;
11695 V2Zero &= Zeroable[i];
11696 IsAlternating &= (i & 1) == 1;
11697 }
11698 }
11699
11700  // If we effectively demand only the 0'th element of \p Input (and not just
11701  // in the 0'th position), then broadcast said input
11702  // and change \p InputMask to be a no-op (identity) mask.
11703 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11704 &DAG](SDValue &Input,
11705 MutableArrayRef<int> InputMask) {
11706 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11707 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11708 !X86::mayFoldLoad(Input, Subtarget)))
11709 return;
11710 if (isNoopShuffleMask(InputMask))
11711 return;
11712 assert(isBroadcastShuffleMask(InputMask) &&
11713 "Expected to demand only the 0'th element.");
11714    Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11715    for (auto I : enumerate(InputMask)) {
11716 int &InputMaskElt = I.value();
11717 if (InputMaskElt >= 0)
11718 InputMaskElt = I.index();
11719 }
11720 };
11721
11722 // Currently, we may need to produce one shuffle per input, and blend results.
11723 // It is possible that the shuffle for one of the inputs is already a no-op.
11724 // See if we can simplify non-no-op shuffles into broadcasts,
11725 // which we consider to be strictly better than an arbitrary shuffle.
11726 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11727      isNoopOrBroadcastShuffleMask(V2Mask)) {
11728    canonicalizeBroadcastableInput(V1, V1Mask);
11729 canonicalizeBroadcastableInput(V2, V2Mask);
11730 }
11731
11732 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11733 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11734 // the shuffle may be able to fold with a load or other benefit. However, when
11735 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11736 // pre-shuffle first is a better strategy.
11737 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11738 // If we don't have blends, see if we can create a cheap unpack.
11739 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11740 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11741 is128BitUnpackShuffleMask(V2Mask, DAG)))
11742 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11743 DL, VT, V1, V2, Mask, Subtarget, DAG))
11744 return PermUnpack;
11745
11746 // Only prefer immediate blends to unpack/rotate.
11747 if (SDValue BlendPerm =
11748 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11749 return BlendPerm;
11750
11751 // If either input vector provides only a single element which is repeated
11752 // multiple times, unpacking from both input vectors would generate worse
11753 // code. e.g. for
11754 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11755 // it is better to process t4 first to create a vector of t4[0], then unpack
11756 // that vector with t2.
11757 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11758      !isSingleElementRepeatedMask(V2Mask))
11759    if (SDValue UnpackPerm =
11760 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11761 return UnpackPerm;
11762
11763    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11764            DL, VT, V1, V2, Mask, Subtarget, DAG))
11765 return RotatePerm;
11766
11767 // Unpack/rotate failed - try again with variable blends.
11768 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11769 DAG))
11770 return BlendPerm;
11771
11772 if (VT.getScalarSizeInBits() >= 32)
11773 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11774 DL, VT, V1, V2, Mask, Subtarget, DAG))
11775 return PermUnpack;
11776 }
11777
11778 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11779 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11780 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11781 // than half the elements coming from each source.
11782 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11783 V1Mask.assign(NumElts, -1);
11784 V2Mask.assign(NumElts, -1);
11785 FinalMask.assign(NumElts, -1);
11786 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11787 for (int j = 0; j != NumEltsPerLane; ++j) {
11788 int M = Mask[i + j];
11789 if (M >= 0 && M < NumElts) {
11790 V1Mask[i + (j / 2)] = M;
11791 FinalMask[i + j] = i + (j / 2);
11792 } else if (M >= NumElts) {
11793 V2Mask[i + (j / 2)] = M - NumElts;
11794 FinalMask[i + j] = i + (j / 2) + NumElts;
11795 }
11796 }
11797 }
11798
11799 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11801 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11802}
11803
11804static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11805 const X86Subtarget &Subtarget,
11806 ArrayRef<int> Mask) {
11807 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11808 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11809
11810 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11811 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11812 int MaxSubElts = 64 / EltSizeInBits;
11813 unsigned RotateAmt, NumSubElts;
11814 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11815 MaxSubElts, NumSubElts, RotateAmt))
11816 return -1;
11817 unsigned NumElts = Mask.size();
11818 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11819 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11820 return RotateAmt;
11821}
11822
11823/// Lower shuffle using X86ISD::VROTLI rotations.
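/// As a worked example (illustrative): a v16i8 mask of
/// <1, 0, 3, 2, ..., 15, 14> matches as an 8-bit rotation of v8i16 groups on
/// targets that allow 16-bit rotation groups (e.g. XOP); AVX512 alone limits
/// the group size to 32/64 bits and so does not match it.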
11824 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11825  ArrayRef<int> Mask,
11826 const X86Subtarget &Subtarget,
11827 SelectionDAG &DAG) {
11828 // Only XOP + AVX512 targets have bit rotation instructions.
11829 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11830 bool IsLegal =
11831 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11832 if (!IsLegal && Subtarget.hasSSE3())
11833 return SDValue();
11834
11835 MVT RotateVT;
11836 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11837 Subtarget, Mask);
11838 if (RotateAmt < 0)
11839 return SDValue();
11840
11841 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11842 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11843  // widen to vXi16 or more then the existing lowering will be better.
11844 if (!IsLegal) {
11845 if ((RotateAmt % 16) == 0)
11846 return SDValue();
11847 // TODO: Use getTargetVShiftByConstNode.
11848 unsigned ShlAmt = RotateAmt;
11849 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11850 V1 = DAG.getBitcast(RotateVT, V1);
11851 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11852 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11853 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11854 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11855 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11856 return DAG.getBitcast(VT, Rot);
11857 }
11858
11859 SDValue Rot =
11860 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11861 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11862 return DAG.getBitcast(VT, Rot);
11863}
11864
11865/// Try to match a vector shuffle as an element rotation.
11866///
11867 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11868 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11869  ArrayRef<int> Mask) {
11870 int NumElts = Mask.size();
11871
11872 // We need to detect various ways of spelling a rotation:
11873 // [11, 12, 13, 14, 15, 0, 1, 2]
11874 // [-1, 12, 13, 14, -1, -1, 1, -1]
11875 // [-1, -1, -1, -1, -1, -1, 1, 2]
11876 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11877 // [-1, 4, 5, 6, -1, -1, 9, -1]
11878 // [-1, 4, 5, 6, -1, -1, -1, -1]
11879 int Rotation = 0;
11880 SDValue Lo, Hi;
11881 for (int i = 0; i < NumElts; ++i) {
11882 int M = Mask[i];
11883 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11884 "Unexpected mask index.");
11885 if (M < 0)
11886 continue;
11887
11888 // Determine where a rotated vector would have started.
11889 int StartIdx = i - (M % NumElts);
11890 if (StartIdx == 0)
11891 // The identity rotation isn't interesting, stop.
11892 return -1;
11893
11894 // If we found the tail of a vector the rotation must be the missing
11895 // front. If we found the head of a vector, it must be how much of the
11896 // head.
11897 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11898
11899 if (Rotation == 0)
11900 Rotation = CandidateRotation;
11901 else if (Rotation != CandidateRotation)
11902 // The rotations don't match, so we can't match this mask.
11903 return -1;
11904
11905 // Compute which value this mask is pointing at.
11906 SDValue MaskV = M < NumElts ? V1 : V2;
11907
11908 // Compute which of the two target values this index should be assigned
11909 // to. This reflects whether the high elements are remaining or the low
11910 // elements are remaining.
11911 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11912
11913 // Either set up this value if we've not encountered it before, or check
11914 // that it remains consistent.
11915 if (!TargetV)
11916 TargetV = MaskV;
11917 else if (TargetV != MaskV)
11918 // This may be a rotation, but it pulls from the inputs in some
11919 // unsupported interleaving.
11920 return -1;
11921 }
11922
11923 // Check that we successfully analyzed the mask, and normalize the results.
11924 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11925 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11926 if (!Lo)
11927 Lo = Hi;
11928 else if (!Hi)
11929 Hi = Lo;
11930
11931 V1 = Lo;
11932 V2 = Hi;
11933
11934 return Rotation;
11935}
11936
11937/// Try to lower a vector shuffle as a byte rotation.
11938///
11939/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11940/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11941/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11942 /// try to generically lower a vector shuffle through such a pattern. It
11943/// does not check for the profitability of lowering either as PALIGNR or
11944/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11945/// This matches shuffle vectors that look like:
11946///
11947/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11948///
11949/// Essentially it concatenates V1 and V2, shifts right by some number of
11950/// elements, and takes the low elements as the result. Note that while this is
11951/// specified as a *right shift* because x86 is little-endian, it is a *left
11952/// rotate* of the vector lanes.
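/// As a worked example (illustrative): for the v8i16 mask above the element
/// rotation is 3, which this routine scales by the 2-byte element size to a
/// PALIGNR byte rotation of 6.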
11953 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11954  ArrayRef<int> Mask) {
11955 // Don't accept any shuffles with zero elements.
11956 if (isAnyZero(Mask))
11957 return -1;
11958
11959 // PALIGNR works on 128-bit lanes.
11960 SmallVector<int, 16> RepeatedMask;
11961 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11962 return -1;
11963
11964 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11965 if (Rotation <= 0)
11966 return -1;
11967
11968 // PALIGNR rotates bytes, so we need to scale the
11969 // rotation based on how many bytes are in the vector lane.
11970 int NumElts = RepeatedMask.size();
11971 int Scale = 16 / NumElts;
11972 return Rotation * Scale;
11973}
11974
11975 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11976  SDValue V2, ArrayRef<int> Mask,
11977 const X86Subtarget &Subtarget,
11978 SelectionDAG &DAG) {
11979 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11980
11981 SDValue Lo = V1, Hi = V2;
11982 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11983 if (ByteRotation <= 0)
11984 return SDValue();
11985
11986 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11987 // PSLLDQ/PSRLDQ.
11988 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11989 Lo = DAG.getBitcast(ByteVT, Lo);
11990 Hi = DAG.getBitcast(ByteVT, Hi);
11991
11992 // SSSE3 targets can use the palignr instruction.
11993 if (Subtarget.hasSSSE3()) {
11994 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11995 "512-bit PALIGNR requires BWI instructions");
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11998 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11999 }
12000
12001 assert(VT.is128BitVector() &&
12002 "Rotate-based lowering only supports 128-bit lowering!");
12003 assert(Mask.size() <= 16 &&
12004 "Can shuffle at most 16 bytes in a 128-bit vector!");
12005 assert(ByteVT == MVT::v16i8 &&
12006 "SSE2 rotate lowering only needed for v16i8!");
12007
12008 // Default SSE2 implementation
12009 int LoByteShift = 16 - ByteRotation;
12010 int HiByteShift = ByteRotation;
12011
12012 SDValue LoShift =
12013 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12014 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12015 SDValue HiShift =
12016 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12017 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12018 return DAG.getBitcast(VT,
12019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12020}
12021
12022/// Try to lower a vector shuffle as a dword/qword rotation.
12023///
12024/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
12025/// rotation of the concatenation of two vectors; This routine will
12026 /// try to generically lower a vector shuffle through such a pattern.
12027///
12028/// Essentially it concatenates V1 and V2, shifts right by some number of
12029/// elements, and takes the low elements as the result. Note that while this is
12030/// specified as a *right shift* because x86 is little-endian, it is a *left
12031/// rotate* of the vector lanes.
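/// As a worked example (illustrative): a v8i32 shuffle with
/// Mask = <3, 4, 5, 6, 7, 8, 9, 10> matches an element rotation of 3 and, given
/// the required AVX512/VLX features, lowers to a single VALIGND with
/// immediate 3.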
12032 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12033  SDValue V2, ArrayRef<int> Mask,
12034 const APInt &Zeroable,
12035 const X86Subtarget &Subtarget,
12036 SelectionDAG &DAG) {
12037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12038 "Only 32-bit and 64-bit elements are supported!");
12039
12040 // 128/256-bit vectors are only supported with VLX.
12041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12042 && "VLX required for 128/256-bit vectors");
12043
12044 SDValue Lo = V1, Hi = V2;
12045 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12046 if (0 < Rotation)
12047 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12048 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12049
12050 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12051 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12052 // TODO: We can probably make this more aggressive and use shift-pairs like
12053 // lowerShuffleAsByteShiftMask.
12054 unsigned NumElts = Mask.size();
12055 unsigned ZeroLo = Zeroable.countr_one();
12056 unsigned ZeroHi = Zeroable.countl_one();
12057 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12058 if (!ZeroLo && !ZeroHi)
12059 return SDValue();
12060
12061 if (ZeroLo) {
12062 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12063 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12064 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12065 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12066 getZeroVector(VT, Subtarget, DAG, DL),
12067 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12068 }
12069
12070 if (ZeroHi) {
12071 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12072 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12073 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12074 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12075 getZeroVector(VT, Subtarget, DAG, DL), Src,
12076 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12077 }
12078
12079 return SDValue();
12080}
12081
12082/// Try to lower a vector shuffle as a byte shift sequence.
12083 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12084  SDValue V2, ArrayRef<int> Mask,
12085 const APInt &Zeroable,
12086 const X86Subtarget &Subtarget,
12087 SelectionDAG &DAG) {
12088 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12089 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12090
12091 // We need a shuffle that has zeros at one/both ends and a sequential
12092 // shuffle from one source within.
12093 unsigned ZeroLo = Zeroable.countr_one();
12094 unsigned ZeroHi = Zeroable.countl_one();
12095 if (!ZeroLo && !ZeroHi)
12096 return SDValue();
12097
12098 unsigned NumElts = Mask.size();
12099 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12100 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12101 return SDValue();
12102
12103 unsigned Scale = VT.getScalarSizeInBits() / 8;
12104 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12105 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12106 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12107 return SDValue();
12108
12109 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12110 Res = DAG.getBitcast(MVT::v16i8, Res);
12111
12112 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12113 // inner sequential set of elements, possibly offset:
12114 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12115 // 01234567 --> 4567zzzz --> zzzzz456
12116 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12117 if (ZeroLo == 0) {
12118 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12119 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12120 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12123 } else if (ZeroHi == 0) {
12124 unsigned Shift = Mask[ZeroLo] % NumElts;
12125 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12129 } else if (!Subtarget.hasSSSE3()) {
12130    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12131 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12132 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12133 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12134 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Shift += Mask[ZeroLo] % NumElts;
12137 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12138 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12139 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12140 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12141 } else
12142 return SDValue();
12143
12144 return DAG.getBitcast(VT, Res);
12145}
12146
12147/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12148///
12149/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12150/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12151/// matches elements from one of the input vectors shuffled to the left or
12152/// right with zeroable elements 'shifted in'. It handles both the strictly
12153/// bit-wise element shifts and the byte shift across an entire 128-bit double
12154/// quad word lane.
12155///
12156 /// PSLL : (little-endian) left bit shift.
12157/// [ zz, 0, zz, 2 ]
12158/// [ -1, 4, zz, -1 ]
12159/// PSRL : (little-endian) right bit shift.
12160/// [ 1, zz, 3, zz]
12161/// [ -1, -1, 7, zz]
12162/// PSLLDQ : (little-endian) left byte shift
12163/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12164/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12165/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12166/// PSRLDQ : (little-endian) right byte shift
12167/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12168/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12169/// [ 1, 2, -1, -1, -1, -1, zz, zz]
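/// As a worked example (illustrative): the first PSLL mask above, taken as
/// v4i32, matches with Scale = 2 and Shift = 1, i.e. the vector is
/// reinterpreted as v2i64 and shifted left by 32 bits with VSHLI.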
12170static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12171 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12172 int MaskOffset, const APInt &Zeroable,
12173 const X86Subtarget &Subtarget) {
12174 int Size = Mask.size();
12175 unsigned SizeInBits = Size * ScalarSizeInBits;
12176
12177 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12178 for (int i = 0; i < Size; i += Scale)
12179 for (int j = 0; j < Shift; ++j)
12180 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12181 return false;
12182
12183 return true;
12184 };
12185
12186 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12187 for (int i = 0; i != Size; i += Scale) {
12188 unsigned Pos = Left ? i + Shift : i;
12189 unsigned Low = Left ? i : i + Shift;
12190 unsigned Len = Scale - Shift;
12191 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12192 return -1;
12193 }
12194
12195 int ShiftEltBits = ScalarSizeInBits * Scale;
12196 bool ByteShift = ShiftEltBits > 64;
12197 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12198 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12199 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12200
12201 // Normalize the scale for byte shifts to still produce an i64 element
12202 // type.
12203 Scale = ByteShift ? Scale / 2 : Scale;
12204
12205 // We need to round trip through the appropriate type for the shift.
12206 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12207 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12208 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12209 return (int)ShiftAmt;
12210 };
12211
12212 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12213 // keep doubling the size of the integer elements up to that. We can
12214 // then shift the elements of the integer vector by whole multiples of
12215 // their width within the elements of the larger integer vector. Test each
12216 // multiple to see if we can find a match with the moved element indices
12217 // and that the shifted in elements are all zeroable.
12218 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12219 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12220 for (int Shift = 1; Shift != Scale; ++Shift)
12221 for (bool Left : {true, false})
12222 if (CheckZeros(Shift, Scale, Left)) {
12223 int ShiftAmt = MatchShift(Shift, Scale, Left);
12224 if (0 < ShiftAmt)
12225 return ShiftAmt;
12226 }
12227
12228 // no match
12229 return -1;
12230}
12231
12232 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12233  SDValue V2, ArrayRef<int> Mask,
12234 const APInt &Zeroable,
12235 const X86Subtarget &Subtarget,
12236 SelectionDAG &DAG, bool BitwiseOnly) {
12237 int Size = Mask.size();
12238 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12239
12240 MVT ShiftVT;
12241 SDValue V = V1;
12242 unsigned Opcode;
12243
12244 // Try to match shuffle against V1 shift.
12245 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12246 Mask, 0, Zeroable, Subtarget);
12247
12248 // If V1 failed, try to match shuffle against V2 shift.
12249 if (ShiftAmt < 0) {
12250 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12251 Mask, Size, Zeroable, Subtarget);
12252 V = V2;
12253 }
12254
12255 if (ShiftAmt < 0)
12256 return SDValue();
12257
12258 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12259 return SDValue();
12260
12261 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12262 "Illegal integer vector type");
12263 V = DAG.getBitcast(ShiftVT, V);
12264 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12265 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12266 return DAG.getBitcast(VT, V);
12267}
12268
12269// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12270// Remainder of lower half result is zero and upper half is all undef.
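// As a worked example (illustrative): a v8i16 mask of
// <2, 3, z, z, u, u, u, u> (z = zeroable, u = undef) matches with
// BitLen = 32 and BitIdx = 32, i.e. extract 32 bits of V1 starting at bit 32.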
12271static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12272 ArrayRef<int> Mask, uint64_t &BitLen,
12273 uint64_t &BitIdx, const APInt &Zeroable) {
12274 int Size = Mask.size();
12275 int HalfSize = Size / 2;
12276 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12277 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12278
12279 // Upper half must be undefined.
12280 if (!isUndefUpperHalf(Mask))
12281 return false;
12282
12283 // Determine the extraction length from the part of the
12284 // lower half that isn't zeroable.
12285 int Len = HalfSize;
12286 for (; Len > 0; --Len)
12287 if (!Zeroable[Len - 1])
12288 break;
12289 assert(Len > 0 && "Zeroable shuffle mask");
12290
12291 // Attempt to match first Len sequential elements from the lower half.
12292 SDValue Src;
12293 int Idx = -1;
12294 for (int i = 0; i != Len; ++i) {
12295 int M = Mask[i];
12296 if (M == SM_SentinelUndef)
12297 continue;
12298 SDValue &V = (M < Size ? V1 : V2);
12299 M = M % Size;
12300
12301 // The extracted elements must start at a valid index and all mask
12302 // elements must be in the lower half.
12303 if (i > M || M >= HalfSize)
12304 return false;
12305
12306 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12307 Src = V;
12308 Idx = M - i;
12309 continue;
12310 }
12311 return false;
12312 }
12313
12314 if (!Src || Idx < 0)
12315 return false;
12316
12317 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12318 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12319 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12320 V1 = Src;
12321 return true;
12322}
12323
12324// INSERTQ: Extract lowest Len elements from lower half of second source and
12325// insert over first source, starting at Idx.
12326// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12327static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12328 ArrayRef<int> Mask, uint64_t &BitLen,
12329 uint64_t &BitIdx) {
12330 int Size = Mask.size();
12331 int HalfSize = Size / 2;
12332 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12333
12334 // Upper half must be undefined.
12335 if (!isUndefUpperHalf(Mask))
12336 return false;
12337
12338 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12339 SDValue Base;
12340
12341 // Attempt to match first source from mask before insertion point.
12342 if (isUndefInRange(Mask, 0, Idx)) {
12343 /* EMPTY */
12344 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12345 Base = V1;
12346 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12347 Base = V2;
12348 } else {
12349 continue;
12350 }
12351
12352 // Extend the extraction length looking to match both the insertion of
12353 // the second source and the remaining elements of the first.
12354 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12355 SDValue Insert;
12356 int Len = Hi - Idx;
12357
12358 // Match insertion.
12359 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12360 Insert = V1;
12361 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12362 Insert = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 // Match the remaining elements of the lower half.
12368 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12369 /* EMPTY */
12370 } else if ((!Base || (Base == V1)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12372 Base = V1;
12373 } else if ((!Base || (Base == V2)) &&
12374 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12375 Size + Hi)) {
12376 Base = V2;
12377 } else {
12378 continue;
12379 }
12380
12381 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12382 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12383 V1 = Base;
12384 V2 = Insert;
12385 return true;
12386 }
12387 }
12388
12389 return false;
12390}
12391
12392/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12393 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12394  SDValue V2, ArrayRef<int> Mask,
12395 const APInt &Zeroable, SelectionDAG &DAG) {
12396 uint64_t BitLen, BitIdx;
12397 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12398 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12399 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12400 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12401
12402 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12403 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12404 V2 ? V2 : DAG.getUNDEF(VT),
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 return SDValue();
12409}
12410
12411/// Lower a vector shuffle as an any/signed/zero extension.
12412///
12413/// Given a specific number of elements, element bit width, and extension
12414 /// stride, produce an any, sign or zero extension based on the available
12415 /// features of the subtarget. The extended elements are consecutive and
12416 /// can begin at an offsetted element index in the input; to
12417 /// avoid excess shuffling the offset must either be in the bottom lane
12418/// or at the start of a higher lane. All extended elements must be from
12419/// the same lane.
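/// As a worked example (illustrative): for v16i8 with Scale = 4 and
/// Offset = 0, a mask of the form <0, z, z, z, 1, z, z, z, ..., 3, z, z, z>
/// (z = zeroable) zero-extends the low four bytes to i32 elements, which
/// SSE4.1 targets can emit as a single PMOVZXBD.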
12420 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12421  int Scale, int Offset,
12422 unsigned ExtOpc, SDValue InputV,
12423 ArrayRef<int> Mask,
12424 const X86Subtarget &Subtarget,
12425 SelectionDAG &DAG) {
12426 assert(Scale > 1 && "Need a scale to extend.");
12427 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12428 int EltBits = VT.getScalarSizeInBits();
12429 int NumElements = VT.getVectorNumElements();
12430 int NumEltsPerLane = 128 / EltBits;
12431 int OffsetLane = Offset / NumEltsPerLane;
12432 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12433 "Only 8, 16, and 32 bit elements can be extended.");
12434 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12435 assert(0 <= Offset && "Extension offset must be positive.");
12436 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12437 "Extension offset must be in the first lane or start an upper lane.");
12438
12439 // Check that an index is in same lane as the base offset.
12440 auto SafeOffset = [&](int Idx) {
12441 return OffsetLane == (Idx / NumEltsPerLane);
12442 };
12443
12444 // Shift along an input so that the offset base moves to the first element.
12445 auto ShuffleOffset = [&](SDValue V) {
12446 if (!Offset)
12447 return V;
12448
12449 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12450 for (int i = 0; i * Scale < NumElements; ++i) {
12451 int SrcIdx = i + Offset;
12452 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12453 }
12454 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12455 };
12456
12457 // Found a valid a/zext mask! Try various lowering strategies based on the
12458 // input type and available ISA extensions.
12459 if (Subtarget.hasSSE41()) {
12460 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12461 // PUNPCK will catch this in a later shuffle match.
12462 if (Offset && Scale == 2 && VT.is128BitVector())
12463 return SDValue();
12464 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12465 NumElements / Scale);
12466 InputV = DAG.getBitcast(VT, InputV);
12467 InputV = ShuffleOffset(InputV);
12468 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12469 return DAG.getBitcast(VT, InputV);
12470 }
12471
12472 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12473 InputV = DAG.getBitcast(VT, InputV);
12474 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12475
12476 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12477 if (ExtOpc == ISD::SIGN_EXTEND)
12478 return SDValue();
12479
12480 // For any extends we can cheat for larger element sizes and use shuffle
12481 // instructions that can fold with a load and/or copy.
12482 if (AnyExt && EltBits == 32) {
12483 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12484 -1};
12485 return DAG.getBitcast(
12486 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12487 DAG.getBitcast(MVT::v4i32, InputV),
12488 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12489 }
12490 if (AnyExt && EltBits == 16 && Scale > 2) {
12491 int PSHUFDMask[4] = {Offset / 2, -1,
12492 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12493 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12494 DAG.getBitcast(MVT::v4i32, InputV),
12495 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12496 int PSHUFWMask[4] = {1, -1, -1, -1};
12497 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12498 return DAG.getBitcast(
12499 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12500 DAG.getBitcast(MVT::v8i16, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12502 }
12503
12504 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12505 // to 64-bits.
12506 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12507 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12508 assert(VT.is128BitVector() && "Unexpected vector width!");
12509
12510 int LoIdx = Offset * EltBits;
12511 SDValue Lo = DAG.getBitcast(
12512 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12513 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12514 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12515
12516 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12517 return DAG.getBitcast(VT, Lo);
12518
12519 int HiIdx = (Offset + 1) * EltBits;
12520 SDValue Hi = DAG.getBitcast(
12521 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12522 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12523 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12524 return DAG.getBitcast(VT,
12525 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12526 }
12527
12528 // If this would require more than 2 unpack instructions to expand, use
12529 // pshufb when available. We can only use more than 2 unpack instructions
12530 // when zero extending i8 elements which also makes it easier to use pshufb.
12531 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12532 assert(NumElements == 16 && "Unexpected byte vector width!");
12533 SDValue PSHUFBMask[16];
12534 for (int i = 0; i < 16; ++i) {
12535 int Idx = Offset + (i / Scale);
12536 if ((i % Scale == 0 && SafeOffset(Idx))) {
12537 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12538 continue;
12539 }
12540 PSHUFBMask[i] =
12541 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12542 }
12543 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12544 return DAG.getBitcast(
12545 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12546 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12547 }
12548
12549 // If we are extending from an offset, ensure we start on a boundary that
12550 // we can unpack from.
12551 int AlignToUnpack = Offset % (NumElements / Scale);
12552 if (AlignToUnpack) {
12553 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12554 for (int i = AlignToUnpack; i < NumElements; ++i)
12555 ShMask[i - AlignToUnpack] = i;
12556 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12557 Offset -= AlignToUnpack;
12558 }
12559
12560 // Otherwise emit a sequence of unpacks.
12561 do {
12562 unsigned UnpackLoHi = X86ISD::UNPCKL;
12563 if (Offset >= (NumElements / 2)) {
12564 UnpackLoHi = X86ISD::UNPCKH;
12565 Offset -= (NumElements / 2);
12566 }
12567
12568 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12569 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12570 : getZeroVector(InputVT, Subtarget, DAG, DL);
12571 InputV = DAG.getBitcast(InputVT, InputV);
12572 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12573 Scale /= 2;
12574 EltBits *= 2;
12575 NumElements /= 2;
12576 } while (Scale > 1);
12577 return DAG.getBitcast(VT, InputV);
12578}
12579
12580/// Try to lower a vector shuffle as a zero extension on any microarch.
12581///
12582/// This routine will try to do everything in its power to cleverly lower
12583/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12584/// check for the profitability of this lowering, it tries to aggressively
12585/// match this pattern. It will use all of the micro-architectural details it
12586/// can to emit an efficient lowering. It handles both blends with all-zero
12587/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12588/// masking out later).
12589///
12590/// The reason we have dedicated lowering for zext-style shuffles is that they
12591/// are both incredibly common and often quite performance sensitive.
12592 static SDValue lowerShuffleAsZeroOrAnyExtend(
12593  const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12594 const APInt &Zeroable, const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 int Bits = VT.getSizeInBits();
12597 int NumLanes = Bits / 128;
12598 int NumElements = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElements / NumLanes;
12600 assert(VT.getScalarSizeInBits() <= 32 &&
12601 "Exceeds 32-bit integer zero extension limit");
12602 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12603
12604 // Define a helper function to check a particular ext-scale and lower to it if
12605 // valid.
12606 auto Lower = [&](int Scale) -> SDValue {
12607 SDValue InputV;
12608 bool AnyExt = true;
12609 int Offset = 0;
12610 int Matches = 0;
12611 for (int i = 0; i < NumElements; ++i) {
12612 int M = Mask[i];
12613 if (M < 0)
12614 continue; // Valid anywhere but doesn't tell us anything.
12615 if (i % Scale != 0) {
12616 // Each of the extended elements need to be zeroable.
12617 if (!Zeroable[i])
12618 return SDValue();
12619
12620 // We no longer are in the anyext case.
12621 AnyExt = false;
12622 continue;
12623 }
12624
12625 // Each of the base elements needs to be consecutive indices into the
12626 // same input vector.
12627 SDValue V = M < NumElements ? V1 : V2;
12628 M = M % NumElements;
12629 if (!InputV) {
12630 InputV = V;
12631 Offset = M - (i / Scale);
12632 } else if (InputV != V)
12633 return SDValue(); // Flip-flopping inputs.
12634
12635 // Offset must start in the lowest 128-bit lane or at the start of an
12636 // upper lane.
12637 // FIXME: Is it ever worth allowing a negative base offset?
12638 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12639 (Offset % NumEltsPerLane) == 0))
12640 return SDValue();
12641
12642 // If we are offsetting, all referenced entries must come from the same
12643 // lane.
12644 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12645 return SDValue();
12646
12647 if ((M % NumElements) != (Offset + (i / Scale)))
12648 return SDValue(); // Non-consecutive strided elements.
12649 Matches++;
12650 }
12651
12652 // If we fail to find an input, we have a zero-shuffle which should always
12653 // have already been handled.
12654 // FIXME: Maybe handle this here in case during blending we end up with one?
12655 if (!InputV)
12656 return SDValue();
12657
12658 // If we are offsetting, don't extend if we only match a single input, we
12659 // can always do better by using a basic PSHUF or PUNPCK.
12660 if (Offset != 0 && Matches < 2)
12661 return SDValue();
12662
12663 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12664 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12665 InputV, Mask, Subtarget, DAG);
12666 };
12667
12668 // The widest scale possible for extending is to a 64-bit integer.
12669 assert(Bits % 64 == 0 &&
12670 "The number of bits in a vector must be divisible by 64 on x86!");
12671 int NumExtElements = Bits / 64;
12672
12673 // Each iteration, try extending the elements half as much, but into twice as
12674 // many elements.
12675 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12676 assert(NumElements % NumExtElements == 0 &&
12677 "The input vector size must be divisible by the extended size.");
12678 if (SDValue V = Lower(NumElements / NumExtElements))
12679 return V;
12680 }
12681
12682 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12683 if (Bits != 128)
12684 return SDValue();
12685
12686 // Returns one of the source operands if the shuffle can be reduced to a
12687 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12688 auto CanZExtLowHalf = [&]() {
12689 for (int i = NumElements / 2; i != NumElements; ++i)
12690 if (!Zeroable[i])
12691 return SDValue();
12692 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12693 return V1;
12694 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12695 return V2;
12696 return SDValue();
12697 };
12698
12699 if (SDValue V = CanZExtLowHalf()) {
12700 V = DAG.getBitcast(MVT::v2i64, V);
12701 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12702 return DAG.getBitcast(VT, V);
12703 }
12704
12705 // No viable ext lowering found.
12706 return SDValue();
12707}
12708
12709/// Try to get a scalar value for a specific element of a vector.
12710///
12711/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12712 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12713  SelectionDAG &DAG) {
12714 MVT VT = V.getSimpleValueType();
12715 MVT EltVT = VT.getVectorElementType();
12716 V = peekThroughBitcasts(V);
12717
12718 // If the bitcasts shift the element size, we can't extract an equivalent
12719 // element from it.
12720 MVT NewVT = V.getSimpleValueType();
12721 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12722 return SDValue();
12723
12724 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12725 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12726 // Ensure the scalar operand is the same size as the destination.
12727 // FIXME: Add support for scalar truncation where possible.
12728 SDValue S = V.getOperand(Idx);
12729 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12730 return DAG.getBitcast(EltVT, S);
12731 }
12732
12733 return SDValue();
12734}
12735
12736/// Helper to test for a load that can be folded with x86 shuffles.
12737///
12738/// This is particularly important because the set of instructions varies
12739/// significantly based on whether the operand is a load or not.
12740 static bool isShuffleFoldableLoad(SDValue V) {
12741  return V.hasOneUse() &&
12742         ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12743 }
12744
12745template<typename T>
12746static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12747 T EltVT = VT.getScalarType();
12748 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12749 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12750}
12751
12752/// Try to lower insertion of a single element into a zero vector.
12753///
12754 /// This is a common pattern for which we have especially efficient lowerings
12755/// across all subtarget feature sets.
12756 static SDValue lowerShuffleAsElementInsertion(
12757  const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12758 const APInt &Zeroable, const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 MVT ExtVT = VT;
12761 MVT EltVT = VT.getVectorElementType();
12762 unsigned NumElts = VT.getVectorNumElements();
12763 unsigned EltBits = VT.getScalarSizeInBits();
12764
12765 if (isSoftF16(EltVT, Subtarget))
12766 return SDValue();
12767
12768 int V2Index =
12769 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12770 Mask.begin();
12771 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12772 bool IsV1Zeroable = true;
12773 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12774 if (i != V2Index && !Zeroable[i]) {
12775 IsV1Zeroable = false;
12776 break;
12777 }
12778
12779 // Bail if a non-zero V1 isn't used in place.
12780 if (!IsV1Zeroable) {
12781 SmallVector<int, 8> V1Mask(Mask);
12782 V1Mask[V2Index] = -1;
12783 if (!isNoopShuffleMask(V1Mask))
12784 return SDValue();
12785 }
12786
12787 // Check for a single input from a SCALAR_TO_VECTOR node.
12788 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12789 // all the smarts here sunk into that routine. However, the current
12790 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12791 // vector shuffle lowering is dead.
12792 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12793 DAG);
12794 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12795 // We need to zext the scalar if it is smaller than an i32.
12796 V2S = DAG.getBitcast(EltVT, V2S);
12797 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12798 // Using zext to expand a narrow element won't work for non-zero
12799 // insertions. But we can use a masked constant vector if we're
12800 // inserting V2 into the bottom of V1.
12801 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12802 return SDValue();
12803
12804 // Zero-extend directly to i32.
12805 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12806 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12807
12808 // If we're inserting into a constant, mask off the inserted index
12809 // and OR with the zero-extended scalar.
12810 if (!IsV1Zeroable) {
12811 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12812 Bits[V2Index] = APInt::getZero(EltBits);
12813 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12814 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12815 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12816 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12817 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12818 }
12819 }
12820 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12821 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12822 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12823 // Either not inserting from the low element of the input or the input
12824 // element size is too small to use VZEXT_MOVL to clear the high bits.
12825 return SDValue();
12826 }
12827
12828 if (!IsV1Zeroable) {
12829 // If V1 can't be treated as a zero vector we have fewer options to lower
12830 // this. We can't support integer vectors or non-zero targets cheaply.
12831 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12832 if (!VT.isFloatingPoint() || V2Index != 0)
12833 return SDValue();
12834 if (!VT.is128BitVector())
12835 return SDValue();
12836
12837 // Otherwise, use MOVSD, MOVSS or MOVSH.
12838 unsigned MovOpc = 0;
12839 if (EltVT == MVT::f16)
12840 MovOpc = X86ISD::MOVSH;
12841 else if (EltVT == MVT::f32)
12842 MovOpc = X86ISD::MOVSS;
12843 else if (EltVT == MVT::f64)
12844 MovOpc = X86ISD::MOVSD;
12845 else
12846 llvm_unreachable("Unsupported floating point element type to handle!");
12847 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12848 }
12849
12850 // This lowering only works for the low element with floating point vectors.
12851 if (VT.isFloatingPoint() && V2Index != 0)
12852 return SDValue();
12853
12854 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12855 if (ExtVT != VT)
12856 V2 = DAG.getBitcast(VT, V2);
12857
12858 if (V2Index != 0) {
12859 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12860 // the desired position. Otherwise it is more efficient to do a vector
12861 // shift left. We know that we can do a vector shift left because all
12862 // the inputs are zero.
12863 if (VT.isFloatingPoint() || NumElts <= 4) {
12864 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12865 V2Shuffle[V2Index] = 0;
12866 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12867 } else {
12868 V2 = DAG.getBitcast(MVT::v16i8, V2);
12869 V2 = DAG.getNode(
12870 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12871 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
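// For example, inserting into lane 5 of a v8i16 shifts the zero-extended
// element left by 5 * 16 / 8 == 10 bytes (a PSLLDQ by 10).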
12872 V2 = DAG.getBitcast(VT, V2);
12873 }
12874 }
12875 return V2;
12876}
12877
12878/// Try to lower broadcast of a single - truncated - integer element,
12879/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12880///
12881/// This assumes we have AVX2.
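/// For example, to broadcast i16 element 3 of a v8i16 value that was built as
/// a v4i32 build_vector, take scalar operand 1 of the build_vector, shift it
/// right by 16, truncate to i16, and broadcast that scalar (VPBROADCASTW).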
12882 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12883 int BroadcastIdx,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(Subtarget.hasAVX2() &&
12887 "We can only lower integer broadcasts with AVX2!");
12888
12889 MVT EltVT = VT.getVectorElementType();
12890 MVT V0VT = V0.getSimpleValueType();
12891
12892 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12893 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12894
12895 MVT V0EltVT = V0VT.getVectorElementType();
12896 if (!V0EltVT.isInteger())
12897 return SDValue();
12898
12899 const unsigned EltSize = EltVT.getSizeInBits();
12900 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12901
12902 // This is only a truncation if the original element type is larger.
12903 if (V0EltSize <= EltSize)
12904 return SDValue();
12905
12906 assert(((V0EltSize % EltSize) == 0) &&
12907 "Scalar type sizes must all be powers of 2 on x86!");
12908
12909 const unsigned V0Opc = V0.getOpcode();
12910 const unsigned Scale = V0EltSize / EltSize;
12911 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12912
12913 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12914 V0Opc != ISD::BUILD_VECTOR)
12915 return SDValue();
12916
12917 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12918
12919 // If we're extracting non-least-significant bits, shift so we can truncate.
12920 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12921 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12922 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12923 if (const int OffsetIdx = BroadcastIdx % Scale)
12924 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12925 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12926
12927 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12928 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12929}
12930
12931/// Test whether this can be lowered with a single SHUFPS instruction.
12932///
12933/// This is used to disable more specialized lowerings when the shufps lowering
12934/// will happen to be efficient.
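/// For example, <0, 3, 4, 6> is a single SHUFPS (low half only from V1, high
/// half only from V2), while <0, 4, 1, 5> is not, since its low half needs an
/// element from each input.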
12935 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12936 // This routine only handles 128-bit shufps.
12937 assert(Mask.size() == 4 && "Unsupported mask size!");
12938 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12939 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12940 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12941 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12942
12943 // To lower with a single SHUFPS we need to have the low half and high half
12944 // each requiring a single input.
12945 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12946 return false;
12947 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12948 return false;
12949
12950 return true;
12951}
12952
12953/// Test whether the specified input (0 or 1) is in-place blended by the
12954/// given mask.
12955///
12956/// This returns true if the elements from a particular input are already in the
12957/// slot required by the given mask and require no permutation.
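/// For example, with two v4 inputs, input 1 is in place for the mask
/// <0, 5, 2, 7>: its elements 1 and 3 already sit in result lanes 1 and 3.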
12958 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12963 return false;
12964
12965 return true;
12966}
12967
12968/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12969/// the given mask.
12970///
12971 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12972 int BroadcastableElement = 0) {
12973 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12974 int Size = Mask.size();
12975 for (int i = 0; i < Size; ++i)
12976 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12977 Mask[i] % Size != BroadcastableElement)
12978 return false;
12979 return true;
12980}
12981
12982/// If we are extracting two 128-bit halves of a vector and shuffling the
12983/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12984/// multi-shuffle lowering.
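/// For example:
///   shuffle (extract_subvector X, 0), (extract_subvector X, 4), <0, 5, 2, 7>
/// becomes a single 8-element shuffle of X followed by a free ymm->xmm
/// subvector extract.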
12985 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12986 SDValue N1, ArrayRef<int> Mask,
12987 SelectionDAG &DAG) {
12988 MVT VT = N0.getSimpleValueType();
12989 assert((VT.is128BitVector() &&
12990 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12991 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12992
12993 // Check that both sources are extracts of the same source vector.
12994 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12995 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12996 N0.getOperand(0) != N1.getOperand(0) ||
12997 !N0.hasOneUse() || !N1.hasOneUse())
12998 return SDValue();
12999
13000 SDValue WideVec = N0.getOperand(0);
13001 MVT WideVT = WideVec.getSimpleValueType();
13002 if (!WideVT.is256BitVector())
13003 return SDValue();
13004
13005 // Match extracts of each half of the wide source vector. Commute the shuffle
13006 // if the extract of the low half is N1.
13007 unsigned NumElts = VT.getVectorNumElements();
13008 SmallVector<int, 4> NewMask(Mask);
13009 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13010 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13011 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13012 ShuffleVectorSDNode::commuteMask(NewMask);
13013 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13014 return SDValue();
13015
13016 // Final bailout: if the mask is simple, we are better off using an extract
13017 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13018 // because that avoids a constant load from memory.
13019 if (NumElts == 4 &&
13020 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13021 return SDValue();
13022
13023 // Extend the shuffle mask with undef elements.
13024 NewMask.append(NumElts, -1);
13025
13026 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13027 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13028 NewMask);
13029 // This is free: ymm -> xmm.
13030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13031 DAG.getVectorIdxConstant(0, DL));
13032}
13033
13034/// Try to lower broadcast of a single element.
13035///
13036/// For convenience, this code also bundles all of the subtarget feature set
13037/// filtering. While a little annoying to re-dispatch on type here, there isn't
13038/// a convenient way to factor it out.
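/// For example, a v8f32 splat of element 5 of V1 corresponds to BitOffset 160;
/// if V1 is a plain vector load, the whole pattern becomes a single broadcast
/// load (VBROADCASTSS) from the original address plus 20 bytes.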
13039 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13040 SDValue V2, ArrayRef<int> Mask,
13041 const X86Subtarget &Subtarget,
13042 SelectionDAG &DAG) {
13043 MVT EltVT = VT.getVectorElementType();
13044 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13045 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13046 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13047 return SDValue();
13048
13049 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13050 // we can only broadcast from a register with AVX2.
13051 unsigned NumEltBits = VT.getScalarSizeInBits();
13052 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13053 ? X86ISD::MOVDDUP
13054 : X86ISD::VBROADCAST;
13055 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13056
13057 // Check that the mask is a broadcast.
13058 int BroadcastIdx = getSplatIndex(Mask);
13059 if (BroadcastIdx < 0) {
13060 // Check for hidden broadcast.
13061 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13062 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13063 return SDValue();
13064 BroadcastIdx = 0;
13065 }
13066 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13067 "a sorted mask where the broadcast "
13068 "comes from V1.");
13069 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13070
13071 // Go up the chain of (vector) values to find a scalar load that we can
13072 // combine with the broadcast.
13073 // TODO: Combine this logic with findEltLoadSrc() used by
13074 // EltsFromConsecutiveLoads().
13075 int BitOffset = BroadcastIdx * NumEltBits;
13076 SDValue V = V1;
13077 for (;;) {
13078 switch (V.getOpcode()) {
13079 case ISD::BITCAST: {
13080 V = V.getOperand(0);
13081 continue;
13082 }
13083 case ISD::CONCAT_VECTORS: {
13084 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13085 int OpIdx = BitOffset / OpBitWidth;
13086 V = V.getOperand(OpIdx);
13087 BitOffset %= OpBitWidth;
13088 continue;
13089 }
13090 case ISD::EXTRACT_SUBVECTOR: {
13091 // The extraction index adds to the existing offset.
13092 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13093 unsigned Idx = V.getConstantOperandVal(1);
13094 unsigned BeginOffset = Idx * EltBitWidth;
13095 BitOffset += BeginOffset;
13096 V = V.getOperand(0);
13097 continue;
13098 }
13099 case ISD::INSERT_SUBVECTOR: {
13100 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13101 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13102 int Idx = (int)V.getConstantOperandVal(2);
13103 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13104 int BeginOffset = Idx * EltBitWidth;
13105 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13106 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13107 BitOffset -= BeginOffset;
13108 V = VInner;
13109 } else {
13110 V = VOuter;
13111 }
13112 continue;
13113 }
13114 }
13115 break;
13116 }
13117 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13118 BroadcastIdx = BitOffset / NumEltBits;
13119
13120 // Do we need to bitcast the source to retrieve the original broadcast index?
13121 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13122
13123 // Check if this is a broadcast of a scalar. We special case lowering
13124 // for scalars so that we can more effectively fold with loads.
13125 // If the original value has a larger element type than the shuffle, the
13126 // broadcast element is in essence truncated. Make that explicit to ease
13127 // folding.
13128 if (BitCastSrc && VT.isInteger())
13129 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13130 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13131 return TruncBroadcast;
13132
13133 // Also check the simpler case, where we can directly reuse the scalar.
13134 if (!BitCastSrc &&
13135 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13136 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13137 V = V.getOperand(BroadcastIdx);
13138
13139 // If we can't broadcast from a register, check that the input is a load.
13140 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13141 return SDValue();
13142 } else if (ISD::isNormalLoad(V.getNode()) &&
13143 cast<LoadSDNode>(V)->isSimple()) {
13144 // We do not check for one-use of the vector load because a broadcast load
13145 // is expected to be a win for code size, register pressure, and possibly
13146 // uops even if the original vector load is not eliminated.
13147
13148 // Reduce the vector load and shuffle to a broadcasted scalar load.
13149 auto *Ld = cast<LoadSDNode>(V);
13150 SDValue BaseAddr = Ld->getBasePtr();
13151 MVT SVT = VT.getScalarType();
13152 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13153 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13154 SDValue NewAddr =
13156
13157 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13158 // than MOVDDUP.
13159 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13160 if (Opcode == X86ISD::VBROADCAST) {
13161 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13162 SDValue Ops[] = {Ld->getChain(), NewAddr};
13163 V = DAG.getMemIntrinsicNode(
13164 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13165 DAG.getMachineFunction().getMachineMemOperand(
13166 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13167 DAG.makeEquivalentMemoryOrdering(Ld, V);
13168 return DAG.getBitcast(VT, V);
13169 }
13170 assert(SVT == MVT::f64 && "Unexpected VT!");
13171 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13172 DAG.getMachineFunction().getMachineMemOperand(
13173 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13174 DAG.makeEquivalentMemoryOrdering(Ld, V);
13175 } else if (!BroadcastFromReg) {
13176 // We can't broadcast from a vector register.
13177 return SDValue();
13178 } else if (BitOffset != 0) {
13179 // We can only broadcast from the zero-element of a vector register,
13180 // but it can be advantageous to broadcast from the zero-element of a
13181 // subvector.
13182 if (!VT.is256BitVector() && !VT.is512BitVector())
13183 return SDValue();
13184
13185 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13186 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13187 return SDValue();
13188
13189 // If we are broadcasting an element from the lowest 128-bit subvector, try
13190 // to move the element in position.
13191 if (BitOffset < 128 && NumActiveElts > 1 &&
13192 V.getScalarValueSizeInBits() == NumEltBits) {
13193 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13194 "Unexpected bit-offset");
13195 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13196 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13197 V = extractSubVector(V, 0, DAG, DL, 128);
13198 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13199 } else {
13200 // Only broadcast the zero-element of a 128-bit subvector.
13201 if ((BitOffset % 128) != 0)
13202 return SDValue();
13203
13204 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13205 "Unexpected bit-offset");
13206 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13207 "Unexpected vector size");
13208 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13209 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13210 }
13211 }
13212
13213 // On AVX we can use VBROADCAST directly for scalar sources.
13214 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13215 V = DAG.getBitcast(MVT::f64, V);
13216 if (Subtarget.hasAVX()) {
13217 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13218 return DAG.getBitcast(VT, V);
13219 }
13220 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13221 }
13222
13223 // If this is a scalar, do the broadcast on this type and bitcast.
13224 if (!V.getValueType().isVector()) {
13225 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13226 "Unexpected scalar size");
13227 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13228 VT.getSizeInBits() / NumEltBits);
13229 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13230 }
13231
13232 // We only support broadcasting from 128-bit vectors to minimize the
13233 // number of patterns we need to deal with in isel. So extract down to
13234 // 128-bits, removing as many bitcasts as possible.
13235 if (V.getValueSizeInBits() > 128)
13236 V = extract128BitVector(V, 0, DAG, DL);
13237
13238 // Otherwise cast V to a vector with the same element type as VT, but
13239 // possibly narrower than VT. Then perform the broadcast.
13240 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13241 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13242 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13243}
13244
13245// Check for whether we can use INSERTPS to perform the shuffle. We only use
13246// INSERTPS when the V1 elements are already in the correct locations
13247// because otherwise we can just always use two SHUFPS instructions which
13248// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13249// perform INSERTPS if a single V1 element is out of place and all V2
13250// elements are zeroable.
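// The 8-bit INSERTPS immediate built below is laid out as:
//   bits [7:6] = source element of V2, bits [5:4] = destination element of
//   the result, bits [3:0] = mask of result elements forced to zero.
// For example, inserting V2[2] into result lane 1 while zeroing lane 3 is
// encoded as 0b10011000 (0x98).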
13251 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13252 unsigned &InsertPSMask,
13253 const APInt &Zeroable,
13254 ArrayRef<int> Mask, SelectionDAG &DAG) {
13255 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13257 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13258
13259 // Attempt to match INSERTPS with one element from VA or VB being
13260 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13261 // are updated.
13262 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13263 ArrayRef<int> CandidateMask) {
13264 unsigned ZMask = 0;
13265 int VADstIndex = -1;
13266 int VBDstIndex = -1;
13267 bool VAUsedInPlace = false;
13268
13269 for (int i = 0; i < 4; ++i) {
13270 // Synthesize a zero mask from the zeroable elements (includes undefs).
13271 if (Zeroable[i]) {
13272 ZMask |= 1 << i;
13273 continue;
13274 }
13275
13276 // Flag if we use any VA inputs in place.
13277 if (i == CandidateMask[i]) {
13278 VAUsedInPlace = true;
13279 continue;
13280 }
13281
13282 // We can only insert a single non-zeroable element.
13283 if (VADstIndex >= 0 || VBDstIndex >= 0)
13284 return false;
13285
13286 if (CandidateMask[i] < 4) {
13287 // VA input out of place for insertion.
13288 VADstIndex = i;
13289 } else {
13290 // VB input for insertion.
13291 VBDstIndex = i;
13292 }
13293 }
13294
13295 // Don't bother if we have no (non-zeroable) element for insertion.
13296 if (VADstIndex < 0 && VBDstIndex < 0)
13297 return false;
13298
13299 // Determine element insertion src/dst indices. The src index is from the
13300 // start of the inserted vector, not the start of the concatenated vector.
13301 unsigned VBSrcIndex = 0;
13302 if (VADstIndex >= 0) {
13303 // If we have a VA input out of place, we use VA as the V2 element
13304 // insertion and don't use the original V2 at all.
13305 VBSrcIndex = CandidateMask[VADstIndex];
13306 VBDstIndex = VADstIndex;
13307 VB = VA;
13308 } else {
13309 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13310 }
13311
13312 // If no V1 inputs are used in place, then the result is created only from
13313 // the zero mask and the V2 insertion - so remove V1 dependency.
13314 if (!VAUsedInPlace)
13315 VA = DAG.getUNDEF(MVT::v4f32);
13316
13317 // Update V1, V2 and InsertPSMask accordingly.
13318 V1 = VA;
13319 V2 = VB;
13320
13321 // Insert the V2 element into the desired position.
13322 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13323 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13324 return true;
13325 };
13326
13327 if (matchAsInsertPS(V1, V2, Mask))
13328 return true;
13329
13330 // Commute and try again.
13331 SmallVector<int, 4> CommutedMask(Mask);
13332 ShuffleVectorSDNode::commuteMask(CommutedMask);
13333 if (matchAsInsertPS(V2, V1, CommutedMask))
13334 return true;
13335
13336 return false;
13337}
13338
13339 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13340 ArrayRef<int> Mask, const APInt &Zeroable,
13341 SelectionDAG &DAG) {
13342 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13344
13345 // Attempt to match the insertps pattern.
13346 unsigned InsertPSMask = 0;
13347 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13348 return SDValue();
13349
13350 // Insert the V2 element into the desired position.
13351 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13352 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13353}
13354
13355/// Handle lowering of 2-lane 64-bit floating point shuffles.
13356///
13357/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13358/// support for floating point shuffles but not integer shuffles. These
13359/// instructions will incur a domain crossing penalty on some chips though so
13360/// it is better to avoid lowering through this for integer vectors where
13361/// possible.
13362 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363 const APInt &Zeroable, SDValue V1, SDValue V2,
13364 const X86Subtarget &Subtarget,
13365 SelectionDAG &DAG) {
13366 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13368 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13369
13370 if (V2.isUndef()) {
13371 // Check for being able to broadcast a single element.
13372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13373 Mask, Subtarget, DAG))
13374 return Broadcast;
13375
13376 // Straight shuffle of a single input vector. Simulate this by using the
13377 // single input as both of the "inputs" to this instruction.
13378 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
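// For example, the splat mask <1, 1> yields SHUFPDMask == 0b11, selecting the
// high double for both result lanes.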
13379
13380 if (Subtarget.hasAVX()) {
13381 // If we have AVX, we can use VPERMILPS which will allow folding a load
13382 // into the shuffle.
13383 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13385 }
13386
13387 return DAG.getNode(
13388 X86ISD::SHUFP, DL, MVT::v2f64,
13389 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13391 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13392 }
13393 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13395 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13396 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13397
13398 if (Subtarget.hasAVX2())
13399 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13400 return Extract;
13401
13402 // When loading a scalar and then shuffling it into a vector we can often do
13403 // the insertion cheaply.
13404 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13405 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13406 return Insertion;
13407 // Try inverting the insertion since for v2 masks it is easy to do and we
13408 // can't reliably sort the mask one way or the other.
13409 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13410 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // Try to use one of the special instruction patterns to handle two common
13416 // blend patterns if a zero-blend above didn't work.
13417 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13418 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13419 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13420 // We can either use a special instruction to load over the low double or
13421 // to move just the low double.
13422 return DAG.getNode(
13423 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13424 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13425
13426 if (Subtarget.hasSSE41())
13427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13429 return Blend;
13430
13431 // Use dedicated unpack instructions for masks that match their pattern.
13432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13433 return V;
13434
13435 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13436 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13438}
13439
13440/// Handle lowering of 2-lane 64-bit integer shuffles.
13441///
13442/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13443/// the integer unit to minimize domain crossing penalties. However, for blends
13444/// it falls back to the floating point shuffle operation with appropriate bit
13445/// casting.
13446 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable, SDValue V1, SDValue V2,
13448 const X86Subtarget &Subtarget,
13449 SelectionDAG &DAG) {
13450 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13452 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13453
13454 if (V2.isUndef()) {
13455 // Check for being able to broadcast a single element.
13456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13457 Mask, Subtarget, DAG))
13458 return Broadcast;
13459
13460 // Straight shuffle of a single input vector. For everything from SSE2
13461 // onward this has a single fast instruction with no scary immediates.
13462 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13463 V1 = DAG.getBitcast(MVT::v4i32, V1);
13464 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13465 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13466 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13467 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
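// For example, the v2i64 mask <1, 0> widens to the v4i32 mask <2, 3, 0, 1>,
// which is a single PSHUFD with immediate 0x4E.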
13468 return DAG.getBitcast(
13469 MVT::v2i64,
13470 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13471 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13472 }
13473 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13475 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13476 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13477
13478 if (Subtarget.hasAVX2())
13479 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13480 return Extract;
13481
13482 // Try to use shift instructions.
13483 if (SDValue Shift =
13484 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13485 DAG, /*BitwiseOnly*/ false))
13486 return Shift;
13487
13488 // When loading a scalar and then shuffling it into a vector we can often do
13489 // the insertion cheaply.
13490 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13491 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13492 return Insertion;
13493 // Try inverting the insertion since for v2 masks it is easy to do and we
13494 // can't reliably sort the mask one way or the other.
13495 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499
13500 // We have different paths for blend lowering, but they all must use the
13501 // *exact* same predicate.
13502 bool IsBlendSupported = Subtarget.hasSSE41();
13503 if (IsBlendSupported)
13504 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13505 Zeroable, Subtarget, DAG))
13506 return Blend;
13507
13508 // Use dedicated unpack instructions for masks that match their pattern.
13509 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13510 return V;
13511
13512 // Try to use byte rotation instructions.
13513 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13514 if (Subtarget.hasSSSE3()) {
13515 if (Subtarget.hasVLX())
13516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13517 Zeroable, Subtarget, DAG))
13518 return Rotate;
13519
13520 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13521 Subtarget, DAG))
13522 return Rotate;
13523 }
13524
13525 // If we have direct support for blends, we should lower by decomposing into
13526 // a permute. That will be faster than the domain cross.
13527 if (IsBlendSupported)
13528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13529 Zeroable, Subtarget, DAG);
13530
13531 // We implement this with SHUFPD which is pretty lame because it will likely
13532 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13533 // However, all the alternatives are still more cycles and newer chips don't
13534 // have this problem. It would be really nice if x86 had better shuffles here.
13535 V1 = DAG.getBitcast(MVT::v2f64, V1);
13536 V2 = DAG.getBitcast(MVT::v2f64, V2);
13537 return DAG.getBitcast(MVT::v2i64,
13538 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13539}
13540
13541/// Lower a vector shuffle using the SHUFPS instruction.
13542///
13543/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13544/// It makes no assumptions about whether this is the *best* lowering, it simply
13545/// uses it.
13546 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13547 ArrayRef<int> Mask, SDValue V1,
13548 SDValue V2, SelectionDAG &DAG) {
13549 SDValue LowV = V1, HighV = V2;
13550 SmallVector<int, 4> NewMask(Mask);
13551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13552
13553 if (NumV2Elements == 1) {
13554 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13555
13556 // Compute the index adjacent to V2Index and in the same half by toggling
13557 // the low bit.
13558 int V2AdjIndex = V2Index ^ 1;
13559
13560 if (Mask[V2AdjIndex] < 0) {
13561 // Handles all the cases where we have a single V2 element and an undef.
13562 // This will only ever happen in the high lanes because we commute the
13563 // vector otherwise.
13564 if (V2Index < 2)
13565 std::swap(LowV, HighV);
13566 NewMask[V2Index] -= 4;
13567 } else {
13568 // Handle the case where the V2 element ends up adjacent to a V1 element.
13569 // To make this work, blend them together as the first step.
13570 int V1Index = V2AdjIndex;
13571 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13572 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13573 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13574
13575 // Now proceed to reconstruct the final blend as we have the necessary
13576 // high or low half formed.
13577 if (V2Index < 2) {
13578 LowV = V2;
13579 HighV = V1;
13580 } else {
13581 HighV = V2;
13582 }
13583 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13584 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
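// For example, for Mask <0, 1, 2, 7> the first SHUFPS blends V2[3] and V1[2]
// into one register, and the final SHUFPS of V1 with that blend uses the
// mask <0, 1, 2, 0>.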
13585 }
13586 } else if (NumV2Elements == 2) {
13587 if (Mask[0] < 4 && Mask[1] < 4) {
13588 // Handle the easy case where we have V1 in the low lanes and V2 in the
13589 // high lanes.
13590 NewMask[2] -= 4;
13591 NewMask[3] -= 4;
13592 } else if (Mask[2] < 4 && Mask[3] < 4) {
13593 // We also handle the reversed case because this utility may get called
13594 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13595 // arrange things in the right direction.
13596 NewMask[0] -= 4;
13597 NewMask[1] -= 4;
13598 HighV = V1;
13599 LowV = V2;
13600 } else {
13601 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13602 // trying to place elements directly, just blend them and set up the final
13603 // shuffle to place them.
13604
13605 // The first two blend mask elements are for V1, the second two are for
13606 // V2.
13607 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13608 Mask[2] < 4 ? Mask[2] : Mask[3],
13609 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13610 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13611 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13612 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13613
13614 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13615 // a blend.
13616 LowV = HighV = V1;
13617 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13618 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13619 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13620 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13621 }
13622 } else if (NumV2Elements == 3) {
13623 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13624 // we can get here via other paths (e.g. repeated mask matching) where we
13625 // don't want to do another round of lowerVECTOR_SHUFFLE.
13626 ShuffleVectorSDNode::commuteMask(NewMask);
13627 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13628 }
13629 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13630 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13631}
13632
13633/// Lower 4-lane 32-bit floating point shuffles.
13634///
13635/// Uses instructions exclusively from the floating point unit to minimize
13636/// domain crossing penalties, as these are sufficient to implement all v4f32
13637/// shuffles.
13638 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13639 const APInt &Zeroable, SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 if (Subtarget.hasSSE41())
13647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13648 Zeroable, Subtarget, DAG))
13649 return Blend;
13650
13651 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13652
13653 if (NumV2Elements == 0) {
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13656 Mask, Subtarget, DAG))
13657 return Broadcast;
13658
13659 // Use even/odd duplicate instructions for masks that match their pattern.
13660 if (Subtarget.hasSSE3()) {
13661 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13662 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13663 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13665 }
13666
13667 if (Subtarget.hasAVX()) {
13668 // If we have AVX, we can use VPERMILPS which will allow folding a load
13669 // into the shuffle.
13670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13671 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13672 }
13673
13674 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13675 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13676 if (!Subtarget.hasSSE2()) {
13677 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13678 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13679 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13680 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13681 }
13682
13683 // Otherwise, use a straight shuffle of a single input vector. We pass the
13684 // input vector to both operands to simulate this with a SHUFPS.
13685 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13687 }
13688
13689 if (Subtarget.hasSSE2())
13690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13691 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13692 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13693 return ZExt;
13694 }
13695
13696 if (Subtarget.hasAVX2())
13697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13698 return Extract;
13699
13700 // There are special ways we can lower some single-element blends. However, we
13701 // have custom ways we can lower more complex single-element blends below that
13702 // we defer to if both this and BLENDPS fail to match, so restrict this to
13703 // when the V2 input is targeting element 0 of the mask -- that is the fast
13704 // case here.
13705 if (NumV2Elements == 1 && Mask[0] >= 4)
13706 if (SDValue V = lowerShuffleAsElementInsertion(
13707 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13708 return V;
13709
13710 if (Subtarget.hasSSE41()) {
13711 // Use INSERTPS if we can complete the shuffle efficiently.
13712 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13713 return V;
13714
13715 if (!isSingleSHUFPSMask(Mask))
13716 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13717 V2, Mask, DAG))
13718 return BlendPerm;
13719 }
13720
13721 // Use low/high mov instructions. These are only valid in SSE1 because
13722 // otherwise they are widened to v2f64 and never get here.
13723 if (!Subtarget.hasSSE2()) {
13724 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13725 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13726 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13727 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13728 }
13729
13730 // Use dedicated unpack instructions for masks that match their pattern.
13731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13732 return V;
13733
13734 // Otherwise fall back to a SHUFPS lowering strategy.
13735 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13736}
13737
13738/// Lower 4-lane i32 vector shuffles.
13739///
13740/// We try to handle these with integer-domain shuffles where we can, but for
13741/// blends we use the floating point domain blend instructions.
13742 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13743 const APInt &Zeroable, SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749
13750 // Whenever we can lower this as a zext, that instruction is strictly faster
13751 // than any alternative. It also allows us to fold memory operands into the
13752 // shuffle in many cases.
13753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13754 Zeroable, Subtarget, DAG))
13755 return ZExt;
13756
13757 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13758
13759 // Try to use shift instructions if fast.
13760 if (Subtarget.preferLowerShuffleAsShift()) {
13761 if (SDValue Shift =
13762 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13763 Subtarget, DAG, /*BitwiseOnly*/ true))
13764 return Shift;
13765 if (NumV2Elements == 0)
13766 if (SDValue Rotate =
13767 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13768 return Rotate;
13769 }
13770
13771 if (NumV2Elements == 0) {
13772 // Try to use broadcast unless the mask only has one non-undef element.
13773 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13774 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13775 Mask, Subtarget, DAG))
13776 return Broadcast;
13777 }
13778
13779 // Straight shuffle of a single input vector. For everything from SSE2
13780 // onward this has a single fast instruction with no scary immediates.
13781 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13782 // but we aren't actually going to use the UNPCK instruction because doing
13783 // so prevents folding a load into this instruction or making a copy.
13784 const int UnpackLoMask[] = {0, 0, 1, 1};
13785 const int UnpackHiMask[] = {2, 2, 3, 3};
13786 if (!isSingleElementRepeatedMask(Mask)) {
13787 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13788 Mask = UnpackLoMask;
13789 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13790 Mask = UnpackHiMask;
13791 }
13792
13793 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13794 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13795 }
13796
13797 if (Subtarget.hasAVX2())
13798 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13799 return Extract;
13800
13801 // Try to use shift instructions.
13802 if (SDValue Shift =
13803 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13804 DAG, /*BitwiseOnly*/ false))
13805 return Shift;
13806
13807 // There are special ways we can lower some single-element blends.
13808 if (NumV2Elements == 1)
13809 if (SDValue V = lowerShuffleAsElementInsertion(
13810 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13811 return V;
13812
13813 // We have different paths for blend lowering, but they all must use the
13814 // *exact* same predicate.
13815 bool IsBlendSupported = Subtarget.hasSSE41();
13816 if (IsBlendSupported)
13817 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13818 Zeroable, Subtarget, DAG))
13819 return Blend;
13820
13821 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13822 Zeroable, Subtarget, DAG))
13823 return Masked;
13824
13825 // Use dedicated unpack instructions for masks that match their pattern.
13826 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13827 return V;
13828
13829 // Try to use byte rotation instructions.
13830 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13831 if (Subtarget.hasSSSE3()) {
13832 if (Subtarget.hasVLX())
13833 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13834 Zeroable, Subtarget, DAG))
13835 return Rotate;
13836
13837 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13838 Subtarget, DAG))
13839 return Rotate;
13840 }
13841
13842 // Assume that a single SHUFPS is faster than an alternative sequence of
13843 // multiple instructions (even if the CPU has a domain penalty).
13844 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13845 if (!isSingleSHUFPSMask(Mask)) {
13846 // If we have direct support for blends, we should lower by decomposing into
13847 // a permute. That will be faster than the domain cross.
13848 if (IsBlendSupported)
13849 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13850 Zeroable, Subtarget, DAG);
13851
13852 // Try to lower by permuting the inputs into an unpack instruction.
13853 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13854 Mask, Subtarget, DAG))
13855 return Unpack;
13856 }
13857
13858 // We implement this with SHUFPS because it can blend from two vectors.
13859 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13860 // up the inputs, bypassing domain shift penalties that we would incur if we
13861 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13862 // relevant.
13863 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13864 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13865 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13866 return DAG.getBitcast(MVT::v4i32, ShufPS);
13867}
13868
13869/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13870/// shuffle lowering, and the most complex part.
13871///
13872/// The lowering strategy is to try to form pairs of input lanes which are
13873/// targeted at the same half of the final vector, and then use a dword shuffle
13874/// to place them onto the right half, and finally unpack the paired lanes into
13875/// their final position.
13876///
13877/// The exact breakdown of how to form these dword pairs and align them on the
13878/// correct sides is really tricky. See the comments within the function for
13879/// more of the details.
13880///
13881/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13882/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13883/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13884/// vector, form the analogous 128-bit 8-element Mask.
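/// For example, the mask <2, 1, 0, 3, 4, 5, 6, 7> keeps the high half in
/// place and is matched directly below as a single PSHUFLW.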
13885 static SDValue lowerV8I16GeneralSingleInputShuffle(
13886 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13887 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13888 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13889 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13890
13891 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13892 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13893 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13894
13895 // Attempt to directly match PSHUFLW or PSHUFHW.
13896 if (isUndefOrInRange(LoMask, 0, 4) &&
13897 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13898 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13899 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13900 }
13901 if (isUndefOrInRange(HiMask, 4, 8) &&
13902 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13903 for (int i = 0; i != 4; ++i)
13904 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13905 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13906 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13907 }
13908
13909 SmallVector<int, 4> LoInputs;
13910 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13911 array_pod_sort(LoInputs.begin(), LoInputs.end());
13912 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13913 SmallVector<int, 4> HiInputs;
13914 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13915 array_pod_sort(HiInputs.begin(), HiInputs.end());
13916 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13917 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13918 int NumHToL = LoInputs.size() - NumLToL;
13919 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13920 int NumHToH = HiInputs.size() - NumLToH;
13921 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13922 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13923 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13924 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13925
13926 // If we are shuffling values from one half - check how many different DWORD
13927 // pairs we need to create. If only 1 or 2 then we can perform this as a
13928 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13929 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13930 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13931 V = DAG.getNode(ShufWOp, DL, VT, V,
13932 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13933 V = DAG.getBitcast(PSHUFDVT, V);
13934 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13935 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13936 return DAG.getBitcast(VT, V);
13937 };
13938
13939 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13940 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13941 SmallVector<std::pair<int, int>, 4> DWordPairs;
13942 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13943
13944 // Collect the different DWORD pairs.
13945 for (int DWord = 0; DWord != 4; ++DWord) {
13946 int M0 = Mask[2 * DWord + 0];
13947 int M1 = Mask[2 * DWord + 1];
13948 M0 = (M0 >= 0 ? M0 % 4 : M0);
13949 M1 = (M1 >= 0 ? M1 % 4 : M1);
13950 if (M0 < 0 && M1 < 0)
13951 continue;
13952
13953 bool Match = false;
13954 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13955 auto &DWordPair = DWordPairs[j];
13956 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13957 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13958 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13959 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13960 PSHUFDMask[DWord] = DOffset + j;
13961 Match = true;
13962 break;
13963 }
13964 }
13965 if (!Match) {
13966 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13967 DWordPairs.push_back(std::make_pair(M0, M1));
13968 }
13969 }
13970
13971 if (DWordPairs.size() <= 2) {
13972 DWordPairs.resize(2, std::make_pair(-1, -1));
13973 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13974 DWordPairs[1].first, DWordPairs[1].second};
13975 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13976 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13977 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13978 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13979 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13980 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13981 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13982 }
13983 if ((NumHToL + NumHToH) == 0)
13984 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13985 if ((NumLToL + NumLToH) == 0)
13986 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13987 }
13988 }
13989
13990 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13991 // such inputs we can swap two of the dwords across the half mark and end up
13992 // with <=2 inputs to each half in each half. Once there, we can fall through
13993 // to the generic code below. For example:
13994 //
13995 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13996 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13997 //
13998 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13999 // and an existing 2-into-2 on the other half. In this case we may have to
14000 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14001 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14002 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14003 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14004 // half than the one we target for fixing) will be fixed when we re-enter this
14005 // path. Any resulting sequence of PSHUFD instructions will also be combined
14006 // away into a single instruction. Here is an example of the tricky case:
14007 //
14008 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14009 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14010 //
14011 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14012 //
14013 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14014 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14015 //
14016 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14017 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14018 //
14019 // The result is fine to be handled by the generic logic.
14020 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14021 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14022 int AOffset, int BOffset) {
14023 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14024 "Must call this with A having 3 or 1 inputs from the A half.");
14025 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14026 "Must call this with B having 1 or 3 inputs from the B half.");
14027 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14028 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14029
14030 bool ThreeAInputs = AToAInputs.size() == 3;
14031
14032 // Compute the index of the dword with only one word among the three inputs in
14033 // a half by taking the sum of the half with three inputs and subtracting
14034 // the sum of the actual three inputs. The difference is the remaining
14035 // slot.
14036 int ADWord = 0, BDWord = 0;
14037 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14038 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14039 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14040 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14041 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14042 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14043 int TripleNonInputIdx =
14044 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14045 TripleDWord = TripleNonInputIdx / 2;
14046
14047 // We use xor with one to compute the adjacent DWord to whichever one the
14048 // OneInput is in.
14049 OneInputDWord = (OneInput / 2) ^ 1;
14050
14051 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14052 // and BToA inputs. If there is also such a problem with the BToB and AToB
14053 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14054 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14055 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14056 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14057 // Compute how many inputs will be flipped by swapping these DWords. We
14058 // need to balance this to ensure we don't form a 3-1 shuffle in the
14059 // other half.
14061 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14062 llvm::count(AToBInputs, 2 * ADWord + 1);
14063 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14064 llvm::count(BToBInputs, 2 * BDWord + 1);
14065 if ((NumFlippedAToBInputs == 1 &&
14066 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14067 (NumFlippedBToBInputs == 1 &&
14068 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14069 // We choose whether to fix the A half or B half based on whether that
14070 // half has zero flipped inputs. At zero, we may not be able to fix it
14071 // with that half. We also bias towards fixing the B half because that
14072 // will more commonly be the high half, and we have to bias one way.
14073 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14074 ArrayRef<int> Inputs) {
14075 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14076 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14077 // Determine whether the free index is in the flipped dword or the
14078 // unflipped dword based on where the pinned index is. We use this bit
14079 // in an xor to conditionally select the adjacent dword.
14080 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14081 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14082 if (IsFixIdxInput == IsFixFreeIdxInput)
14083 FixFreeIdx += 1;
14084 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14085 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14086 "We need to be changing the number of flipped inputs!");
14087 int PSHUFHalfMask[] = {0, 1, 2, 3};
14088 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14089 V = DAG.getNode(
14090 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14091 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14092 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14093
14094 for (int &M : Mask)
14095 if (M >= 0 && M == FixIdx)
14096 M = FixFreeIdx;
14097 else if (M >= 0 && M == FixFreeIdx)
14098 M = FixIdx;
14099 };
14100 if (NumFlippedBToBInputs != 0) {
14101 int BPinnedIdx =
14102 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14103 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14104 } else {
14105 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14106 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14107 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14108 }
14109 }
14110 }
14111
14112 int PSHUFDMask[] = {0, 1, 2, 3};
14113 PSHUFDMask[ADWord] = BDWord;
14114 PSHUFDMask[BDWord] = ADWord;
14115 V = DAG.getBitcast(
14116 VT,
14117 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14118 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14119
14120 // Adjust the mask to match the new locations of A and B.
14121 for (int &M : Mask)
14122 if (M >= 0 && M/2 == ADWord)
14123 M = 2 * BDWord + M % 2;
14124 else if (M >= 0 && M/2 == BDWord)
14125 M = 2 * ADWord + M % 2;
14126
14127 // Recurse back into this routine to re-compute state now that this isn't
14128 // a 3 and 1 problem.
14129 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14130 };
14131 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14132 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14133 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14134 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14135
14136 // At this point there are at most two inputs to the low and high halves from
14137 // each half. That means the inputs can always be grouped into dwords and
14138 // those dwords can then be moved to the correct half with a dword shuffle.
14139 // We use at most one low and one high word shuffle to collect these paired
14140 // inputs into dwords, and finally a dword shuffle to place them.
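// For example, the single-input mask <0,4,1,5,2,6,3,7> pairs words {0,1} and
// {4,5} for the low half and {2,3} and {6,7} for the high half: the word
// shuffles keep each pair packed in its dword, the dword shuffle swaps dwords
// 1 and 2 so each half holds its own inputs, and the final half shuffles
// interleave the pairs into place.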
14141 int PSHUFLMask[4] = {-1, -1, -1, -1};
14142 int PSHUFHMask[4] = {-1, -1, -1, -1};
14143 int PSHUFDMask[4] = {-1, -1, -1, -1};
14144
14145 // First fix the masks for all the inputs that are staying in their
14146 // original halves. This will then dictate the targets of the cross-half
14147 // shuffles.
14148 auto fixInPlaceInputs =
14149 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14150 MutableArrayRef<int> SourceHalfMask,
14151 MutableArrayRef<int> HalfMask, int HalfOffset) {
14152 if (InPlaceInputs.empty())
14153 return;
14154 if (InPlaceInputs.size() == 1) {
14155 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14156 InPlaceInputs[0] - HalfOffset;
14157 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14158 return;
14159 }
14160 if (IncomingInputs.empty()) {
14161 // Just fix all of the in place inputs.
14162 for (int Input : InPlaceInputs) {
14163 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14164 PSHUFDMask[Input / 2] = Input / 2;
14165 }
14166 return;
14167 }
14168
14169 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14170 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14171 InPlaceInputs[0] - HalfOffset;
14172 // Put the second input next to the first so that they are packed into
14173 // a dword. We find the adjacent index by toggling the low bit.
14174 int AdjIndex = InPlaceInputs[0] ^ 1;
14175 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14176 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14177 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14178 };
14179 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14180 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14181
14182 // Now gather the cross-half inputs and place them into a free dword of
14183 // their target half.
14184 // FIXME: This operation could almost certainly be simplified dramatically to
14185 // look more like the 3-1 fixing operation.
14186 auto moveInputsToRightHalf = [&PSHUFDMask](
14187 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14188 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14189 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14190 int DestOffset) {
14191 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14192 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14193 };
14194 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14195 int Word) {
14196 int LowWord = Word & ~1;
14197 int HighWord = Word | 1;
14198 return isWordClobbered(SourceHalfMask, LowWord) ||
14199 isWordClobbered(SourceHalfMask, HighWord);
14200 };
14201
14202 if (IncomingInputs.empty())
14203 return;
14204
14205 if (ExistingInputs.empty()) {
14206 // Map any dwords with inputs from them into the right half.
14207 for (int Input : IncomingInputs) {
14208 // If the source half mask maps over the inputs, turn those into
14209 // swaps and use the swapped lane.
14210 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14211 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14212 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14213 Input - SourceOffset;
14214 // We have to swap the uses in our half mask in one sweep.
14215 for (int &M : HalfMask)
14216 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14217 M = Input;
14218 else if (M == Input)
14219 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14220 } else {
14221 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14222 Input - SourceOffset &&
14223 "Previous placement doesn't match!");
14224 }
14225 // Note that this correctly re-maps both when we do a swap and when
14226 // we observe the other side of the swap above. We rely on that to
14227 // avoid swapping the members of the input list directly.
14228 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14229 }
14230
14231 // Map the input's dword into the correct half.
14232 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14233 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14234 else
14235 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14236 Input / 2 &&
14237 "Previous placement doesn't match!");
14238 }
14239
14240 // And just directly shift any other-half mask elements to be same-half
14241 // as we will have mirrored the dword containing the element into the
14242 // same position within that half.
14243 for (int &M : HalfMask)
14244 if (M >= SourceOffset && M < SourceOffset + 4) {
14245 M = M - SourceOffset + DestOffset;
14246 assert(M >= 0 && "This should never wrap below zero!");
14247 }
14248 return;
14249 }
14250
14251 // Ensure we have the input in a viable dword of its current half. This
14252 // is particularly tricky because the original position may be clobbered
14253 // by inputs being moved and *staying* in that half.
14254 if (IncomingInputs.size() == 1) {
14255 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14256 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14257 SourceOffset;
14258 SourceHalfMask[InputFixed - SourceOffset] =
14259 IncomingInputs[0] - SourceOffset;
14260 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14261 IncomingInputs[0] = InputFixed;
14262 }
14263 } else if (IncomingInputs.size() == 2) {
14264 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14265 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14266 // We have two non-adjacent or clobbered inputs we need to extract from
14267 // the source half. To do this, we need to map them into some adjacent
14268 // dword slot in the source mask.
14269 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14270 IncomingInputs[1] - SourceOffset};
14271
14272 // If there is a free slot in the source half mask adjacent to one of
14273 // the inputs, place the other input in it. We use (Index XOR 1) to
14274 // compute an adjacent index.
14275 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14276 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14277 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14278 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14279 InputsFixed[1] = InputsFixed[0] ^ 1;
14280 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14281 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14282 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14283 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14284 InputsFixed[0] = InputsFixed[1] ^ 1;
14285 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14286 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14287 // The two inputs are in the same DWord but it is clobbered and the
14288 // adjacent DWord isn't used at all. Move both inputs to the free
14289 // slot.
14290 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14291 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14292 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14293 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14294 } else {
14295 // The only way we hit this point is if there is no clobbering
14296 // (because there are no off-half inputs to this half) and there is no
14297 // free slot adjacent to one of the inputs. In this case, we have to
14298 // swap an input with a non-input.
14299 for (int i = 0; i < 4; ++i)
14300 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14301 "We can't handle any clobbers here!");
14302 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14303 "Cannot have adjacent inputs here!");
14304
14305 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14306 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14307
14308 // We also have to update the final source mask in this case because
14309 // it may need to undo the above swap.
14310 for (int &M : FinalSourceHalfMask)
14311 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14312 M = InputsFixed[1] + SourceOffset;
14313 else if (M == InputsFixed[1] + SourceOffset)
14314 M = (InputsFixed[0] ^ 1) + SourceOffset;
14315
14316 InputsFixed[1] = InputsFixed[0] ^ 1;
14317 }
14318
14319 // Point everything at the fixed inputs.
14320 for (int &M : HalfMask)
14321 if (M == IncomingInputs[0])
14322 M = InputsFixed[0] + SourceOffset;
14323 else if (M == IncomingInputs[1])
14324 M = InputsFixed[1] + SourceOffset;
14325
14326 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14327 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14328 }
14329 } else {
14330 llvm_unreachable("Unhandled input size!");
14331 }
14332
14333 // Now hoist the DWord down to the right half.
14334 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14335 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14336 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14337 for (int &M : HalfMask)
14338 for (int Input : IncomingInputs)
14339 if (M == Input)
14340 M = FreeDWord * 2 + Input % 2;
14341 };
14342 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14343 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14344 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14345 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14346
14347 // Now enact all the shuffles we've computed to move the inputs into their
14348 // target half.
14349 if (!isNoopShuffleMask(PSHUFLMask))
14350 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14351 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14352 if (!isNoopShuffleMask(PSHUFHMask))
14353 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14354 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14355 if (!isNoopShuffleMask(PSHUFDMask))
14356 V = DAG.getBitcast(
14357 VT,
14358 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14359 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14360
14361 // At this point, each half should contain all its inputs, and we can then
14362 // just shuffle them into their final position.
14363 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14364 "Failed to lift all the high half inputs to the low mask!");
14365 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14366 "Failed to lift all the low half inputs to the high mask!");
14367
14368 // Do a half shuffle for the low mask.
14369 if (!isNoopShuffleMask(LoMask))
14370 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14371 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14372
14373 // Do a half shuffle with the high mask after shifting its values down.
14374 for (int &M : HiMask)
14375 if (M >= 0)
14376 M -= 4;
14377 if (!isNoopShuffleMask(HiMask))
14378 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14379 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14380
14381 return V;
14382}
14383
14384/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14385/// blend if only one input is used.
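/// For example, for a v16i8 mask such as <1,16,3,18,...>, V1's PSHUFB control
/// becomes <1,0x80,3,0x80,...> and V2's becomes <0x80,0,0x80,2,...>; the 0x80
/// entries zero the lanes taken from the other input and the two results are
/// OR'd together. If only one input ends up referenced, the OR is skipped.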
14386 static SDValue lowerShuffleAsBlendOfPSHUFBs(
14387 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14388 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14390 "Lane crossing shuffle masks not supported");
14391
14392 int NumBytes = VT.getSizeInBits() / 8;
14393 int Size = Mask.size();
14394 int Scale = NumBytes / Size;
14395
14396 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14397 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14398 V1InUse = false;
14399 V2InUse = false;
14400
14401 for (int i = 0; i < NumBytes; ++i) {
14402 int M = Mask[i / Scale];
14403 if (M < 0)
14404 continue;
14405
14406 const int ZeroMask = 0x80;
14407 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14408 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14409 if (Zeroable[i / Scale])
14410 V1Idx = V2Idx = ZeroMask;
14411
14412 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14413 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14414 V1InUse |= (ZeroMask != V1Idx);
14415 V2InUse |= (ZeroMask != V2Idx);
14416 }
14417
14418 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14419 if (V1InUse)
14420 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14421 DAG.getBuildVector(ShufVT, DL, V1Mask));
14422 if (V2InUse)
14423 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14424 DAG.getBuildVector(ShufVT, DL, V2Mask));
14425
14426 // If we need shuffled inputs from both, blend the two.
14427 SDValue V;
14428 if (V1InUse && V2InUse)
14429 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14430 else
14431 V = V1InUse ? V1 : V2;
14432
14433 // Cast the result back to the correct type.
14434 return DAG.getBitcast(VT, V);
14435}
14436
14437/// Generic lowering of 8-lane i16 shuffles.
14438///
14439/// This handles both single-input shuffles and combined shuffle/blends with
14440/// two inputs. The single input shuffles are immediately delegated to
14441/// a dedicated lowering routine.
14442///
14443/// The blends are lowered in one of three fundamental ways. If there are few
14444/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14445/// of the input is significantly cheaper when lowered as an interleaving of
14446/// the two inputs, try to interleave them. Otherwise, blend the low and high
14447/// halves of the inputs separately (making them have relatively few inputs)
14448/// and then concatenate them.
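/// For example, the interleaving blend <0,8,1,9,2,10,3,11> is caught by the
/// dedicated UNPCK check below and lowers to a single PUNPCKLWD of the inputs.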
14449 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14450 const APInt &Zeroable, SDValue V1, SDValue V2,
14451 const X86Subtarget &Subtarget,
14452 SelectionDAG &DAG) {
14453 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14454 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14455 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14456
14457 // Whenever we can lower this as a zext, that instruction is strictly faster
14458 // than any alternative.
14459 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14460 Zeroable, Subtarget, DAG))
14461 return ZExt;
14462
14463 // Try to lower using a truncation.
14464 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14465 Subtarget, DAG))
14466 return V;
14467
14468 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14469
14470 if (NumV2Inputs == 0) {
14471 // Try to use shift instructions.
14472 if (SDValue Shift =
14473 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14474 Subtarget, DAG, /*BitwiseOnly*/ false))
14475 return Shift;
14476
14477 // Check for being able to broadcast a single element.
14478 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14479 Mask, Subtarget, DAG))
14480 return Broadcast;
14481
14482 // Try to use bit rotation instructions.
14483 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14484 Subtarget, DAG))
14485 return Rotate;
14486
14487 // Use dedicated unpack instructions for masks that match their pattern.
14488 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14489 return V;
14490
14491 // Use dedicated pack instructions for masks that match their pattern.
14492 if (SDValue V =
14493 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14494 return V;
14495
14496 // Try to use byte rotation instructions.
14497 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14498 Subtarget, DAG))
14499 return Rotate;
14500
14501 // Make a copy of the mask so it can be modified.
14502 SmallVector<int, 8> MutableMask(Mask);
14503 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14504 Subtarget, DAG);
14505 }
14506
14507 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14508 "All single-input shuffles should be canonicalized to be V1-input "
14509 "shuffles.");
14510
14511 // Try to use shift instructions.
14512 if (SDValue Shift =
14513 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14514 DAG, /*BitwiseOnly*/ false))
14515 return Shift;
14516
14517 // See if we can use SSE4A Extraction / Insertion.
14518 if (Subtarget.hasSSE4A())
14519 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14520 Zeroable, DAG))
14521 return V;
14522
14523 // There are special ways we can lower some single-element blends.
14524 if (NumV2Inputs == 1)
14525 if (SDValue V = lowerShuffleAsElementInsertion(
14526 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14527 return V;
14528
14529 // We have different paths for blend lowering, but they all must use the
14530 // *exact* same predicate.
14531 bool IsBlendSupported = Subtarget.hasSSE41();
14532 if (IsBlendSupported)
14533 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14534 Zeroable, Subtarget, DAG))
14535 return Blend;
14536
14537 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14538 Zeroable, Subtarget, DAG))
14539 return Masked;
14540
14541 // Use dedicated unpack instructions for masks that match their pattern.
14542 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14543 return V;
14544
14545 // Use dedicated pack instructions for masks that match their pattern.
14546 if (SDValue V =
14547 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14548 return V;
14549
14550 // Try to lower using a truncation.
14551 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14552 Subtarget, DAG))
14553 return V;
14554
14555 // Try to use byte rotation instructions.
14556 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14557 Subtarget, DAG))
14558 return Rotate;
14559
14560 if (SDValue BitBlend =
14561 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14562 return BitBlend;
14563
14564 // Try to use byte shift instructions to mask.
14565 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14566 Zeroable, Subtarget, DAG))
14567 return V;
14568
14569 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14570 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14571 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14572 !Subtarget.hasVLX()) {
14573 // Check if this is part of a 256-bit vector truncation.
14574 unsigned PackOpc = 0;
14575 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14576 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14577 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14578 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14579 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14580 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14581 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14582 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14583 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14584 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14585 PackOpc = X86ISD::PACKUS;
14586 } else if (Subtarget.hasSSE41()) {
14587 SmallVector<SDValue, 4> DWordClearOps(4,
14588 DAG.getConstant(0, DL, MVT::i32));
14589 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14590 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14591 SDValue DWordClearMask =
14592 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14593 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14594 DWordClearMask);
14595 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14596 DWordClearMask);
14597 PackOpc = X86ISD::PACKUS;
14598 } else if (!Subtarget.hasSSSE3()) {
14599 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14600 V1 = DAG.getBitcast(MVT::v4i32, V1);
14601 V2 = DAG.getBitcast(MVT::v4i32, V2);
14602 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14603 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14604 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14605 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14606 PackOpc = X86ISD::PACKSS;
14607 }
14608 if (PackOpc) {
14609 // Now pack things back together.
14610 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14611 if (NumEvenDrops == 2) {
14612 Result = DAG.getBitcast(MVT::v4i32, Result);
14613 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14614 }
14615 return Result;
14616 }
14617 }
14618
14619 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14620 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14621 if (NumOddDrops == 1) {
14622 bool HasSSE41 = Subtarget.hasSSE41();
14623 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14624 DAG.getBitcast(MVT::v4i32, V1),
14625 DAG.getTargetConstant(16, DL, MVT::i8));
14626 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14627 DAG.getBitcast(MVT::v4i32, V2),
14628 DAG.getTargetConstant(16, DL, MVT::i8));
14629 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14630 MVT::v8i16, V1, V2);
14631 }
14632
14633 // Try to lower by permuting the inputs into an unpack instruction.
14634 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14635 Mask, Subtarget, DAG))
14636 return Unpack;
14637
14638 // If we can't directly blend but can use PSHUFB, that will be better as it
14639 // can both shuffle and set up the inefficient blend.
14640 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14641 bool V1InUse, V2InUse;
14642 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14643 Zeroable, DAG, V1InUse, V2InUse);
14644 }
14645
14646 // We can always bit-blend if we have to, so the fallback strategy is to
14647 // decompose into single-input permutes and blends/unpacks.
14648 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14649 Zeroable, Subtarget, DAG);
14650}
14651
14652/// Lower 8-lane 16-bit floating point shuffles.
14653 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14654 const APInt &Zeroable, SDValue V1, SDValue V2,
14655 const X86Subtarget &Subtarget,
14656 SelectionDAG &DAG) {
14657 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14658 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14659 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14660 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14661
14662 if (Subtarget.hasFP16()) {
14663 if (NumV2Elements == 0) {
14664 // Check for being able to broadcast a single element.
14665 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14666 Mask, Subtarget, DAG))
14667 return Broadcast;
14668 }
14669 if (NumV2Elements == 1 && Mask[0] >= 8)
14670 if (SDValue V = lowerShuffleAsElementInsertion(
14671 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14672 return V;
14673 }
14674
14675 V1 = DAG.getBitcast(MVT::v8i16, V1);
14676 V2 = DAG.getBitcast(MVT::v8i16, V2);
14677 return DAG.getBitcast(MVT::v8f16,
14678 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14679}
14680
14681 // Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14682// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14683// the active subvector is extracted.
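// For example, a two-input v16i8 shuffle on a non-VLX AVX512VBMI target is
// widened to v64i8, the mask indices referring to V2 are rebased by 48 (the
// three padding chunks of 16 elements), a VPERMV3 node is built on the wide
// type, and the low 128 bits are extracted as the result.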
14684 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14685 ArrayRef<int> OriginalMask, SDValue V1,
14686 SDValue V2, const X86Subtarget &Subtarget,
14687 SelectionDAG &DAG) {
14688 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14689 SmallVector<int, 32> Mask(OriginalMask);
14690 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14691 !isShuffleFoldableLoad(V2)) {
14692 ShuffleVectorSDNode::commuteMask(Mask);
14693 std::swap(V1, V2);
14694 }
14695
14696 MVT MaskVT = VT.changeTypeToInteger();
14697 SDValue MaskNode;
14698 MVT ShuffleVT = VT;
14699 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14700 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14701 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14702 ShuffleVT = V1.getSimpleValueType();
14703
14704 // Adjust mask to correct indices for the second input.
14705 int NumElts = VT.getVectorNumElements();
14706 unsigned Scale = 512 / VT.getSizeInBits();
14707 SmallVector<int, 32> AdjustedMask(Mask);
14708 for (int &M : AdjustedMask)
14709 if (NumElts <= M)
14710 M += (Scale - 1) * NumElts;
14711 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14712 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14713 } else {
14714 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14715 }
14716
14717 SDValue Result;
14718 if (V2.isUndef())
14719 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14720 else
14721 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14722
14723 if (VT != ShuffleVT)
14724 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14725
14726 return Result;
14727}
14728
14729/// Generic lowering of v16i8 shuffles.
14730///
14731/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14732/// detect any complexity reducing interleaving. If that doesn't help, it uses
14733/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14734/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14735/// back together.
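/// For example, the byte interleave <0,16,1,17,...,7,23> is caught by the
/// dedicated UNPCK check and lowers to a single PUNPCKLBW of the two inputs.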
14736 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14737 const APInt &Zeroable, SDValue V1, SDValue V2,
14738 const X86Subtarget &Subtarget,
14739 SelectionDAG &DAG) {
14740 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14741 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14742 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14743
14744 // Try to use shift instructions.
14745 if (SDValue Shift =
14746 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14747 DAG, /*BitwiseOnly*/ false))
14748 return Shift;
14749
14750 // Try to use byte rotation instructions.
14751 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14752 Subtarget, DAG))
14753 return Rotate;
14754
14755 // Use dedicated pack instructions for masks that match their pattern.
14756 if (SDValue V =
14757 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14758 return V;
14759
14760 // Try to use a zext lowering.
14761 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14762 Zeroable, Subtarget, DAG))
14763 return ZExt;
14764
14765 // Try to lower using a truncation.
14766 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14767 Subtarget, DAG))
14768 return V;
14769
14770 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14771 Subtarget, DAG))
14772 return V;
14773
14774 // See if we can use SSE4A Extraction / Insertion.
14775 if (Subtarget.hasSSE4A())
14776 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14777 Zeroable, DAG))
14778 return V;
14779
14780 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14781
14782 // For single-input shuffles, there are some nicer lowering tricks we can use.
14783 if (NumV2Elements == 0) {
14784 // Check for being able to broadcast a single element.
14785 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14786 Mask, Subtarget, DAG))
14787 return Broadcast;
14788
14789 // Try to use bit rotation instructions.
14790 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14791 Subtarget, DAG))
14792 return Rotate;
14793
14794 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14795 return V;
14796
14797 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14798 // Notably, this handles splat and partial-splat shuffles more efficiently.
14799 // However, it only makes sense if the pre-duplication shuffle simplifies
14800 // things significantly. Currently, this means we need to be able to
14801 // express the pre-duplication shuffle as an i16 shuffle.
14802 //
14803 // FIXME: We should check for other patterns which can be widened into an
14804 // i16 shuffle as well.
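// For example, a mask like <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7> needs only an
// identity pre-duplication i16 shuffle and reduces to a PUNPCKLBW of the
// input with itself.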
14805 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14806 for (int i = 0; i < 16; i += 2)
14807 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14808 return false;
14809
14810 return true;
14811 };
14812 auto tryToWidenViaDuplication = [&]() -> SDValue {
14813 if (!canWidenViaDuplication(Mask))
14814 return SDValue();
14815 SmallVector<int, 4> LoInputs;
14816 copy_if(Mask, std::back_inserter(LoInputs),
14817 [](int M) { return M >= 0 && M < 8; });
14818 array_pod_sort(LoInputs.begin(), LoInputs.end());
14819 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14820 SmallVector<int, 4> HiInputs;
14821 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14822 array_pod_sort(HiInputs.begin(), HiInputs.end());
14823 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14824
14825 bool TargetLo = LoInputs.size() >= HiInputs.size();
14826 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14827 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14828
14829 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14830 SmallDenseMap<int, int, 8> LaneMap;
14831 for (int I : InPlaceInputs) {
14832 PreDupI16Shuffle[I/2] = I/2;
14833 LaneMap[I] = I;
14834 }
14835 int j = TargetLo ? 0 : 4, je = j + 4;
14836 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14837 // Check if j is already a shuffle of this input. This happens when
14838 // there are two adjacent bytes after we move the low one.
14839 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14840 // If we haven't yet mapped the input, search for a slot into which
14841 // we can map it.
14842 while (j < je && PreDupI16Shuffle[j] >= 0)
14843 ++j;
14844
14845 if (j == je)
14846 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14847 return SDValue();
14848
14849 // Map this input with the i16 shuffle.
14850 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14851 }
14852
14853 // Update the lane map based on the mapping we ended up with.
14854 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14855 }
14856 V1 = DAG.getBitcast(
14857 MVT::v16i8,
14858 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14859 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14860
14861 // Unpack the bytes to form the i16s that will be shuffled into place.
14862 bool EvenInUse = false, OddInUse = false;
14863 for (int i = 0; i < 16; i += 2) {
14864 EvenInUse |= (Mask[i + 0] >= 0);
14865 OddInUse |= (Mask[i + 1] >= 0);
14866 if (EvenInUse && OddInUse)
14867 break;
14868 }
14869 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14870 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14871 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14872
14873 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14874 for (int i = 0; i < 16; ++i)
14875 if (Mask[i] >= 0) {
14876 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14877 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14878 if (PostDupI16Shuffle[i / 2] < 0)
14879 PostDupI16Shuffle[i / 2] = MappedMask;
14880 else
14881 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14882 "Conflicting entries in the original shuffle!");
14883 }
14884 return DAG.getBitcast(
14885 MVT::v16i8,
14886 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14887 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14888 };
14889 if (SDValue V = tryToWidenViaDuplication())
14890 return V;
14891 }
14892
14893 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14894 Zeroable, Subtarget, DAG))
14895 return Masked;
14896
14897 // Use dedicated unpack instructions for masks that match their pattern.
14898 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14899 return V;
14900
14901 // Try to use byte shift instructions to mask.
14902 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14903 Zeroable, Subtarget, DAG))
14904 return V;
14905
14906 // Check for compaction patterns.
14907 bool IsSingleInput = V2.isUndef();
14908 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14909
14910 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14911 // with PSHUFB. It is important to do this before we attempt to generate any
14912 // blends but after all of the single-input lowerings. If the single input
14913 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14914 // want to preserve that and we can DAG combine any longer sequences into
14915 // a PSHUFB in the end. But once we start blending from multiple inputs,
14916 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14917 // and there are *very* few patterns that would actually be faster than the
14918 // PSHUFB approach because of its ability to zero lanes.
14919 //
14920 // If the mask is a binary compaction, we can more efficiently perform this
14921 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14922 //
14923 // FIXME: The only exceptions to the above are blends which are exact
14924 // interleavings with direct instructions supporting them. We currently don't
14925 // handle those well here.
14926 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14927 bool V1InUse = false;
14928 bool V2InUse = false;
14929
14930 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14931 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14932
14933 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14934 // do so. This avoids using them to handle blends-with-zero which is
14935 // important as a single pshufb is significantly faster for that.
14936 if (V1InUse && V2InUse) {
14937 if (Subtarget.hasSSE41())
14938 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14939 Zeroable, Subtarget, DAG))
14940 return Blend;
14941
14942 // We can use an unpack to do the blending rather than an or in some
14943 // cases. Even though the or may be (very minorly) more efficient, we
14944 // prefer this lowering because there are common cases where part of
14945 // the complexity of the shuffles goes away when we do the final blend as
14946 // an unpack.
14947 // FIXME: It might be worth trying to detect if the unpack-feeding
14948 // shuffles will both be pshufb, in which case we shouldn't bother with
14949 // this.
14950 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14951 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14952 return Unpack;
14953
14954 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14955 if (Subtarget.hasVBMI())
14956 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14957 DAG);
14958
14959 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14960 if (Subtarget.hasXOP()) {
14961 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14962 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14963 }
14964
14965 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14966 // PALIGNR will be cheaper than the second PSHUFB+OR.
14967 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14968 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14969 return V;
14970 }
14971
14972 return PSHUFB;
14973 }
14974
14975 // There are special ways we can lower some single-element blends.
14976 if (NumV2Elements == 1)
14977 if (SDValue V = lowerShuffleAsElementInsertion(
14978 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14979 return V;
14980
14981 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14982 return Blend;
14983
14984 // Check whether a compaction lowering can be done. This handles shuffles
14985 // which take every Nth element for some even N. See the helper function for
14986 // details.
14987 //
14988 // We special case these as they can be particularly efficiently handled with
14989 // the PACKUSWB instruction on x86 and they show up in common patterns of
14990 // rearranging bytes to truncate wide elements.
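// For example, <0,2,4,...,30> (every even byte of both inputs) becomes
// PACKUSWB(AND(V1, 0x00ff), AND(V2, 0x00ff)), i.e. the NumEvenDrops == 1 case
// below.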
14991 if (NumEvenDrops) {
14992 // NumEvenDrops is the log2 of the stride between kept elements. Another way of
14993 // thinking about it is that we need to drop the even elements this many
14994 // times to get the original input.
14995
14996 // First we need to zero all the dropped bytes.
14997 assert(NumEvenDrops <= 3 &&
14998 "No support for dropping even elements more than 3 times.");
14999 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15000 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15001 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15002 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15003 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15004 WordClearMask);
15005 if (!IsSingleInput)
15006 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15007 WordClearMask);
15008
15009 // Now pack things back together.
15010 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15011 IsSingleInput ? V1 : V2);
15012 for (int i = 1; i < NumEvenDrops; ++i) {
15013 Result = DAG.getBitcast(MVT::v8i16, Result);
15014 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15015 }
15016 return Result;
15017 }
15018
15019 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15020 if (NumOddDrops == 1) {
15021 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15022 DAG.getBitcast(MVT::v8i16, V1),
15023 DAG.getTargetConstant(8, DL, MVT::i8));
15024 if (!IsSingleInput)
15025 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15026 DAG.getBitcast(MVT::v8i16, V2),
15027 DAG.getTargetConstant(8, DL, MVT::i8));
15028 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15029 IsSingleInput ? V1 : V2);
15030 }
15031
15032 // Handle multi-input cases by blending/unpacking single-input shuffles.
15033 if (NumV2Elements > 0)
15034 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15035 Zeroable, Subtarget, DAG);
15036
15037 // The fallback path for single-input shuffles widens this into two v8i16
15038 // vectors with unpacks, shuffles those, and then pulls them back together
15039 // with a pack.
15040 SDValue V = V1;
15041
15042 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15043 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15044 for (int i = 0; i < 16; ++i)
15045 if (Mask[i] >= 0)
15046 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15047
15048 SDValue VLoHalf, VHiHalf;
15049 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15050 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15051 // i16s.
15052 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15053 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15054 // Use a mask to drop the high bytes.
15055 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15056 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15057 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15058
15059 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15060 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15061
15062 // Squash the masks to point directly into VLoHalf.
15063 for (int &M : LoBlendMask)
15064 if (M >= 0)
15065 M /= 2;
15066 for (int &M : HiBlendMask)
15067 if (M >= 0)
15068 M /= 2;
15069 } else {
15070 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15071 // VHiHalf so that we can blend them as i16s.
15072 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15073
15074 VLoHalf = DAG.getBitcast(
15075 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15076 VHiHalf = DAG.getBitcast(
15077 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15078 }
15079
15080 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15081 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15082
15083 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15084}
15085
15086/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15087///
15088/// This routine breaks down the specific type of 128-bit shuffle and
15089/// dispatches to the lowering routines accordingly.
15090 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15091 MVT VT, SDValue V1, SDValue V2,
15092 const APInt &Zeroable,
15093 const X86Subtarget &Subtarget,
15094 SelectionDAG &DAG) {
15095 if (VT == MVT::v8bf16) {
15096 V1 = DAG.getBitcast(MVT::v8i16, V1);
15097 V2 = DAG.getBitcast(MVT::v8i16, V2);
15098 return DAG.getBitcast(VT,
15099 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15100 }
15101
15102 switch (VT.SimpleTy) {
15103 case MVT::v2i64:
15104 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v2f64:
15106 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v4i32:
15108 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v4f32:
15110 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v8i16:
15112 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v8f16:
15114 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115 case MVT::v16i8:
15116 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15117
15118 default:
15119 llvm_unreachable("Unimplemented!");
15120 }
15121}
15122
15123/// Generic routine to split vector shuffle into half-sized shuffles.
15124///
15125/// This routine just extracts two subvectors, shuffles them independently, and
15126/// then concatenates them back together. This should work effectively with all
15127/// AVX vector shuffle types.
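/// For example, a v8i32 shuffle <0,1,2,3,8,9,10,11> becomes the low half of V1
/// concatenated with the low half of V2, which can then be matched as a single
/// 128-bit subvector insertion.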
15128 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15129 SDValue V2, ArrayRef<int> Mask,
15130 SelectionDAG &DAG, bool SimpleOnly) {
15131 assert(VT.getSizeInBits() >= 256 &&
15132 "Only for 256-bit or wider vector shuffles!");
15133 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15134 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15135
15136 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15137 if (VT == MVT::v8f32) {
15138 SDValue BC1 = peekThroughBitcasts(V1);
15139 SDValue BC2 = peekThroughBitcasts(V2);
15140 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15141 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15142 DAG, SimpleOnly))
15143 return DAG.getBitcast(VT, Split);
15144 }
15145 }
15146
15147 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15148 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15149
15150 int NumElements = VT.getVectorNumElements();
15151 int SplitNumElements = NumElements / 2;
15152 MVT ScalarVT = VT.getVectorElementType();
15153 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15154
15155 // Use splitVector/extractSubVector so that split build-vectors just build two
15156 // narrower build vectors. This helps shuffling with splats and zeros.
15157 auto SplitVector = [&](SDValue V) {
15158 SDValue LoV, HiV;
15159 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15160 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15161 DAG.getBitcast(SplitVT, HiV));
15162 };
15163
15164 SDValue LoV1, HiV1, LoV2, HiV2;
15165 std::tie(LoV1, HiV1) = SplitVector(V1);
15166 std::tie(LoV2, HiV2) = SplitVector(V2);
15167
15168 // Now create two 4-way blends of these half-width vectors.
15169 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15170 bool &UseHiV1, bool &UseLoV2,
15171 bool &UseHiV2) {
15172 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15173 for (int i = 0; i < SplitNumElements; ++i) {
15174 int M = HalfMask[i];
15175 if (M >= NumElements) {
15176 if (M >= NumElements + SplitNumElements)
15177 UseHiV2 = true;
15178 else
15179 UseLoV2 = true;
15180 } else if (M >= 0) {
15181 if (M >= SplitNumElements)
15182 UseHiV1 = true;
15183 else
15184 UseLoV1 = true;
15185 }
15186 }
15187 };
15188
15189 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15190 if (!SimpleOnly)
15191 return true;
15192
15193 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15194 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15195
15196 return !(UseHiV1 || UseHiV2);
15197 };
15198
15199 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15200 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15201 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15202 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15203 for (int i = 0; i < SplitNumElements; ++i) {
15204 int M = HalfMask[i];
15205 if (M >= NumElements) {
15206 V2BlendMask[i] = M - NumElements;
15207 BlendMask[i] = SplitNumElements + i;
15208 } else if (M >= 0) {
15209 V1BlendMask[i] = M;
15210 BlendMask[i] = i;
15211 }
15212 }
15213
15214 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15215 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15216
15217 // Because the lowering happens after all combining takes place, we need to
15218 // manually combine these blend masks as much as possible so that we create
15219 // a minimal number of high-level vector shuffle nodes.
15220 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15221
15222 // First try just blending the halves of V1 or V2.
15223 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15224 return DAG.getUNDEF(SplitVT);
15225 if (!UseLoV2 && !UseHiV2)
15226 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15227 if (!UseLoV1 && !UseHiV1)
15228 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15229
15230 SDValue V1Blend, V2Blend;
15231 if (UseLoV1 && UseHiV1) {
15232 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15233 } else {
15234 // We only use half of V1 so map the usage down into the final blend mask.
15235 V1Blend = UseLoV1 ? LoV1 : HiV1;
15236 for (int i = 0; i < SplitNumElements; ++i)
15237 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15238 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15239 }
15240 if (UseLoV2 && UseHiV2) {
15241 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15242 } else {
15243 // We only use half of V2 so map the usage down into the final blend mask.
15244 V2Blend = UseLoV2 ? LoV2 : HiV2;
15245 for (int i = 0; i < SplitNumElements; ++i)
15246 if (BlendMask[i] >= SplitNumElements)
15247 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15248 }
15249 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15250 };
15251
15252 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15253 return SDValue();
15254
15255 SDValue Lo = HalfBlend(LoMask);
15256 SDValue Hi = HalfBlend(HiMask);
15257 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15258}
15259
15260/// Either split a vector in halves or decompose the shuffles and the
15261/// blend/unpack.
15262///
15263/// This is provided as a good fallback for many lowerings of non-single-input
15264/// shuffles with more than one 128-bit lane. In those cases, we want to select
15265/// between splitting the shuffle into 128-bit components and stitching those
15266/// back together vs. extracting the single-input shuffles and blending those
15267/// results.
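/// For example, the v4f64 mask <0,4,0,4> references one element of each input,
/// so it takes the decompose-and-blend path (typically two broadcasts plus a
/// blend, which can fold loads); a mask whose used elements all come from one
/// 128-bit lane of each input is split instead.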
15268 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15269 SDValue V2, ArrayRef<int> Mask,
15270 const APInt &Zeroable,
15271 const X86Subtarget &Subtarget,
15272 SelectionDAG &DAG) {
15273 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15274 "shuffles as it could then recurse on itself.");
15275 int Size = Mask.size();
15276
15277 // If this can be modeled as a broadcast of two elements followed by a blend,
15278 // prefer that lowering. This is especially important because broadcasts can
15279 // often fold with memory operands.
15280 auto DoBothBroadcast = [&] {
15281 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15282 for (int M : Mask)
15283 if (M >= Size) {
15284 if (V2BroadcastIdx < 0)
15285 V2BroadcastIdx = M - Size;
15286 else if ((M - Size) != V2BroadcastIdx &&
15287 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15288 return false;
15289 } else if (M >= 0) {
15290 if (V1BroadcastIdx < 0)
15291 V1BroadcastIdx = M;
15292 else if (M != V1BroadcastIdx &&
15293 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15294 return false;
15295 }
15296 return true;
15297 };
15298 if (DoBothBroadcast())
15299 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15300 Subtarget, DAG);
15301
15302 // If the inputs all stem from a single 128-bit lane of each input, then we
15303 // split them rather than blending because the split will decompose to
15304 // unusually few instructions.
15305 int LaneCount = VT.getSizeInBits() / 128;
15306 int LaneSize = Size / LaneCount;
15307 SmallBitVector LaneInputs[2];
15308 LaneInputs[0].resize(LaneCount, false);
15309 LaneInputs[1].resize(LaneCount, false);
15310 for (int i = 0; i < Size; ++i)
15311 if (Mask[i] >= 0)
15312 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15313 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15314 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15315 /*SimpleOnly*/ false);
15316
15317 // Without AVX2, if we can freely split the subvectors then we're better off
15318 // performing half width shuffles.
15319 if (!Subtarget.hasAVX2()) {
15320 SDValue BC1 = peekThroughBitcasts(V1);
15321 SDValue BC2 = peekThroughBitcasts(V2);
15322 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15323 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15324 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15325 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15326 if (SplatOrSplitV1 && SplatOrSplitV2)
15327 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15328 /*SimpleOnly*/ false);
15329 }
15330
15331 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15332 // requires that the decomposed single-input shuffles don't end up here.
15333 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15334 Subtarget, DAG);
15335}
15336
15337// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15338// TODO: Extend to support v8f32 (+ 512-bit shuffles).
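// For example, the v4f64 mask <2,7,0,5> becomes SHUFPD of the lane-swapping
// shuffles <2,u,0,u> (from V1) and <u,7,u,5> (from V2), each of which is just
// a 128-bit lane swap of one input, with SHUFPD immediate 0b1010.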
15339 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15340 SDValue V1, SDValue V2,
15341 ArrayRef<int> Mask,
15342 SelectionDAG &DAG) {
15343 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15344
15345 int LHSMask[4] = {-1, -1, -1, -1};
15346 int RHSMask[4] = {-1, -1, -1, -1};
15347 int SHUFPDMask[4] = {-1, -1, -1, -1};
15348
15349 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15350 // perform the shuffle once the lanes have been shuffled in place.
15351 for (int i = 0; i != 4; ++i) {
15352 int M = Mask[i];
15353 if (M < 0)
15354 continue;
15355 int LaneBase = i & ~1;
15356 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15357 LaneMask[LaneBase + (M & 1)] = M;
15358 SHUFPDMask[i] = M & 1;
15359 }
15360
15361 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15362 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15363 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15364 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15365}
15366
15367/// Lower a vector shuffle crossing multiple 128-bit lanes as
15368/// a lane permutation followed by a per-lane permutation.
15369///
15370/// This is mainly for cases where we can have non-repeating permutes
15371/// in each lane.
15372///
15373/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15374/// we should investigate merging them.
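/// For example, reversing a v8f32 vector (mask <7,6,5,4,3,2,1,0>) can be done
/// as a cross-lane permute that swaps the two 128-bit lanes followed by an
/// in-lane permute <3,2,1,0,7,6,5,4> that reverses each lane.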
15375 static SDValue lowerShuffleAsLanePermuteAndPermute(
15376 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15377 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15378 int NumElts = VT.getVectorNumElements();
15379 int NumLanes = VT.getSizeInBits() / 128;
15380 int NumEltsPerLane = NumElts / NumLanes;
15381 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15382
15383 /// Attempts to find a sublane permute with the given size
15384 /// that gets all elements into their target lanes.
15385 ///
15386 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15387 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
15388 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15389 int NumSublanesPerLane = NumSublanes / NumLanes;
15390 int NumEltsPerSublane = NumElts / NumSublanes;
15391
15392 SmallVector<int, 16> CrossLaneMask;
15393 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15394 // CrossLaneMask but one entry == one sublane.
15395 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15396 APInt DemandedCrossLane = APInt::getZero(NumElts);
15397
15398 for (int i = 0; i != NumElts; ++i) {
15399 int M = Mask[i];
15400 if (M < 0)
15401 continue;
15402
15403 int SrcSublane = M / NumEltsPerSublane;
15404 int DstLane = i / NumEltsPerLane;
15405
15406 // We only need to get the elements into the right lane, not sublane.
15407 // So search all sublanes that make up the destination lane.
15408 bool Found = false;
15409 int DstSubStart = DstLane * NumSublanesPerLane;
15410 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15411 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15412 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15413 continue;
15414
15415 Found = true;
15416 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15417 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15418 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15419 DemandedCrossLane.setBit(InLaneMask[i]);
15420 break;
15421 }
15422 if (!Found)
15423 return SDValue();
15424 }
15425
15426 // Fill CrossLaneMask using CrossLaneMaskLarge.
15427 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15428
15429 if (!CanUseSublanes) {
15430 // If we're only shuffling a single lowest lane and the rest are identity
15431 // then don't bother.
15432 // TODO - isShuffleMaskInputInPlace could be extended to something like
15433 // this.
15434 int NumIdentityLanes = 0;
15435 bool OnlyShuffleLowestLane = true;
15436 for (int i = 0; i != NumLanes; ++i) {
15437 int LaneOffset = i * NumEltsPerLane;
15438 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15439 i * NumEltsPerLane))
15440 NumIdentityLanes++;
15441 else if (CrossLaneMask[LaneOffset] != 0)
15442 OnlyShuffleLowestLane = false;
15443 }
15444 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15445 return SDValue();
15446 }
15447
15448 // Simplify CrossLaneMask based on the actual demanded elements.
15449 if (V1.hasOneUse())
15450 for (int i = 0; i != NumElts; ++i)
15451 if (!DemandedCrossLane[i])
15452 CrossLaneMask[i] = SM_SentinelUndef;
15453
15454 // Avoid returning the same shuffle operation. For example,
15455 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15456 // undef:v16i16
15457 if (CrossLaneMask == Mask || InLaneMask == Mask)
15458 return SDValue();
15459
15460 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15461 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15462 InLaneMask);
15463 };
15464
15465 // First attempt a solution with full lanes.
15466 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15467 return V;
15468
15469 // The rest of the solutions use sublanes.
15470 if (!CanUseSublanes)
15471 return SDValue();
15472
15473 // Then attempt a solution with 64-bit sublanes (vpermq).
15474 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15475 return V;
15476
15477 // If that doesn't work and we have fast variable cross-lane shuffle,
15478 // attempt 32-bit sublanes (vpermd).
15479 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15480 return SDValue();
15481
15482 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15483}
15484
15486 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15486static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15487 SmallVector<int> &InLaneMask) {
15488 int Size = Mask.size();
15489 InLaneMask.assign(Mask.begin(), Mask.end());
15490 for (int i = 0; i < Size; ++i) {
15491 int &M = InLaneMask[i];
15492 if (M < 0)
15493 continue;
15494 if (((M % Size) / LaneSize) != (i / LaneSize))
15495 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15496 }
15497}
15498
15499/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15500/// source with a lane permutation.
15501///
15502/// This lowering strategy results in four instructions in the worst case for a
15503 /// single-input cross lane shuffle, which is fewer than any other fully general
15504/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15505/// shuffle pattern should be handled prior to trying this lowering.
15506 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15507 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15508 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15509 // FIXME: This should probably be generalized for 512-bit vectors as well.
15510 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15511 int Size = Mask.size();
15512 int LaneSize = Size / 2;
15513
15514 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15515 // Only do this if the elements aren't all from the lower lane,
15516 // otherwise we're (probably) better off doing a split.
15517 if (VT == MVT::v4f64 &&
15518 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15519 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15520
15521 // If there are only inputs from one 128-bit lane, splitting will in fact be
15522 // less expensive. The flags track whether the given lane contains an element
15523 // that crosses to another lane.
15524 bool AllLanes;
15525 if (!Subtarget.hasAVX2()) {
15526 bool LaneCrossing[2] = {false, false};
15527 for (int i = 0; i < Size; ++i)
15528 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15529 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15530 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15531 } else {
15532 bool LaneUsed[2] = {false, false};
15533 for (int i = 0; i < Size; ++i)
15534 if (Mask[i] >= 0)
15535 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15536 AllLanes = LaneUsed[0] && LaneUsed[1];
15537 }
15538
15539 // TODO - we could support shuffling V2 in the Flipped input.
15540 assert(V2.isUndef() &&
15541 "This last part of this routine only works on single input shuffles");
15542
15543 SmallVector<int> InLaneMask;
15544 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15545
15546 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15547 "In-lane shuffle mask expected");
15548
15549 // If we're not using both lanes in each lane and the inlane mask is not
15550 // repeating, then we're better off splitting.
15551 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15552 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15553 /*SimpleOnly*/ false);
15554
15555 // Flip the lanes, and shuffle the results which should now be in-lane.
15556 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15557 SDValue Flipped = DAG.getBitcast(PVT, V1);
15558 Flipped =
15559 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15560 Flipped = DAG.getBitcast(VT, Flipped);
15561 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15562}
15563
15564/// Handle lowering 2-lane 128-bit shuffles.
15565 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15566 SDValue V2, ArrayRef<int> Mask,
15567 const APInt &Zeroable,
15568 const X86Subtarget &Subtarget,
15569 SelectionDAG &DAG) {
15570 if (V2.isUndef()) {
15571 // Attempt to match VBROADCAST*128 subvector broadcast load.
15572 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15573 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15574 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15575 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15576 MVT MemVT = VT.getHalfNumVectorElementsVT();
15577 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15578 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15579 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15580 VT, MemVT, Ld, Ofs, DAG))
15581 return BcstLd;
15582 }
15583
15584 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15585 if (Subtarget.hasAVX2())
15586 return SDValue();
15587 }
15588
15589 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15590
15591 SmallVector<int, 4> WidenedMask;
15592 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15593 return SDValue();
15594
15595 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15596 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15597
15598 // Try to use an insert into a zero vector.
15599 if (WidenedMask[0] == 0 && IsHighZero) {
15600 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15601 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15602 DAG.getVectorIdxConstant(0, DL));
15603 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15604 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15605 DAG.getVectorIdxConstant(0, DL));
15606 }
15607
15608 // TODO: If minimizing size and one of the inputs is a zero vector and the
15609 // zero vector has only one use, we could use a VPERM2X128 to save the
15610 // instruction bytes needed to explicitly generate the zero vector.
15611
15612 // Blends are faster and handle all the non-lane-crossing cases.
15613 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15614 Subtarget, DAG))
15615 return Blend;
15616
15617 // If either input operand is a zero vector, use VPERM2X128 because its mask
15618 // allows us to replace the zero input with an implicit zero.
15619 if (!IsLowZero && !IsHighZero) {
15620 // Check for patterns which can be matched with a single insert of a 128-bit
15621 // subvector.
15622 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15623 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15624
15625 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15626 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15627 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15628 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15629 SDValue SubVec =
15630 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15631 DAG.getVectorIdxConstant(0, DL));
15632 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15633 DAG.getVectorIdxConstant(2, DL));
15634 }
15635 }
15636
15637 // Try to use SHUF128 if possible.
15638 if (Subtarget.hasVLX()) {
15639 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15640 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15641 ((WidenedMask[1] % 2) << 1);
15642 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15643 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15644 }
15645 }
15646 }
15647
15648 // Otherwise form a 128-bit permutation. After accounting for undefs,
15649 // convert the 64-bit shuffle mask selection values into 128-bit
15650 // selection bits by dividing the indexes by 2 and shifting into positions
15651 // defined by a vperm2*128 instruction's immediate control byte.
15652
15653 // The immediate permute control byte looks like this:
15654 // [1:0] - select 128 bits from sources for low half of destination
15655 // [2] - ignore
15656 // [3] - zero low half of destination
15657 // [5:4] - select 128 bits from sources for high half of destination
15658 // [6] - ignore
15659 // [7] - zero high half of destination
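// Worked example (illustrative values): for v4f64, Mask <2,3,4,5> widens to
// WidenedMask <1,2> (hi(V1), lo(V2)), giving
//   PermMask = (1 << 0) | (2 << 4) = 0x21   i.e. vperm2f128 $0x21
// If instead the high half were zeroable, e.g. Mask <2,3,z,z>, bit 7 is set:
//   PermMask = 1 | 0x80 = 0x81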
15660
15661 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15662 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15663
15664 unsigned PermMask = 0;
15665 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15666 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15667
15668 // Check the immediate mask and replace unused sources with undef.
15669 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15670 V1 = DAG.getUNDEF(VT);
15671 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15672 V2 = DAG.getUNDEF(VT);
15673
15674 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15675 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15676}
15677
15678/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15679/// shuffling each lane.
15680///
15681/// This attempts to create a repeated lane shuffle where each lane uses one
15682/// or two of the lanes of the inputs. The lanes of the input vectors are
15683/// shuffled in one or two independent shuffles to get the lanes into the
15684/// position needed by the final shuffle.
15685 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15686 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15687 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15688 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15689
15690 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15691 return SDValue();
15692
15693 int NumElts = Mask.size();
15694 int NumLanes = VT.getSizeInBits() / 128;
15695 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15696 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15697 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15698
15699 // First pass will try to fill in the RepeatMask from lanes that need two
15700 // sources.
15701 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15702 int Srcs[2] = {-1, -1};
15703 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15704 for (int i = 0; i != NumLaneElts; ++i) {
15705 int M = Mask[(Lane * NumLaneElts) + i];
15706 if (M < 0)
15707 continue;
15708 // Determine which of the possible input lanes (NumLanes from each source)
15709 // this element comes from. Assign that as one of the sources for this
15710 // lane. We can assign up to 2 sources for this lane. If we run out of
15711 // sources we can't do anything.
15712 int LaneSrc = M / NumLaneElts;
15713 int Src;
15714 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15715 Src = 0;
15716 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15717 Src = 1;
15718 else
15719 return SDValue();
15720
15721 Srcs[Src] = LaneSrc;
15722 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15723 }
15724
15725 // If this lane has two sources, see if it fits with the repeat mask so far.
15726 if (Srcs[1] < 0)
15727 continue;
15728
15729 LaneSrcs[Lane][0] = Srcs[0];
15730 LaneSrcs[Lane][1] = Srcs[1];
15731
15732 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15733 assert(M1.size() == M2.size() && "Unexpected mask size");
15734 for (int i = 0, e = M1.size(); i != e; ++i)
15735 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15736 return false;
15737 return true;
15738 };
15739
15740 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15741 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15742 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15743 int M = Mask[i];
15744 if (M < 0)
15745 continue;
15746 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15747 "Unexpected mask element");
15748 MergedMask[i] = M;
15749 }
15750 };
15751
15752 if (MatchMasks(InLaneMask, RepeatMask)) {
15753 // Merge this lane mask into the final repeat mask.
15754 MergeMasks(InLaneMask, RepeatMask);
15755 continue;
15756 }
15757
15758 // Didn't find a match. Swap the operands and try again.
15759 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15760 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, InLaneMask.size());
15761
15762 if (MatchMasks(InLaneMask, RepeatMask)) {
15763 // Merge this lane mask into the final repeat mask.
15764 MergeMasks(InLaneMask, RepeatMask);
15765 continue;
15766 }
15767
15768 // Couldn't find a match with the operands in either order.
15769 return SDValue();
15770 }
15771
15772 // Now handle any lanes with only one source.
15773 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15774 // If this lane has already been processed, skip it.
15775 if (LaneSrcs[Lane][0] >= 0)
15776 continue;
15777
15778 for (int i = 0; i != NumLaneElts; ++i) {
15779 int M = Mask[(Lane * NumLaneElts) + i];
15780 if (M < 0)
15781 continue;
15782
15783 // If RepeatMask isn't defined yet we can define it ourselves.
15784 if (RepeatMask[i] < 0)
15785 RepeatMask[i] = M % NumLaneElts;
15786
15787 if (RepeatMask[i] < NumElts) {
15788 if (RepeatMask[i] != M % NumLaneElts)
15789 return SDValue();
15790 LaneSrcs[Lane][0] = M / NumLaneElts;
15791 } else {
15792 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15793 return SDValue();
15794 LaneSrcs[Lane][1] = M / NumLaneElts;
15795 }
15796 }
15797
15798 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15799 return SDValue();
15800 }
15801
15802 SmallVector<int, 16> NewMask(NumElts, -1);
15803 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15804 int Src = LaneSrcs[Lane][0];
15805 for (int i = 0; i != NumLaneElts; ++i) {
15806 int M = -1;
15807 if (Src >= 0)
15808 M = Src * NumLaneElts + i;
15809 NewMask[Lane * NumLaneElts + i] = M;
15810 }
15811 }
15812 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15813 // Ensure we didn't get back the shuffle we started with.
15814 // FIXME: This is a hack to make up for some splat handling code in
15815 // getVectorShuffle.
15816 if (isa<ShuffleVectorSDNode>(NewV1) &&
15817 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15818 return SDValue();
15819
15820 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15821 int Src = LaneSrcs[Lane][1];
15822 for (int i = 0; i != NumLaneElts; ++i) {
15823 int M = -1;
15824 if (Src >= 0)
15825 M = Src * NumLaneElts + i;
15826 NewMask[Lane * NumLaneElts + i] = M;
15827 }
15828 }
15829 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15830 // Ensure we didn't get back the shuffle we started with.
15831 // FIXME: This is a hack to make up for some splat handling code in
15832 // getVectorShuffle.
15833 if (isa<ShuffleVectorSDNode>(NewV2) &&
15834 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15835 return SDValue();
15836
15837 for (int i = 0; i != NumElts; ++i) {
15838 if (Mask[i] < 0) {
15839 NewMask[i] = -1;
15840 continue;
15841 }
15842 NewMask[i] = RepeatMask[i % NumLaneElts];
15843 if (NewMask[i] < 0)
15844 continue;
15845
15846 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15847 }
15848 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15849}
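// Worked example (illustrative values): for v8f32 with
//   Mask = <4,12,5,13,0,8,1,9>
// each 128-bit destination lane uses the in-lane pattern <0,8,1,9> (unpcklps),
// but draws it from the opposite source lane. The routine above produces:
//   NewV1  = shuffle V1, V2, <4,5,6,7,0,1,2,3>       ; V1 with lanes swapped
//   NewV2  = shuffle V1, V2, <12,13,14,15,8,9,10,11> ; V2 with lanes swapped
//   result = shuffle NewV1, NewV2, <0,8,1,9,4,12,5,13>
// i.e. two lane permutes followed by a single repeated-lane unpack.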
15850
15851/// If the input shuffle mask results in a vector that is undefined in all upper
15852/// or lower half elements and that mask accesses only 2 halves of the
15853/// shuffle's operands, return true. A mask of half the width with mask indexes
15854/// adjusted to access the extracted halves of the original shuffle operands is
15855/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15856/// lower half of each input operand is accessed.
15857static bool
15858 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15859 int &HalfIdx1, int &HalfIdx2) {
15860 assert((Mask.size() == HalfMask.size() * 2) &&
15861 "Expected input mask to be twice as long as output");
15862
15863 // Exactly one half of the result must be undef to allow narrowing.
15864 bool UndefLower = isUndefLowerHalf(Mask);
15865 bool UndefUpper = isUndefUpperHalf(Mask);
15866 if (UndefLower == UndefUpper)
15867 return false;
15868
15869 unsigned HalfNumElts = HalfMask.size();
15870 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15871 HalfIdx1 = -1;
15872 HalfIdx2 = -1;
15873 for (unsigned i = 0; i != HalfNumElts; ++i) {
15874 int M = Mask[i + MaskIndexOffset];
15875 if (M < 0) {
15876 HalfMask[i] = M;
15877 continue;
15878 }
15879
15880 // Determine which of the 4 half vectors this element is from.
15881 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15882 int HalfIdx = M / HalfNumElts;
15883
15884 // Determine the element index into its half vector source.
15885 int HalfElt = M % HalfNumElts;
15886
15887 // We can shuffle with up to 2 half vectors, set the new 'half'
15888 // shuffle mask accordingly.
15889 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15890 HalfMask[i] = HalfElt;
15891 HalfIdx1 = HalfIdx;
15892 continue;
15893 }
15894 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15895 HalfMask[i] = HalfElt + HalfNumElts;
15896 HalfIdx2 = HalfIdx;
15897 continue;
15898 }
15899
15900 // Too many half vectors referenced.
15901 return false;
15902 }
15903
15904 return true;
15905}
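// Worked example (illustrative values): for a v8f32 mask
//   <u,u,u,u, 0,1,12,13>
// only the upper half of the result is defined, and it reads from the lower
// half of V1 (elements 0,1) and the upper half of V2 (elements 12,13), so:
//   HalfMask = <0,1,4,5>, HalfIdx1 = 0 (lower V1), HalfIdx2 = 3 (upper V2)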
15906
15907/// Given the output values from getHalfShuffleMask(), create a half width
15908/// shuffle of extracted vectors followed by an insert back to full width.
15909 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15910 ArrayRef<int> HalfMask, int HalfIdx1,
15911 int HalfIdx2, bool UndefLower,
15912 SelectionDAG &DAG, bool UseConcat = false) {
15913 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15914 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15915
15916 MVT VT = V1.getSimpleValueType();
15917 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15918 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15919
15920 auto getHalfVector = [&](int HalfIdx) {
15921 if (HalfIdx < 0)
15922 return DAG.getUNDEF(HalfVT);
15923 SDValue V = (HalfIdx < 2 ? V1 : V2);
15924 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15925 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15926 DAG.getVectorIdxConstant(HalfIdx, DL));
15927 };
15928
15929 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15930 SDValue Half1 = getHalfVector(HalfIdx1);
15931 SDValue Half2 = getHalfVector(HalfIdx2);
15932 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15933 if (UseConcat) {
15934 SDValue Op0 = V;
15935 SDValue Op1 = DAG.getUNDEF(HalfVT);
15936 if (UndefLower)
15937 std::swap(Op0, Op1);
15938 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15939 }
15940
15941 unsigned Offset = UndefLower ? HalfNumElts : 0;
15942 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15943 DAG.getVectorIdxConstant(Offset, DL));
15944}
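// Continuing the example above (illustrative), getShuffleHalfVectors would
// emit roughly:
//   h1 = extract_subvector V1, 0          ; v4f32
//   h2 = extract_subvector V2, 4          ; v4f32
//   s  = vector_shuffle<0,1,4,5> h1, h2
//   insert_subvector undef, s, 4          ; place into the upper half
// (or a concat_vectors when UseConcat is set).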
15945
15946/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15947/// This allows for fast cases such as subvector extraction/insertion
15948/// or shuffling smaller vector types which can lower more efficiently.
15949 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15950 SDValue V2, ArrayRef<int> Mask,
15951 const X86Subtarget &Subtarget,
15952 SelectionDAG &DAG) {
15953 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15954 "Expected 256-bit or 512-bit vector");
15955
15956 bool UndefLower = isUndefLowerHalf(Mask);
15957 if (!UndefLower && !isUndefUpperHalf(Mask))
15958 return SDValue();
15959
15960 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15961 "Completely undef shuffle mask should have been simplified already");
15962
15963 // Upper half is undef and lower half is whole upper subvector.
15964 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15965 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15966 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15967 if (!UndefLower &&
15968 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15969 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15970 DAG.getVectorIdxConstant(HalfNumElts, DL));
15971 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15972 DAG.getVectorIdxConstant(0, DL));
15973 }
15974
15975 // Lower half is undef and upper half is whole lower subvector.
15976 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15977 if (UndefLower &&
15978 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15979 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15980 DAG.getVectorIdxConstant(0, DL));
15981 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15982 DAG.getVectorIdxConstant(HalfNumElts, DL));
15983 }
15984
15985 int HalfIdx1, HalfIdx2;
15986 SmallVector<int, 8> HalfMask(HalfNumElts);
15987 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15988 return SDValue();
15989
15990 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15991
15992 // Only shuffle the halves of the inputs when useful.
15993 unsigned NumLowerHalves =
15994 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15995 unsigned NumUpperHalves =
15996 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15997 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15998
15999 // Determine the larger pattern of undef/halves, then decide if it's worth
16000 // splitting the shuffle based on subtarget capabilities and types.
16001 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16002 if (!UndefLower) {
16003 // XXXXuuuu: no insert is needed.
16004 // Always extract lowers when setting lower - these are all free subreg ops.
16005 if (NumUpperHalves == 0)
16006 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16007 UndefLower, DAG);
16008
16009 if (NumUpperHalves == 1) {
16010 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16011 if (Subtarget.hasAVX2()) {
16012 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16013 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16014 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16015 (!isSingleSHUFPSMask(HalfMask) ||
16016 Subtarget.hasFastVariableCrossLaneShuffle()))
16017 return SDValue();
16018 // If this is a unary shuffle (assume that the 2nd operand is
16019 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16020 // are better off extracting the upper half of 1 operand and using a
16021 // narrow shuffle.
16022 if (EltWidth == 64 && V2.isUndef())
16023 return SDValue();
16024 // If this is a unary vXi8 shuffle with in-place halves, then perform as
16025 // a full width pshufb, and then merge.
16026 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16027 return SDValue();
16028 }
16029 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16030 if (Subtarget.hasAVX512() && VT.is512BitVector())
16031 return SDValue();
16032 // Extract + narrow shuffle is better than the wide alternative.
16033 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16034 UndefLower, DAG);
16035 }
16036
16037 // Don't extract both uppers, instead shuffle and then extract.
16038 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16039 return SDValue();
16040 }
16041
16042 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16043 if (NumUpperHalves == 0) {
16044 // AVX2 has efficient 64-bit element cross-lane shuffles.
16045 // TODO: Refine to account for unary shuffle, splat, and other masks?
16046 if (Subtarget.hasAVX2() && EltWidth == 64)
16047 return SDValue();
16048 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16049 if (Subtarget.hasAVX512() && VT.is512BitVector())
16050 return SDValue();
16051 // Narrow shuffle + insert is better than the wide alternative.
16052 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16053 UndefLower, DAG);
16054 }
16055
16056 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16057 return SDValue();
16058}
16059
16060/// Handle case where shuffle sources are coming from the same 128-bit lane and
16061/// every lane can be represented as the same repeating mask - allowing us to
16062/// shuffle the sources with the repeating shuffle and then permute the result
16063/// to the destination lanes.
16064 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16065 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16066 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16067 int NumElts = VT.getVectorNumElements();
16068 int NumLanes = VT.getSizeInBits() / 128;
16069 int NumLaneElts = NumElts / NumLanes;
16070
16071 // On AVX2 we may be able to just shuffle the lowest elements and then
16072 // broadcast the result.
16073 if (Subtarget.hasAVX2()) {
16074 for (unsigned BroadcastSize : {16, 32, 64}) {
16075 if (BroadcastSize <= VT.getScalarSizeInBits())
16076 continue;
16077 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16078
16079 // Attempt to match a repeating pattern every NumBroadcastElts,
16080 // accounting for UNDEFs, that only references the lowest 128-bit
16081 // lane of the inputs.
16082 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16083 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16084 for (int j = 0; j != NumBroadcastElts; ++j) {
16085 int M = Mask[i + j];
16086 if (M < 0)
16087 continue;
16088 int &R = RepeatMask[j];
16089 if (0 != ((M % NumElts) / NumLaneElts))
16090 return false;
16091 if (0 <= R && R != M)
16092 return false;
16093 R = M;
16094 }
16095 return true;
16096 };
16097
16098 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16099 if (!FindRepeatingBroadcastMask(RepeatMask))
16100 continue;
16101
16102 // Shuffle the (lowest) repeated elements in place for broadcast.
16103 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16104
16105 // Shuffle the actual broadcast.
16106 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16107 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16108 for (int j = 0; j != NumBroadcastElts; ++j)
16109 BroadcastMask[i + j] = j;
16110
16111 // Avoid returning the same shuffle operation. For example,
16112 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16113 if (BroadcastMask == Mask)
16114 return SDValue();
16115
16116 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16117 BroadcastMask);
16118 }
16119 }
16120
16121 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16122 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16123 return SDValue();
16124
16125 // Bail if we already have a repeated lane shuffle mask.
16126 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16127 return SDValue();
16128
16129 // Helper to look for a repeated mask in each split sublane, checking that
16130 // those sublanes can then be permuted into place.
16131 auto ShuffleSubLanes = [&](int SubLaneScale) {
16132 int NumSubLanes = NumLanes * SubLaneScale;
16133 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16134
16135 // Check that all the sources are coming from the same lane and see if we
16136 // can form a repeating shuffle mask (local to each sub-lane). At the same
16137 // time, determine the source sub-lane for each destination sub-lane.
16138 int TopSrcSubLane = -1;
16139 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16140 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16141 SubLaneScale,
16142 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16143
16144 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16145 // Extract the sub-lane mask, check that it all comes from the same lane
16146 // and normalize the mask entries to come from the first lane.
16147 int SrcLane = -1;
16148 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16149 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16150 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16151 if (M < 0)
16152 continue;
16153 int Lane = (M % NumElts) / NumLaneElts;
16154 if ((0 <= SrcLane) && (SrcLane != Lane))
16155 return SDValue();
16156 SrcLane = Lane;
16157 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16158 SubLaneMask[Elt] = LocalM;
16159 }
16160
16161 // Whole sub-lane is UNDEF.
16162 if (SrcLane < 0)
16163 continue;
16164
16165 // Attempt to match against the candidate repeated sub-lane masks.
16166 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16167 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16168 for (int i = 0; i != NumSubLaneElts; ++i) {
16169 if (M1[i] < 0 || M2[i] < 0)
16170 continue;
16171 if (M1[i] != M2[i])
16172 return false;
16173 }
16174 return true;
16175 };
16176
16177 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16178 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16179 continue;
16180
16181 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16182 for (int i = 0; i != NumSubLaneElts; ++i) {
16183 int M = SubLaneMask[i];
16184 if (M < 0)
16185 continue;
16186 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16187 "Unexpected mask element");
16188 RepeatedSubLaneMask[i] = M;
16189 }
16190
16191 // Track the top most source sub-lane - by setting the remaining to
16192 // UNDEF we can greatly simplify shuffle matching.
16193 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16194 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16195 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16196 break;
16197 }
16198
16199 // Bail if we failed to find a matching repeated sub-lane mask.
16200 if (Dst2SrcSubLanes[DstSubLane] < 0)
16201 return SDValue();
16202 }
16203 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16204 "Unexpected source lane");
16205
16206 // Create a repeating shuffle mask for the entire vector.
16207 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16208 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16209 int Lane = SubLane / SubLaneScale;
16210 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16211 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16212 int M = RepeatedSubLaneMask[Elt];
16213 if (M < 0)
16214 continue;
16215 int Idx = (SubLane * NumSubLaneElts) + Elt;
16216 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16217 }
16218 }
16219
16220 // Shuffle each source sub-lane to its destination.
16221 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16222 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16223 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16224 if (SrcSubLane < 0)
16225 continue;
16226 for (int j = 0; j != NumSubLaneElts; ++j)
16227 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16228 }
16229
16230 // Avoid returning the same shuffle operation.
16231 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16232 if (RepeatedMask == Mask || SubLaneMask == Mask)
16233 return SDValue();
16234
16235 SDValue RepeatedShuffle =
16236 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16237
16238 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16239 SubLaneMask);
16240 };
16241
16242 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16243 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16244 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16245 // Otherwise we can only permute whole 128-bit lanes.
16246 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16247 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16248 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16249 MinSubLaneScale = 2;
16250 MaxSubLaneScale =
16251 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16252 }
16253 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16254 MinSubLaneScale = MaxSubLaneScale = 4;
16255
16256 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16257 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16258 return Shuffle;
16259
16260 return SDValue();
16261}
16262
16263 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16264 bool &ForceV1Zero, bool &ForceV2Zero,
16265 unsigned &ShuffleImm, ArrayRef<int> Mask,
16266 const APInt &Zeroable) {
16267 int NumElts = VT.getVectorNumElements();
16268 assert(VT.getScalarSizeInBits() == 64 &&
16269 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16270 "Unexpected data type for VSHUFPD");
16271 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16272 "Illegal shuffle mask");
16273
16274 bool ZeroLane[2] = { true, true };
16275 for (int i = 0; i < NumElts; ++i)
16276 ZeroLane[i & 1] &= Zeroable[i];
16277
16278 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16279 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16280 bool IsSHUFPD = true;
16281 bool IsCommutable = true;
16282 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16283 for (int i = 0; i < NumElts; ++i) {
16284 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16285 continue;
16286 if (Mask[i] < 0)
16287 return false;
16288 int Val = (i & 6) + NumElts * (i & 1);
16289 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16290 if (Mask[i] < Val || Mask[i] > Val + 1)
16291 IsSHUFPD = false;
16292 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16293 IsCommutable = false;
16294 SHUFPDMask[i] = Mask[i] % 2;
16295 }
16296
16297 if (!IsSHUFPD && !IsCommutable)
16298 return false;
16299
16300 if (!IsSHUFPD && IsCommutable)
16301 std::swap(V1, V2);
16302
16303 ForceV1Zero = ZeroLane[0];
16304 ForceV2Zero = ZeroLane[1];
16305 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16306 return true;
16307}
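// Worked example (illustrative values): for v4f64, Mask <1,4,3,6> is a valid
// SHUFPD pattern: even result elements come from V1, odd ones from V2, and
// within each 64-bit pair the mask picks the low/high element:
//   SHUFPDMask = <1,0,1,0>  ->  ShuffleImm = 0b0101 = 0x5
// If instead only the commuted form matched, V1 and V2 would be swapped first.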
16308
16309 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16310 SDValue V2, ArrayRef<int> Mask,
16311 const APInt &Zeroable,
16312 const X86Subtarget &Subtarget,
16313 SelectionDAG &DAG) {
16314 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16315 "Unexpected data type for VSHUFPD");
16316
16317 unsigned Immediate = 0;
16318 bool ForceV1Zero = false, ForceV2Zero = false;
16319 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16320 Mask, Zeroable))
16321 return SDValue();
16322
16323 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16324 if (ForceV1Zero)
16325 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16326 if (ForceV2Zero)
16327 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16328
16329 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16330 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16331}
16332
16333 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16334 // by zeroable elements in the remaining 24 elements. Turn this into two
16335 // vpmovqb (VTRUNC) instructions shuffled together.
16336 static SDValue lowerShuffleWithVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16337 SDValue V1, SDValue V2,
16338 ArrayRef<int> Mask,
16339 const APInt &Zeroable,
16340 SelectionDAG &DAG) {
16341 assert(VT == MVT::v32i8 && "Unexpected type!");
16342
16343 // The first 8 indices should be every 8th element.
16344 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16345 return SDValue();
16346
16347 // Remaining elements need to be zeroable.
16348 if (Zeroable.countl_one() < (Mask.size() - 8))
16349 return SDValue();
16350
16351 V1 = DAG.getBitcast(MVT::v4i64, V1);
16352 V2 = DAG.getBitcast(MVT::v4i64, V2);
16353
16354 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16355 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16356
16357 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16358 // the upper bits of the result using an unpckldq.
16359 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16360 { 0, 1, 2, 3, 16, 17, 18, 19,
16361 4, 5, 6, 7, 20, 21, 22, 23 });
16362 // Insert the unpckldq into a zero vector to widen to v32i8.
16363 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16364 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16365 DAG.getVectorIdxConstant(0, DL));
16366}
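// Sketch of the resulting DAG (illustrative): with both inputs viewed as
// v4i64, each VTRUNC keeps the low byte of every 64-bit element, producing
// <a0,a1,a2,a3, 0...> and <b0,b1,b2,b3, 0...> in v16i8. The shuffle mask
// <0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23> interleaves those dwords
// (unpckldq), giving <a0..a3,b0..b3, 0...>, which is then widened to v32i8
// by inserting into a zero vector.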
16367
16368// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16369// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16370// =>
16371// ul = unpckl v1, v2
16372// uh = unpckh v1, v2
16373// a = vperm ul, uh
16374// b = vperm ul, uh
16375//
16376// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16377// and permute. We cannot directly match v3 because it is split into two
16378// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16379// pair of 256-bit shuffles and makes sure the masks are consecutive.
16380//
16381// Once unpck and permute nodes are created, the permute corresponding to this
16382// shuffle is returned, while the other permute replaces the other half of the
16383// shuffle in the selection dag.
16384 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16385 SDValue V1, SDValue V2,
16386 ArrayRef<int> Mask,
16387 SelectionDAG &DAG) {
16388 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16389 VT != MVT::v32i8)
16390 return SDValue();
16391 // <B0, B1, B0+1, B1+1, ..., >
16392 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16393 unsigned Begin1) {
16394 size_t Size = Mask.size();
16395 assert(Size % 2 == 0 && "Expected even mask size");
16396 for (unsigned I = 0; I < Size; I += 2) {
16397 if (Mask[I] != (int)(Begin0 + I / 2) ||
16398 Mask[I + 1] != (int)(Begin1 + I / 2))
16399 return false;
16400 }
16401 return true;
16402 };
16403 // Check which half of the interleave this shuffle node is
16404 int NumElts = VT.getVectorNumElements();
16405 size_t FirstQtr = NumElts / 2;
16406 size_t ThirdQtr = NumElts + NumElts / 2;
16407 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16408 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16409 if (!IsFirstHalf && !IsSecondHalf)
16410 return SDValue();
16411
16412 // Find the intersection between shuffle users of V1 and V2.
16413 SmallVector<SDNode *, 2> Shuffles;
16414 for (SDNode *User : V1->users())
16415 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16416 User->getOperand(1) == V2)
16417 Shuffles.push_back(User);
16418 // Limit user size to two for now.
16419 if (Shuffles.size() != 2)
16420 return SDValue();
16421 // Find out which half of the conceptual 512-bit shuffle each smaller shuffle is
16422 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16423 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16424 SDNode *FirstHalf;
16425 SDNode *SecondHalf;
16426 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16427 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16428 FirstHalf = Shuffles[0];
16429 SecondHalf = Shuffles[1];
16430 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16431 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16432 FirstHalf = Shuffles[1];
16433 SecondHalf = Shuffles[0];
16434 } else {
16435 return SDValue();
16436 }
16437 // Lower into unpck and perm. Return the perm of this shuffle and replace
16438 // the other.
16439 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16440 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16441 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16442 DAG.getTargetConstant(0x20, DL, MVT::i8));
16443 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16444 DAG.getTargetConstant(0x31, DL, MVT::i8));
16445 if (IsFirstHalf) {
16446 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16447 return Perm1;
16448 }
16449 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16450 return Perm2;
16451}
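// Worked example (illustrative values): a pair of v8f32 shuffles forming an
// interleave,
//   a = shuffle V1, V2, <0,8,1,9,2,10,3,11>    ; first half
//   b = shuffle V1, V2, <4,12,5,13,6,14,7,15>  ; second half
// is rewritten by the routine above as
//   ul = X86ISD::UNPCKL V1, V2                 ; vunpcklps
//   uh = X86ISD::UNPCKH V1, V2                 ; vunpckhps
//   a  = X86ISD::VPERM2X128 ul, uh, 0x20       ; low 128-bit halves of ul/uh
//   b  = X86ISD::VPERM2X128 ul, uh, 0x31       ; high 128-bit halves of ul/uh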
16452
16453/// Handle lowering of 4-lane 64-bit floating point shuffles.
16454///
16455/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16456/// isn't available.
16457 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16458 const APInt &Zeroable, SDValue V1, SDValue V2,
16459 const X86Subtarget &Subtarget,
16460 SelectionDAG &DAG) {
16461 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16462 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16463 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16464
16465 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16466 Subtarget, DAG))
16467 return V;
16468
16469 if (V2.isUndef()) {
16470 // Check for being able to broadcast a single element.
16471 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16472 Mask, Subtarget, DAG))
16473 return Broadcast;
16474
16475 // Use low duplicate instructions for masks that match their pattern.
16476 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16477 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16478
16479 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16480 // Non-half-crossing single input shuffles can be lowered with an
16481 // interleaved permutation.
16482 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16483 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16484 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16485 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16486 }
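// Worked example (illustrative values): a non-lane-crossing v4f64 mask such
// as <1,0,3,2> encodes to
//   VPERMILPMask = 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0x5
// i.e. vpermilpd $0x5, swapping the two doubles inside each 128-bit lane.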
16487
16488 // With AVX2 we have direct support for this permutation.
16489 if (Subtarget.hasAVX2())
16490 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16491 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16492
16493 // Try to create an in-lane repeating shuffle mask and then shuffle the
16494 // results into the target lanes.
16495 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16496 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16497 return V;
16498
16499 // Try to permute the lanes and then use a per-lane permute.
16500 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16501 Mask, DAG, Subtarget))
16502 return V;
16503
16504 // Otherwise, fall back.
16505 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16506 DAG, Subtarget);
16507 }
16508
16509 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16510 Zeroable, Subtarget, DAG))
16511 return Blend;
16512
16513 // Use dedicated unpack instructions for masks that match their pattern.
16514 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16515 return V;
16516
16517 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16518 Zeroable, Subtarget, DAG))
16519 return Op;
16520
16521 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16522 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16523 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16524 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16525
16526 // If we have lane crossing shuffles AND they don't all come from the lower
16527 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16528 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16529 // canonicalize to a blend of splat which isn't necessary for this combine.
16530 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16531 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16532 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16533 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16534 (!Subtarget.hasAVX2() ||
16535 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16536 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16537
16538 // If we have one input in place, then we can permute the other input and
16539 // blend the result.
16540 if (V1IsInPlace || V2IsInPlace)
16541 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16542 Zeroable, Subtarget, DAG);
16543
16544 // Try to create an in-lane repeating shuffle mask and then shuffle the
16545 // results into the target lanes.
16546 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16547 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16548 return V;
16549
16550 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16551 // shuffle. However, if we have AVX2 and either input is already in place,
16552 // we will be able to shuffle the other input even across lanes in a single
16553 // instruction, so skip this pattern.
16554 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16555 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16556 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16557 return V;
16558
16559 // If we have VLX support, we can use VEXPAND.
16560 if (Subtarget.hasVLX())
16561 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16562 Zeroable, Subtarget, DAG))
16563 return V;
16564
16565 // If we have AVX2 then we always want to lower with a blend because at v4 we
16566 // can fully permute the elements.
16567 if (Subtarget.hasAVX2())
16568 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16569 Zeroable, Subtarget, DAG);
16570
16571 // Otherwise fall back on generic lowering.
16572 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16573 Subtarget, DAG);
16574}
16575
16576/// Handle lowering of 4-lane 64-bit integer shuffles.
16577///
16578/// This routine is only called when we have AVX2 and thus a reasonable
16579 /// instruction set for v4i64 shuffling.
16580 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16581 const APInt &Zeroable, SDValue V1, SDValue V2,
16582 const X86Subtarget &Subtarget,
16583 SelectionDAG &DAG) {
16584 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16585 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16586 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16587 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16588
16589 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16590 Subtarget, DAG))
16591 return V;
16592
16593 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16594 Zeroable, Subtarget, DAG))
16595 return Blend;
16596
16597 // Check for being able to broadcast a single element.
16598 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16599 Subtarget, DAG))
16600 return Broadcast;
16601
16602 // Try to use shift instructions if fast.
16603 if (Subtarget.preferLowerShuffleAsShift())
16604 if (SDValue Shift =
16605 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16606 Subtarget, DAG, /*BitwiseOnly*/ true))
16607 return Shift;
16608
16609 if (V2.isUndef()) {
16610 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16611 // can use lower latency instructions that will operate on both lanes.
16612 SmallVector<int, 2> RepeatedMask;
16613 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16614 SmallVector<int, 4> PSHUFDMask;
16615 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16616 return DAG.getBitcast(
16617 MVT::v4i64,
16618 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16619 DAG.getBitcast(MVT::v8i32, V1),
16620 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16621 }
16622
16623 // AVX2 provides a direct instruction for permuting a single input across
16624 // lanes.
16625 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16626 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16627 }
16628
16629 // Try to use shift instructions.
16630 if (SDValue Shift =
16631 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16632 DAG, /*BitwiseOnly*/ false))
16633 return Shift;
16634
16635 // If we have VLX support, we can use VALIGN or VEXPAND.
16636 if (Subtarget.hasVLX()) {
16637 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16638 Zeroable, Subtarget, DAG))
16639 return Rotate;
16640
16641 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16642 Zeroable, Subtarget, DAG))
16643 return V;
16644 }
16645
16646 // Try to use PALIGNR.
16647 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16648 Subtarget, DAG))
16649 return Rotate;
16650
16651 // Use dedicated unpack instructions for masks that match their pattern.
16652 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16653 return V;
16654
16655 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16656 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16657
16658 // If we have one input in place, then we can permute the other input and
16659 // blend the result.
16660 if (V1IsInPlace || V2IsInPlace)
16661 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16662 Zeroable, Subtarget, DAG);
16663
16664 // Try to create an in-lane repeating shuffle mask and then shuffle the
16665 // results into the target lanes.
16666 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16667 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16668 return V;
16669
16670 // Try to lower to PERMQ(BLENDD(V1,V2)).
16671 if (SDValue V =
16672 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16673 return V;
16674
16675 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16676 // shuffle. However, if we have AVX2 and either input is already in place,
16677 // we will be able to shuffle the other input even across lanes in a single
16678 // instruction, so skip this pattern.
16679 if (!V1IsInPlace && !V2IsInPlace)
16680 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16681 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16682 return Result;
16683
16684 // Otherwise fall back on generic blend lowering.
16685 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16686 Zeroable, Subtarget, DAG);
16687}
16688
16689/// Handle lowering of 8-lane 32-bit floating point shuffles.
16690///
16691/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16692/// isn't available.
16693 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16694 const APInt &Zeroable, SDValue V1, SDValue V2,
16695 const X86Subtarget &Subtarget,
16696 SelectionDAG &DAG) {
16697 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16698 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16699 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16700
16701 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16702 Zeroable, Subtarget, DAG))
16703 return Blend;
16704
16705 // Check for being able to broadcast a single element.
16706 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16707 Subtarget, DAG))
16708 return Broadcast;
16709
16710 if (!Subtarget.hasAVX2()) {
16711 SmallVector<int> InLaneMask;
16712 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16713
16714 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16715 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16716 /*SimpleOnly*/ true))
16717 return R;
16718 }
16719 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16720 Zeroable, Subtarget, DAG))
16721 return DAG.getBitcast(MVT::v8f32, ZExt);
16722
16723 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16724 // options to efficiently lower the shuffle.
16725 SmallVector<int, 4> RepeatedMask;
16726 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16727 assert(RepeatedMask.size() == 4 &&
16728 "Repeated masks must be half the mask width!");
16729
16730 // Use even/odd duplicate instructions for masks that match their pattern.
16731 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16732 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16733 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16734 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16735
16736 if (V2.isUndef())
16737 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16738 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16739
16740 // Use dedicated unpack instructions for masks that match their pattern.
16741 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16742 return V;
16743
16744 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16745 // have already handled any direct blends.
16746 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16747 }
16748
16749 // Try to create an in-lane repeating shuffle mask and then shuffle the
16750 // results into the target lanes.
16751 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16752 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16753 return V;
16754
16755 // If we have a single input shuffle with different shuffle patterns in the
16756 // two 128-bit lanes use the variable mask to VPERMILPS.
16757 if (V2.isUndef()) {
16758 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16759 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16760 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16761 }
16762 if (Subtarget.hasAVX2()) {
16763 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16764 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16765 }
16766 // Otherwise, fall back.
16767 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16768 DAG, Subtarget);
16769 }
16770
16771 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16772 // shuffle.
16773 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16774 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16775 return Result;
16776
16777 // If we have VLX support, we can use VEXPAND.
16778 if (Subtarget.hasVLX())
16779 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16780 Zeroable, Subtarget, DAG))
16781 return V;
16782
16783 // Try to match an interleave of two v8f32s and lower them as unpck and
16784 // permutes using ymms. This needs to go before we try to split the vectors.
16785 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16786 if ((Subtarget.hasAVX2() ||
16789 !Subtarget.hasAVX512())
16790 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16791 Mask, DAG))
16792 return V;
16793
16794 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split
16795 // since after splitting we get more efficient code using vpunpcklwd and
16796 // vpunpckhwd instructions than with vblend.
16797 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16798 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16799 Subtarget, DAG);
16800
16801 // If we have AVX2 then we always want to lower with a blend because at v8 we
16802 // can fully permute the elements.
16803 if (Subtarget.hasAVX2())
16804 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16805 Zeroable, Subtarget, DAG);
16806
16807 // Otherwise fall back on generic lowering.
16808 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16809 Subtarget, DAG);
16810}
16811
16812/// Handle lowering of 8-lane 32-bit integer shuffles.
16813///
16814/// This routine is only called when we have AVX2 and thus a reasonable
16815 /// instruction set for v8i32 shuffling.
16816 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16817 const APInt &Zeroable, SDValue V1, SDValue V2,
16818 const X86Subtarget &Subtarget,
16819 SelectionDAG &DAG) {
16820 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16821 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16822 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16823 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16824
16825 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16826
16827 // Whenever we can lower this as a zext, that instruction is strictly faster
16828 // than any alternative. It also allows us to fold memory operands into the
16829 // shuffle in many cases.
16830 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16831 Zeroable, Subtarget, DAG))
16832 return ZExt;
16833
16834 // Try to match an interleave of two v8i32s and lower them as unpck and
16835 // permutes using ymms. This needs to go before we try to split the vectors.
16836 if (!Subtarget.hasAVX512())
16837 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16838 Mask, DAG))
16839 return V;
16840
16841 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split
16842 // since after splitting we get more efficient code than vblend by using
16843 // vpunpcklwd and vpunpckhwd instructions.
16844 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16845 !Subtarget.hasAVX512())
16846 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16847 Subtarget, DAG);
16848
16849 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16850 Zeroable, Subtarget, DAG))
16851 return Blend;
16852
16853 // Check for being able to broadcast a single element.
16854 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16855 Subtarget, DAG))
16856 return Broadcast;
16857
16858 // Try to use shift instructions if fast.
16859 if (Subtarget.preferLowerShuffleAsShift()) {
16860 if (SDValue Shift =
16861 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16862 Subtarget, DAG, /*BitwiseOnly*/ true))
16863 return Shift;
16864 if (NumV2Elements == 0)
16865 if (SDValue Rotate =
16866 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16867 return Rotate;
16868 }
16869
16870 // If the shuffle mask is repeated in each 128-bit lane we can use more
16871 // efficient instructions that mirror the shuffles across the two 128-bit
16872 // lanes.
16873 SmallVector<int, 4> RepeatedMask;
16874 bool Is128BitLaneRepeatedShuffle =
16875 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16876 if (Is128BitLaneRepeatedShuffle) {
16877 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16878 if (V2.isUndef())
16879 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16880 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16881
16882 // Use dedicated unpack instructions for masks that match their pattern.
16883 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16884 return V;
16885 }
16886
16887 // Try to use shift instructions.
16888 if (SDValue Shift =
16889 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16890 DAG, /*BitwiseOnly*/ false))
16891 return Shift;
16892
16893 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16894 if (SDValue Rotate =
16895 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16896 return Rotate;
16897
16898 // If we have VLX support, we can use VALIGN or EXPAND.
16899 if (Subtarget.hasVLX()) {
16900 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16901 Zeroable, Subtarget, DAG))
16902 return Rotate;
16903
16904 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16905 Zeroable, Subtarget, DAG))
16906 return V;
16907 }
16908
16909 // Try to use byte rotation instructions.
16910 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16911 Subtarget, DAG))
16912 return Rotate;
16913
16914 // Try to create an in-lane repeating shuffle mask and then shuffle the
16915 // results into the target lanes.
16916 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16917 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16918 return V;
16919
16920 if (V2.isUndef()) {
16921 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16922 // because that should be faster than the variable permute alternatives.
16923 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16924 return V;
16925
16926 // If the shuffle patterns aren't repeated but it's a single input, directly
16927 // generate a cross-lane VPERMD instruction.
16928 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16929 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16930 }
16931
16932 // Assume that a single SHUFPS is faster than an alternative sequence of
16933 // multiple instructions (even if the CPU has a domain penalty).
16934 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16935 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16936 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16937 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16938 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16939 CastV1, CastV2, DAG);
16940 return DAG.getBitcast(MVT::v8i32, ShufPS);
16941 }
16942
16943 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16944 // shuffle.
16945 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16946 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16947 return Result;
16948
16949 // Otherwise fall back on generic blend lowering.
16950 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16951 Zeroable, Subtarget, DAG);
16952}
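// Illustrative sketch (not from the LLVM sources; helper names here are made
// up for illustration): a standalone model of the "repeated 128-bit lane"
// test and the 2-bits-per-element immediate encoding used for the PSHUFD path
// above, restricted to a single-input v8i32 mask and plain STL types, with -1
// meaning undef.
#include <cstdint>
#include <vector>

// For a v8i32 mask (8 elements, 4 per 128-bit lane), check whether every lane
// uses the same per-lane pattern, and return that 4-element pattern.
static bool isLaneRepeatedMaskV8I32(const std::vector<int> &Mask,
                                    std::vector<int> &Repeated) {
  Repeated.assign(4, -1);
  for (int i = 0; i < 8; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef matches anything
    int Lane = i / 4;
    int LaneLocal = M - Lane * 4; // where the source sits within this lane
    if (LaneLocal < 0 || LaneLocal >= 4)
      return false; // crosses a 128-bit lane boundary
    if (Repeated[i % 4] < 0)
      Repeated[i % 4] = LaneLocal;
    else if (Repeated[i % 4] != LaneLocal)
      return false; // the lanes disagree on the pattern
  }
  return true;
}

// Pack the 4-element repeated pattern into a PSHUFD-style 8-bit immediate
// (element i selects source dword (Imm >> (2*i)) & 3). Undefs default to 0.
static uint8_t shuffleImm8(const std::vector<int> &Repeated) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<uint8_t>(Repeated[i] < 0 ? 0 : Repeated[i]) << (2 * i);
  return Imm;
}
// e.g. mask {2,3,0,1, 6,7,4,5} repeats as {2,3,0,1} -> immediate 0x4E.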
16953
16954/// Handle lowering of 16-lane 16-bit integer shuffles.
16955///
16956/// This routine is only called when we have AVX2 and thus a reasonable
16957 /// instruction set for v16i16 shuffling.
16958 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16959 const APInt &Zeroable, SDValue V1, SDValue V2,
16960 const X86Subtarget &Subtarget,
16961 SelectionDAG &DAG) {
16962 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16963 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16964 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16965 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16966
16967 // Whenever we can lower this as a zext, that instruction is strictly faster
16968 // than any alternative. It also allows us to fold memory operands into the
16969 // shuffle in many cases.
16970 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16971 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16972 return ZExt;
16973
16974 // Check for being able to broadcast a single element.
16975 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16976 Subtarget, DAG))
16977 return Broadcast;
16978
16979 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16980 Zeroable, Subtarget, DAG))
16981 return Blend;
16982
16983 // Use dedicated unpack instructions for masks that match their pattern.
16984 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16985 return V;
16986
16987 // Use dedicated pack instructions for masks that match their pattern.
16988 if (SDValue V =
16989 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16990 return V;
16991
16992 // Try to lower using a truncation.
16993 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16994 Subtarget, DAG))
16995 return V;
16996
16997 // Try to use shift instructions.
16998 if (SDValue Shift =
16999 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17000 Subtarget, DAG, /*BitwiseOnly*/ false))
17001 return Shift;
17002
17003 // Try to use byte rotation instructions.
17004 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17005 Subtarget, DAG))
17006 return Rotate;
17007
17008 // Try to create an in-lane repeating shuffle mask and then shuffle the
17009 // results into the target lanes.
17010 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17011 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17012 return V;
17013
17014 if (V2.isUndef()) {
17015 // Try to use bit rotation instructions.
17016 if (SDValue Rotate =
17017 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17018 return Rotate;
17019
17020 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17021 // because that should be faster than the variable permute alternatives.
17022 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17023 return V;
17024
17025 // There are no generalized cross-lane shuffle operations available on i16
17026 // element types.
17027 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17028 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17029 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17030 return V;
17031
17032 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17033 DAG, Subtarget);
17034 }
17035
17036 SmallVector<int, 8> RepeatedMask;
17037 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17038 // As this is a single-input shuffle, the repeated mask should be
17039 // a strictly valid v8i16 mask that we can pass through to the v8i16
17040 // lowering to handle even the v16 case.
17041 return lowerV8I16GeneralSingleInputShuffle(
17042 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17043 }
17044 }
17045
17046 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17047 Zeroable, Subtarget, DAG))
17048 return PSHUFB;
17049
17050 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17051 if (Subtarget.hasBWI())
17052 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17053
17054 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17055 // shuffle.
17056 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17057 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17058 return Result;
17059
17060 // Try to permute the lanes and then use a per-lane permute.
17061 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17062 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17063 return V;
17064
17065 // Try to match an interleave of two v16i16s and lower them as unpck and
17066 // permutes using ymms.
17067 if (!Subtarget.hasAVX512())
17068 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17069 Mask, DAG))
17070 return V;
17071
17072 // Otherwise fall back on generic lowering.
17073 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17074 Subtarget, DAG);
17075}
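// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): what "crossing a 128-bit lane" means for the v16i16 path
// above, written against plain STL types. An element crosses lanes if the
// source element it reads lives in a different 128-bit lane than the
// destination slot it writes.
#include <vector>

static bool crossesLane(const std::vector<int> &Mask, int EltBits) {
  int EltsPerLane = 128 / EltBits;            // 8 for i16, 4 for i32, ...
  int NumElts = static_cast<int>(Mask.size());
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef never crosses
    int SrcLane = (M % NumElts) / EltsPerLane; // fold V2 indices onto V1 lanes
    int DstLane = i / EltsPerLane;
    if (SrcLane != DstLane)
      return true;
  }
  return false;
}
// For v16i16, a mask whose element 0 reads source element 8 crosses lanes
// (lane 1 -> lane 0), which is why the code above falls back to lane permutes
// for such masks.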
17076
17077/// Handle lowering of 32-lane 8-bit integer shuffles.
17078///
17079/// This routine is only called when we have AVX2 and thus a reasonable
17080 /// instruction set for v32i8 shuffling.
17081 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17082 const APInt &Zeroable, SDValue V1, SDValue V2,
17083 const X86Subtarget &Subtarget,
17084 SelectionDAG &DAG) {
17085 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17086 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17087 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17088 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17089
17090 // Whenever we can lower this as a zext, that instruction is strictly faster
17091 // than any alternative. It also allows us to fold memory operands into the
17092 // shuffle in many cases.
17093 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17094 Zeroable, Subtarget, DAG))
17095 return ZExt;
17096
17097 // Check for being able to broadcast a single element.
17098 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17099 Subtarget, DAG))
17100 return Broadcast;
17101
17102 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17103 Zeroable, Subtarget, DAG))
17104 return Blend;
17105
17106 // Use dedicated unpack instructions for masks that match their pattern.
17107 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17108 return V;
17109
17110 // Use dedicated pack instructions for masks that match their pattern.
17111 if (SDValue V =
17112 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17113 return V;
17114
17115 // Try to lower using a truncation.
17116 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17117 Subtarget, DAG))
17118 return V;
17119
17120 // Try to use shift instructions.
17121 if (SDValue Shift =
17122 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17123 DAG, /*BitwiseOnly*/ false))
17124 return Shift;
17125
17126 // Try to use byte rotation instructions.
17127 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17128 Subtarget, DAG))
17129 return Rotate;
17130
17131 // Try to use bit rotation instructions.
17132 if (V2.isUndef())
17133 if (SDValue Rotate =
17134 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17135 return Rotate;
17136
17137 // Try to create an in-lane repeating shuffle mask and then shuffle the
17138 // results into the target lanes.
17139 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17140 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17141 return V;
17142
17143 // There are no generalized cross-lane shuffle operations available on i8
17144 // element types.
17145 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17146 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17147 // because that should be faster than the variable permute alternatives.
17148 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17149 return V;
17150
17151 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17152 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17153 return V;
17154
17155 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17156 DAG, Subtarget);
17157 }
17158
17159 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17160 Zeroable, Subtarget, DAG))
17161 return PSHUFB;
17162
17163 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17164 if (Subtarget.hasVBMI())
17165 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17166
17167 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17168 // shuffle.
17169 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17170 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17171 return Result;
17172
17173 // Try to permute the lanes and then use a per-lane permute.
17174 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17175 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17176 return V;
17177
17178 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17179 // by zeroable elements in the remaining 24 elements. Turn this into two
17180 // vmovqb instructions shuffled together.
17181 if (Subtarget.hasVLX())
17182 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17183 Mask, Zeroable, DAG))
17184 return V;
17185
17186 // Try to match an interleave of two v32i8s and lower them as unpck and
17187 // permutes using ymms.
17188 if (!Subtarget.hasAVX512())
17189 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17190 Mask, DAG))
17191 return V;
17192
17193 // Otherwise fall back on generic lowering.
17194 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17195 Subtarget, DAG);
17196}
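// Illustrative sketch (not from the LLVM sources; a simplified exact-match
// check, whereas the real lowerShufflePairAsUNPCKAndPermute helper is more
// general): the kind of two-input "interleave" mask the calls above target.
// For v32i8 the mask {0,32,1,33,...,15,47} interleaves the low halves of V1
// and V2 element-wise. Plain STL types; -1 (undef) is treated as a mismatch.
#include <vector>

static bool isLowHalfInterleaveMask(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size()); // result width == input width
  for (int i = 0; i < NumElts; ++i) {
    // Even slots take V1 element i/2, odd slots take V2 element i/2.
    int Expected = (i % 2 == 0) ? (i / 2) : (NumElts + i / 2);
    if (Mask[i] != Expected)
      return false;
  }
  return true;
}
// Such masks can be built from vpunpcklbw/vpunpckhbw of the two inputs plus a
// lane permute, which is the strategy named in the comment above.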
17197
17198/// High-level routine to lower various 256-bit x86 vector shuffles.
17199///
17200/// This routine either breaks down the specific type of a 256-bit x86 vector
17201/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17202/// together based on the available instructions.
17203 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17204 SDValue V1, SDValue V2, const APInt &Zeroable,
17205 const X86Subtarget &Subtarget,
17206 SelectionDAG &DAG) {
17207 // If we have a single input to the zero element, insert that into V1 if we
17208 // can do so cheaply.
17209 int NumElts = VT.getVectorNumElements();
17210 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17211
17212 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17213 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17214 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17215 return Insertion;
17216
17217 // Handle special cases where the lower or upper half is UNDEF.
17218 if (SDValue V =
17219 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17220 return V;
17221
17222 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17223 // can check for those subtargets here and avoid much of the subtarget
17224 // querying in the per-vector-type lowering routines. With AVX1 we have
17225 // essentially *zero* ability to manipulate a 256-bit vector with integer
17226 // types. Since we'll use floating point types there eventually, just
17227 // immediately cast everything to a float and operate entirely in that domain.
17228 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17229 int ElementBits = VT.getScalarSizeInBits();
17230 if (ElementBits < 32) {
17231 // No floating point type available, if we can't use the bit operations
17232 // for masking/blending then decompose into 128-bit vectors.
17233 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17234 Subtarget, DAG))
17235 return V;
17236 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17237 return V;
17238 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17239 }
17240
17241 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17242 VT.getVectorNumElements());
17243 V1 = DAG.getBitcast(FpVT, V1);
17244 V2 = DAG.getBitcast(FpVT, V2);
17245 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17246 }
17247
17248 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17249 V1 = DAG.getBitcast(MVT::v16i16, V1);
17250 V2 = DAG.getBitcast(MVT::v16i16, V2);
17251 return DAG.getBitcast(VT,
17252 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17253 }
17254
17255 switch (VT.SimpleTy) {
17256 case MVT::v4f64:
17257 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v4i64:
17259 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v8f32:
17261 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v8i32:
17263 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v16i16:
17265 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266 case MVT::v32i8:
17267 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17268
17269 default:
17270 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17271 }
17272}
17273
17274/// Try to lower a vector shuffle as a 128-bit shuffles.
17275 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17276 const APInt &Zeroable, SDValue V1, SDValue V2,
17277 const X86Subtarget &Subtarget,
17278 SelectionDAG &DAG) {
17279 assert(VT.getScalarSizeInBits() == 64 &&
17280 "Unexpected element type size for 128bit shuffle.");
17281
17282 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
17283 // most probably the better solution for that case.
17284 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17285
17286 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17287 SmallVector<int, 4> Widened128Mask;
17288 if (!canWidenShuffleElements(Mask, Widened128Mask))
17289 return SDValue();
17290 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17291
17292 // Try to use an insert into a zero vector.
17293 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17294 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17295 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17296 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17297 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17298 DAG.getVectorIdxConstant(0, DL));
17299 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17300 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17301 DAG.getVectorIdxConstant(0, DL));
17302 }
17303
17304 // Check for patterns which can be matched with a single insert of a 256-bit
17305 // subvector.
17306 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17307 if (OnlyUsesV1 ||
17308 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17309 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17310 SDValue SubVec =
17311 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17312 DAG.getVectorIdxConstant(0, DL));
17313 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17314 DAG.getVectorIdxConstant(4, DL));
17315 }
17316
17317 // See if this is an insertion of the lower 128-bits of V2 into V1.
17318 bool IsInsert = true;
17319 int V2Index = -1;
17320 for (int i = 0; i < 4; ++i) {
17321 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17322 if (Widened128Mask[i] < 0)
17323 continue;
17324
17325 // Make sure all V1 subvectors are in place.
17326 if (Widened128Mask[i] < 4) {
17327 if (Widened128Mask[i] != i) {
17328 IsInsert = false;
17329 break;
17330 }
17331 } else {
17332 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17333 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17334 IsInsert = false;
17335 break;
17336 }
17337 V2Index = i;
17338 }
17339 }
17340 if (IsInsert && V2Index >= 0) {
17341 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17342 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17343 DAG.getVectorIdxConstant(0, DL));
17344 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17345 }
17346
17347 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17348 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17349 // possible we at least ensure the lanes stay sequential to help later
17350 // combines.
17351 SmallVector<int, 2> Widened256Mask;
17352 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17353 Widened128Mask.clear();
17354 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17355 }
17356
17357 // Try to lower to vshuf64x2/vshuf32x4.
17358 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17359 int PermMask[4] = {-1, -1, -1, -1};
17360 // Ensure elements came from the same Op.
17361 for (int i = 0; i < 4; ++i) {
17362 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17363 if (Widened128Mask[i] < 0)
17364 continue;
17365
17366 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17367 unsigned OpIndex = i / 2;
17368 if (Ops[OpIndex].isUndef())
17369 Ops[OpIndex] = Op;
17370 else if (Ops[OpIndex] != Op)
17371 return SDValue();
17372
17373 PermMask[i] = Widened128Mask[i] % 4;
17374 }
17375
17376 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17377 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17378}
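// Illustrative sketch (not from the LLVM sources; a simplified model of
// canWidenShuffleElements that ignores the undef/zeroable special cases the
// real helper also handles): two adjacent mask elements can be fused into one
// element of twice the width when they read two adjacent source elements that
// start at an even index.
#include <vector>

static bool widenMaskByTwo(const std::vector<int> &Mask,
                           std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {          // fully undef pair stays undef
      Widened.push_back(-1);
      continue;
    }
    if (Lo < 0 || Hi != Lo + 1 || (Lo % 2) != 0)
      return false;                  // not a contiguous, even-aligned pair
    Widened.push_back(Lo / 2);
  }
  return true;
}
// e.g. the v8i64 mask {0,1,2,3,8,9,10,11} widens to the 128-bit-lane mask
// {0,1,4,5}, which is the "insert V2's low 256 bits into V1" pattern matched
// above before falling back to X86ISD::SHUF128.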
17379
17380/// Handle lowering of 8-lane 64-bit floating point shuffles.
17381 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17382 const APInt &Zeroable, SDValue V1, SDValue V2,
17383 const X86Subtarget &Subtarget,
17384 SelectionDAG &DAG) {
17385 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17386 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17387 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17388
17389 if (V2.isUndef()) {
17390 // Use low duplicate instructions for masks that match their pattern.
17391 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17392 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17393
17394 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17395 // Non-half-crossing single input shuffles can be lowered with an
17396 // interleaved permutation.
17397 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17398 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17399 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17400 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17401 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17402 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17403 }
17404
17405 SmallVector<int, 4> RepeatedMask;
17406 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17407 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17408 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17409 }
17410
17411 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17412 V2, Subtarget, DAG))
17413 return Shuf128;
17414
17415 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17416 return Unpck;
17417
17418 // Check if the blend happens to exactly fit that of SHUFPD.
17419 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17420 Zeroable, Subtarget, DAG))
17421 return Op;
17422
17423 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17424 Subtarget, DAG))
17425 return V;
17426
17427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17428 Zeroable, Subtarget, DAG))
17429 return Blend;
17430
17431 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17432}
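// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): how the VPERMILPD immediate built above encodes a
// non-lane-crossing v8f64 mask. Each of the 8 bits picks the low (0) or high
// (1) double within that element's own 128-bit lane, which is exactly what
// the per-element (Mask[i] == odd-lane-element) tests compute.
#include <cstdint>
#include <vector>

static uint8_t vpermilpdImm(const std::vector<int> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == (i | 1))      // element selects the high double of its lane
      Imm |= uint8_t(1) << i;
  return Imm;
}
// e.g. mask {1,0,3,2,5,4,7,6} (swap within every lane) gives 0b01010101.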
17433
17434/// Handle lowering of 16-lane 32-bit floating point shuffles.
17435 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17436 const APInt &Zeroable, SDValue V1, SDValue V2,
17437 const X86Subtarget &Subtarget,
17438 SelectionDAG &DAG) {
17439 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17440 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17441 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17442
17443 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17444 // options to efficiently lower the shuffle.
17445 SmallVector<int, 4> RepeatedMask;
17446 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17447 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17448
17449 // Use even/odd duplicate instructions for masks that match their pattern.
17450 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17451 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17452 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17453 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17454
17455 if (V2.isUndef())
17456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17457 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17458
17459 // Use dedicated unpack instructions for masks that match their pattern.
17460 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17461 return V;
17462
17463 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17464 Zeroable, Subtarget, DAG))
17465 return Blend;
17466
17467 // Otherwise, fall back to a SHUFPS sequence.
17468 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17469 }
17470
17471 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17472 Zeroable, Subtarget, DAG))
17473 return Blend;
17474
17475 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17476 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17477 return DAG.getBitcast(MVT::v16f32, ZExt);
17478
17479 // Try to create an in-lane repeating shuffle mask and then shuffle the
17480 // results into the target lanes.
17481 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17482 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17483 return V;
17484
17485 // If we have a single input shuffle with different shuffle patterns in the
17486 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17487 if (V2.isUndef() &&
17488 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17489 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17490 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17491 }
17492
17493 // If we have AVX512F support, we can use VEXPAND.
17494 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17495 Zeroable, Subtarget, DAG))
17496 return V;
17497
17498 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17499}
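// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): the even/odd duplicate patterns checked above. Within
// each 128-bit lane, MOVSLDUP duplicates the even f32 elements ({0,0,2,2})
// and MOVSHDUP the odd ones ({1,1,3,3}); undef elements may match either.
#include <vector>

static bool matchesDupPattern(const std::vector<int> &RepeatedMask, bool Odd) {
  // RepeatedMask is the 4-element per-lane pattern; -1 means undef.
  for (int i = 0; i < 4; ++i) {
    int Expected = (i & ~1) + (Odd ? 1 : 0); // {0,0,2,2} or {1,1,3,3}
    if (RepeatedMask[i] >= 0 && RepeatedMask[i] != Expected)
      return false;
  }
  return true;
}
// matchesDupPattern(M, /*Odd=*/false) -> MOVSLDUP candidate,
// matchesDupPattern(M, /*Odd=*/true)  -> MOVSHDUP candidate.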
17500
17501/// Handle lowering of 8-lane 64-bit integer shuffles.
17502 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17503 const APInt &Zeroable, SDValue V1, SDValue V2,
17504 const X86Subtarget &Subtarget,
17505 SelectionDAG &DAG) {
17506 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17507 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17508 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17509
17510 // Try to use shift instructions if fast.
17511 if (Subtarget.preferLowerShuffleAsShift())
17512 if (SDValue Shift =
17513 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17514 Subtarget, DAG, /*BitwiseOnly*/ true))
17515 return Shift;
17516
17517 if (V2.isUndef()) {
17518 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17519 // can use lower latency instructions that will operate on all four
17520 // 128-bit lanes.
17521 SmallVector<int, 2> Repeated128Mask;
17522 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17523 SmallVector<int, 4> PSHUFDMask;
17524 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17525 return DAG.getBitcast(
17526 MVT::v8i64,
17527 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17528 DAG.getBitcast(MVT::v16i32, V1),
17529 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17530 }
17531
17532 SmallVector<int, 4> Repeated256Mask;
17533 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17534 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17535 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17536 }
17537
17538 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17539 V2, Subtarget, DAG))
17540 return Shuf128;
17541
17542 // Try to use shift instructions.
17543 if (SDValue Shift =
17544 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17545 DAG, /*BitwiseOnly*/ false))
17546 return Shift;
17547
17548 // Try to use VALIGN.
17549 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17550 Zeroable, Subtarget, DAG))
17551 return Rotate;
17552
17553 // Try to use PALIGNR.
17554 if (Subtarget.hasBWI())
17555 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17556 Subtarget, DAG))
17557 return Rotate;
17558
17559 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17560 return Unpck;
17561
17562 // If we have AVX512F support, we can use VEXPAND.
17563 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17564 Subtarget, DAG))
17565 return V;
17566
17567 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17568 Zeroable, Subtarget, DAG))
17569 return Blend;
17570
17571 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17572}
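// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): the narrowShuffleMaskElts(2, ...) step used above, which
// turns a per-128-bit-lane v2i64 pattern into the equivalent v4i32 pattern so
// PSHUFD can be used. Each 64-bit selection j becomes the two 32-bit
// selections {2*j, 2*j+1}.
#include <vector>

static std::vector<int> narrowByTwo(const std::vector<int> &WideMask) {
  std::vector<int> Narrow;
  for (int M : WideMask) {
    if (M < 0) {
      Narrow.push_back(-1);          // undef stays undef in both halves
      Narrow.push_back(-1);
    } else {
      Narrow.push_back(2 * M);
      Narrow.push_back(2 * M + 1);
    }
  }
  return Narrow;
}
// e.g. the repeated v2i64 pattern {1,0} narrows to the v4i32 pattern
// {2,3,0,1}, i.e. PSHUFD immediate 0x4E applied identically in every lane.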
17573
17574/// Handle lowering of 16-lane 32-bit integer shuffles.
17575 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17576 const APInt &Zeroable, SDValue V1, SDValue V2,
17577 const X86Subtarget &Subtarget,
17578 SelectionDAG &DAG) {
17579 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17580 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17581 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17582
17583 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17584
17585 // Whenever we can lower this as a zext, that instruction is strictly faster
17586 // than any alternative. It also allows us to fold memory operands into the
17587 // shuffle in many cases.
17588 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17589 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17590 return ZExt;
17591
17592 // Try to use shift instructions if fast.
17593 if (Subtarget.preferLowerShuffleAsShift()) {
17594 if (SDValue Shift =
17595 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17596 Subtarget, DAG, /*BitwiseOnly*/ true))
17597 return Shift;
17598 if (NumV2Elements == 0)
17599 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17600 Subtarget, DAG))
17601 return Rotate;
17602 }
17603
17604 // If the shuffle mask is repeated in each 128-bit lane we can use more
17605 // efficient instructions that mirror the shuffles across the four 128-bit
17606 // lanes.
17607 SmallVector<int, 4> RepeatedMask;
17608 bool Is128BitLaneRepeatedShuffle =
17609 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17610 if (Is128BitLaneRepeatedShuffle) {
17611 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17612 if (V2.isUndef())
17613 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17614 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17615
17616 // Use dedicated unpack instructions for masks that match their pattern.
17617 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17618 return V;
17619 }
17620
17621 // Try to use shift instructions.
17622 if (SDValue Shift =
17623 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17624 Subtarget, DAG, /*BitwiseOnly*/ false))
17625 return Shift;
17626
17627 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17628 if (SDValue Rotate =
17629 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17630 return Rotate;
17631
17632 // Try to use VALIGN.
17633 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17634 Zeroable, Subtarget, DAG))
17635 return Rotate;
17636
17637 // Try to use byte rotation instructions.
17638 if (Subtarget.hasBWI())
17639 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17640 Subtarget, DAG))
17641 return Rotate;
17642
17643 // Assume that a single SHUFPS is faster than using a permv shuffle.
17644 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17645 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17646 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17647 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17648 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17649 CastV1, CastV2, DAG);
17650 return DAG.getBitcast(MVT::v16i32, ShufPS);
17651 }
17652
17653 // Try to create an in-lane repeating shuffle mask and then shuffle the
17654 // results into the target lanes.
17655 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17656 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17657 return V;
17658
17659 // If we have AVX512F support, we can use VEXPAND.
17660 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17661 Zeroable, Subtarget, DAG))
17662 return V;
17663
17664 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17665 Zeroable, Subtarget, DAG))
17666 return Blend;
17667
17668 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17669}
17670
17671/// Handle lowering of 32-lane 16-bit integer shuffles.
17672 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17673 const APInt &Zeroable, SDValue V1, SDValue V2,
17674 const X86Subtarget &Subtarget,
17675 SelectionDAG &DAG) {
17676 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17677 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17678 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17679 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17680
17681 // Whenever we can lower this as a zext, that instruction is strictly faster
17682 // than any alternative. It also allows us to fold memory operands into the
17683 // shuffle in many cases.
17684 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17685 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17686 return ZExt;
17687
17688 // Use dedicated unpack instructions for masks that match their pattern.
17689 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17690 return V;
17691
17692 // Use dedicated pack instructions for masks that match their pattern.
17693 if (SDValue V =
17694 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17695 return V;
17696
17697 // Try to use shift instructions.
17698 if (SDValue Shift =
17699 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17700 Subtarget, DAG, /*BitwiseOnly*/ false))
17701 return Shift;
17702
17703 // Try to use byte rotation instructions.
17704 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17705 Subtarget, DAG))
17706 return Rotate;
17707
17708 if (V2.isUndef()) {
17709 // Try to use bit rotation instructions.
17710 if (SDValue Rotate =
17711 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17712 return Rotate;
17713
17714 SmallVector<int, 8> RepeatedMask;
17715 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17716 // As this is a single-input shuffle, the repeated mask should be
17717 // a strictly valid v8i16 mask that we can pass through to the v8i16
17718 // lowering to handle even the v32 case.
17719 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17720 RepeatedMask, Subtarget, DAG);
17721 }
17722 }
17723
17724 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17725 Zeroable, Subtarget, DAG))
17726 return Blend;
17727
17728 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17729 Zeroable, Subtarget, DAG))
17730 return PSHUFB;
17731
17732 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17733 // shuffle.
17734 if (!V2.isUndef())
17735 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17736 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17737 return Result;
17738
17739 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17740}
17741
17742/// Handle lowering of 64-lane 8-bit integer shuffles.
17743 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17744 const APInt &Zeroable, SDValue V1, SDValue V2,
17745 const X86Subtarget &Subtarget,
17746 SelectionDAG &DAG) {
17747 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17748 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17749 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17750 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17751
17752 // Whenever we can lower this as a zext, that instruction is strictly faster
17753 // than any alternative. It also allows us to fold memory operands into the
17754 // shuffle in many cases.
17755 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17756 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17757 return ZExt;
17758
17759 // Use dedicated unpack instructions for masks that match their pattern.
17760 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17761 return V;
17762
17763 // Use dedicated pack instructions for masks that match their pattern.
17764 if (SDValue V =
17765 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17766 return V;
17767
17768 // Try to use shift instructions.
17769 if (SDValue Shift =
17770 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17771 DAG, /*BitwiseOnly*/ false))
17772 return Shift;
17773
17774 // Try to use byte rotation instructions.
17775 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17776 Subtarget, DAG))
17777 return Rotate;
17778
17779 // Try to use bit rotation instructions.
17780 if (V2.isUndef())
17781 if (SDValue Rotate =
17782 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17783 return Rotate;
17784
17785 // Lower as AND if possible.
17786 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17787 Zeroable, Subtarget, DAG))
17788 return Masked;
17789
17790 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17791 Zeroable, Subtarget, DAG))
17792 return PSHUFB;
17793
17794 // Try to create an in-lane repeating shuffle mask and then shuffle the
17795 // results into the target lanes.
17796 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17797 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17798 return V;
17799
17800 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17801 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17802 return Result;
17803
17804 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17805 Zeroable, Subtarget, DAG))
17806 return Blend;
17807
17808 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17809 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17810 // PALIGNR will be cheaper than the second PSHUFB+OR.
17811 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17812 Mask, Subtarget, DAG))
17813 return V;
17814
17815 // If we can't directly blend but can use PSHUFB, that will be better as it
17816 // can both shuffle and set up the inefficient blend.
17817 bool V1InUse, V2InUse;
17818 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17819 DAG, V1InUse, V2InUse);
17820 }
17821
17822 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17823 // shuffle.
17824 if (!V2.isUndef())
17825 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17826 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17827 return Result;
17828
17829 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17830 if (Subtarget.hasVBMI())
17831 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17832
17833 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17834}
17835
17836/// High-level routine to lower various 512-bit x86 vector shuffles.
17837///
17838/// This routine either breaks down the specific type of a 512-bit x86 vector
17839/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17840/// together based on the available instructions.
17841 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17842 MVT VT, SDValue V1, SDValue V2,
17843 const APInt &Zeroable,
17844 const X86Subtarget &Subtarget,
17845 SelectionDAG &DAG) {
17846 assert(Subtarget.hasAVX512() &&
17847 "Cannot lower 512-bit vectors w/ basic ISA!");
17848
17849 // If we have a single input to the zero element, insert that into V1 if we
17850 // can do so cheaply.
17851 int NumElts = Mask.size();
17852 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17853
17854 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17855 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17856 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17857 return Insertion;
17858
17859 // Handle special cases where the lower or upper half is UNDEF.
17860 if (SDValue V =
17861 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17862 return V;
17863
17864 // Check for being able to broadcast a single element.
17865 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17866 Subtarget, DAG))
17867 return Broadcast;
17868
17869 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17870 // Try using bit ops for masking and blending before falling back to
17871 // splitting.
17872 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17873 Subtarget, DAG))
17874 return V;
17875 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17876 return V;
17877
17878 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17879 }
17880
17881 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17882 if (!Subtarget.hasBWI())
17883 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17884 /*SimpleOnly*/ false);
17885
17886 V1 = DAG.getBitcast(MVT::v32i16, V1);
17887 V2 = DAG.getBitcast(MVT::v32i16, V2);
17888 return DAG.getBitcast(VT,
17889 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17890 }
17891
17892 // Dispatch to each element type for lowering. If we don't have support for
17893 // specific element type shuffles at 512 bits, immediately split them and
17894 // lower them. Each lowering routine of a given type is allowed to assume that
17895 // the requisite ISA extensions for that element type are available.
17896 switch (VT.SimpleTy) {
17897 case MVT::v8f64:
17898 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v16f32:
17900 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v8i64:
17902 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v16i32:
17904 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v32i16:
17906 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907 case MVT::v64i8:
17908 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17909
17910 default:
17911 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17912 }
17913}
17914
17915 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17916 MVT VT, SDValue V1, SDValue V2,
17917 const X86Subtarget &Subtarget,
17918 SelectionDAG &DAG) {
17919 // Shuffle should be unary.
17920 if (!V2.isUndef())
17921 return SDValue();
17922
17923 int ShiftAmt = -1;
17924 int NumElts = Mask.size();
17925 for (int i = 0; i != NumElts; ++i) {
17926 int M = Mask[i];
17927 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17928 "Unexpected mask index.");
17929 if (M < 0)
17930 continue;
17931
17932 // The first non-undef element determines our shift amount.
17933 if (ShiftAmt < 0) {
17934 ShiftAmt = M - i;
17935 // Need to be shifting right.
17936 if (ShiftAmt <= 0)
17937 return SDValue();
17938 }
17939 // All non-undef elements must shift by the same amount.
17940 if (ShiftAmt != M - i)
17941 return SDValue();
17942 }
17943 assert(ShiftAmt >= 0 && "All undef?");
17944
17945 // Great we found a shift right.
17946 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17947 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17948 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17949 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17950 DAG.getVectorIdxConstant(0, DL));
17951}
17952
17953// Determine if this shuffle can be implemented with a KSHIFT instruction.
17954// Returns the shift amount if possible or -1 if not. This is a simplified
17955// version of matchShuffleAsShift.
17956static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17957 int MaskOffset, const APInt &Zeroable) {
17958 int Size = Mask.size();
17959
17960 auto CheckZeros = [&](int Shift, bool Left) {
17961 for (int j = 0; j < Shift; ++j)
17962 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17963 return false;
17964
17965 return true;
17966 };
17967
17968 auto MatchShift = [&](int Shift, bool Left) {
17969 unsigned Pos = Left ? Shift : 0;
17970 unsigned Low = Left ? 0 : Shift;
17971 unsigned Len = Size - Shift;
17972 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17973 };
17974
17975 for (int Shift = 1; Shift != Size; ++Shift)
17976 for (bool Left : {true, false})
17977 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17978 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17979 return Shift;
17980 }
17981
17982 return -1;
17983}
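// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): the mask-as-shift matching idea shared by
// lowerShuffleAsShift and match1BitShuffleAsKSHIFT, restated for the
// right-shift case only over plain STL types. A mask is a right shift by S if
// every defined element i reads source element i + S and the top S result
// elements are allowed to become zero.
#include <vector>

// Returns the shift amount, or -1 if the mask is not a uniform right shift.
// Zeroable[i] == true means result element i may be zero.
static int matchRightShift(const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable) {
  int Size = static_cast<int>(Mask.size());
  for (int Shift = 1; Shift < Size; ++Shift) {
    bool Ok = true;
    for (int i = 0; i < Size && Ok; ++i) {
      if (i >= Size - Shift)
        Ok = Zeroable[i];                         // shifted-in elements: zero
      else
        Ok = Mask[i] < 0 || Mask[i] == i + Shift; // body must be sequential
    }
    if (Ok)
      return Shift;
  }
  return -1;
}
// e.g. for an 8-element mask {2,3,4,5,6,7,Z,Z} (Z zeroable) this returns 2,
// matching the KSHIFTR-by-2 case handled above.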
17984
17985
17986// Lower vXi1 vector shuffles.
17987 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17988 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17989 // vector, shuffle it, and then truncate it back.
17990 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17991 MVT VT, SDValue V1, SDValue V2,
17992 const APInt &Zeroable,
17993 const X86Subtarget &Subtarget,
17994 SelectionDAG &DAG) {
17995 assert(Subtarget.hasAVX512() &&
17996 "Cannot lower 512-bit vectors w/o basic ISA!");
17997
17998 int NumElts = Mask.size();
17999 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18000
18001 // Try to recognize shuffles that are just padding a subvector with zeros.
18002 int SubvecElts = 0;
18003 int Src = -1;
18004 for (int i = 0; i != NumElts; ++i) {
18005 if (Mask[i] >= 0) {
18006 // Grab the source from the first valid mask. All subsequent elements need
18007 // to use this same source.
18008 if (Src < 0)
18009 Src = Mask[i] / NumElts;
18010 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18011 break;
18012 }
18013
18014 ++SubvecElts;
18015 }
18016 assert(SubvecElts != NumElts && "Identity shuffle?");
18017
18018 // Clip to a power of 2.
18019 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18020
18021 // Make sure the number of zeroable bits in the top at least covers the bits
18022 // not covered by the subvector.
18023 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18024 assert(Src >= 0 && "Expected a source!");
18025 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18026 SDValue Extract =
18027 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18028 DAG.getVectorIdxConstant(0, DL));
18029 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18030 DAG.getConstant(0, DL, VT), Extract,
18031 DAG.getVectorIdxConstant(0, DL));
18032 }
18033
18034 // Try a simple shift right with undef elements. Later we'll try with zeros.
18035 if (SDValue Shift =
18036 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18037 return Shift;
18038
18039 // Try to match KSHIFTs.
18040 unsigned Offset = 0;
18041 for (SDValue V : {V1, V2}) {
18042 unsigned Opcode;
18043 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18044 if (ShiftAmt >= 0) {
18045 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18046 MVT WideVT = Res.getSimpleValueType();
18047 // Widened right shifts need two shifts to ensure we shift in zeroes.
18048 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18049 int WideElts = WideVT.getVectorNumElements();
18050 // Shift left to put the original vector in the MSBs of the new size.
18051 Res =
18052 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18053 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18054 // Increase the shift amount to account for the left shift.
18055 ShiftAmt += WideElts - NumElts;
18056 }
18057
18058 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18059 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18060 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18061 DAG.getVectorIdxConstant(0, DL));
18062 }
18063 Offset += NumElts; // Increment for next iteration.
18064 }
18065
18066 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18067 // ops instead.
18068 // TODO: What other unary shuffles would benefit from this?
18069 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18070 SDValue Op0 = V1.getOperand(0);
18071 SDValue Op1 = V1.getOperand(1);
18072 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18073 EVT OpVT = Op0.getValueType();
18074 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18075 return DAG.getSetCC(
18076 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18077 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18078 }
18079
18080 MVT ExtVT;
18081 switch (VT.SimpleTy) {
18082 default:
18083 llvm_unreachable("Expected a vector of i1 elements");
18084 case MVT::v2i1:
18085 ExtVT = MVT::v2i64;
18086 break;
18087 case MVT::v4i1:
18088 ExtVT = MVT::v4i32;
18089 break;
18090 case MVT::v8i1:
18091 // Take a 512-bit type for more shuffle options on KNL. If we have VLX, use a
18092 // 256-bit shuffle.
18093 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18094 break;
18095 case MVT::v16i1:
18096 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18097 // 256-bit operation available.
18098 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18099 break;
18100 case MVT::v32i1:
18101 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18102 // 256-bit operation available.
18103 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18104 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18105 break;
18106 case MVT::v64i1:
18107 // Fall back to scalarization. FIXME: We can do better if the shuffle
18108 // can be partitioned cleanly.
18109 if (!Subtarget.useBWIRegs())
18110 return SDValue();
18111 ExtVT = MVT::v64i8;
18112 break;
18113 }
18114
18115 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18116 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18117
18118 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18119 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
18120 int NumElems = VT.getVectorNumElements();
18121 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18122 (Subtarget.hasDQI() && (NumElems < 32)))
18123 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18124 Shuffle, ISD::SETGT);
18125
18126 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18127}
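// Illustrative sketch (not from the LLVM sources; the helper name is made up
// for illustration): the double-shift fix-up above for right shifts on
// widened mask registers, modeled on a plain 16-bit integer standing in for a
// k-register. If a v8i1 value is widened to v16i1, a plain KSHIFTR would pull
// unknown bits 8..15 into the result, so the value is first shifted left by
// (WideElts - NumElts) and the right-shift amount is increased by the same
// delta, guaranteeing that zeros are shifted in.
#include <cstdint>

static uint16_t widenedKshiftr(uint16_t WideBits, int NumElts, int WideElts,
                               int ShiftAmt) {
  int Delta = WideElts - NumElts;                           // e.g. 16 - 8 = 8
  uint16_t Res = static_cast<uint16_t>(WideBits << Delta);  // KSHIFTL by Delta
  Res = static_cast<uint16_t>(Res >> (ShiftAmt + Delta));   // adjusted KSHIFTR
  return Res;                               // low NumElts bits are the result
}
// e.g. NumElts=8, WideElts=16, ShiftAmt=2: bits 2..7 of the original v8i1 end
// up in bits 0..5 and bits 6..7 become zero, matching the code above.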
18128
18129/// Helper function that returns true if the shuffle mask should be
18130/// commuted to improve canonicalization.
18131 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18132 int NumElements = Mask.size();
18133
18134 int NumV1Elements = 0, NumV2Elements = 0;
18135 for (int M : Mask)
18136 if (M < 0)
18137 continue;
18138 else if (M < NumElements)
18139 ++NumV1Elements;
18140 else
18141 ++NumV2Elements;
18142
18143 // Commute the shuffle as needed such that more elements come from V1 than
18144 // V2. This allows us to match the shuffle pattern strictly on how many
18145 // elements come from V1 without handling the symmetric cases.
18146 if (NumV2Elements > NumV1Elements)
18147 return true;
18148
18149 assert(NumV1Elements > 0 && "No V1 indices");
18150
18151 if (NumV2Elements == 0)
18152 return false;
18153
18154 // When the number of V1 and V2 elements is the same, try to minimize the
18155 // number of uses of V2 in the low half of the vector. When that is tied,
18156 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18157 // indices for V2. When those are equal, try to ensure that the number of odd
18158 // indices for V1 is lower than the number of odd indices for V2.
18159 if (NumV1Elements == NumV2Elements) {
18160 int LowV1Elements = 0, LowV2Elements = 0;
18161 for (int M : Mask.slice(0, NumElements / 2))
18162 if (M >= NumElements)
18163 ++LowV2Elements;
18164 else if (M >= 0)
18165 ++LowV1Elements;
18166 if (LowV2Elements > LowV1Elements)
18167 return true;
18168 if (LowV2Elements == LowV1Elements) {
18169 int SumV1Indices = 0, SumV2Indices = 0;
18170 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18171 if (Mask[i] >= NumElements)
18172 SumV2Indices += i;
18173 else if (Mask[i] >= 0)
18174 SumV1Indices += i;
18175 if (SumV2Indices < SumV1Indices)
18176 return true;
18177 if (SumV2Indices == SumV1Indices) {
18178 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18179 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18180 if (Mask[i] >= NumElements)
18181 NumV2OddIndices += i % 2;
18182 else if (Mask[i] >= 0)
18183 NumV1OddIndices += i % 2;
18184 if (NumV2OddIndices < NumV1OddIndices)
18185 return true;
18186 }
18187 }
18188 }
18189
18190 return false;
18191}
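// Illustrative sketch (not from the LLVM sources; plain STL types, -1 for
// undef): what "commuting" a shuffle means for the decision made above.
// Swapping V1 and V2 requires remapping every defined mask index across the
// NumElements boundary; the helper above merely decides when that commute
// makes the mask cheaper to match.
#include <vector>

static void commuteMask(std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  for (int &M : Mask)
    if (M >= 0)
      M = (M < NumElts) ? M + NumElts : M - NumElts; // V1 index <-> V2 index
}
// e.g. with 4 elements, {4,5,0,1} commutes to {0,1,4,5} together with
// std::swap(V1, V2), so the low half now reads from the first operand, which
// is what the tie-breaking rules above prefer.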
18192
18193 static bool canCombineAsMaskOperation(SDValue V,
18194 const X86Subtarget &Subtarget) {
18195 if (!Subtarget.hasAVX512())
18196 return false;
18197
18198 if (!V.getValueType().isSimple())
18199 return false;
18200
18201 MVT VT = V.getSimpleValueType().getScalarType();
18202 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18203 return false;
18204
18205 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18206 // are preferable to blendw/blendvb/masked-mov.
18207 if ((VT == MVT::i16 || VT == MVT::i8) &&
18208 V.getSimpleValueType().getSizeInBits() < 512)
18209 return false;
18210
18211 auto HasMaskOperation = [&](SDValue V) {
18212 // TODO: Currently we only check a limited set of opcodes. We could probably
18213 // extend this to all binary operations by checking TLI.isBinOp().
18214 switch (V->getOpcode()) {
18215 default:
18216 return false;
18217 case ISD::ADD:
18218 case ISD::SUB:
18219 case ISD::AND:
18220 case ISD::XOR:
18221 case ISD::OR:
18222 case ISD::SMAX:
18223 case ISD::SMIN:
18224 case ISD::UMAX:
18225 case ISD::UMIN:
18226 case ISD::ABS:
18227 case ISD::SHL:
18228 case ISD::SRL:
18229 case ISD::SRA:
18230 case ISD::MUL:
18231 break;
18232 }
18233 if (!V->hasOneUse())
18234 return false;
18235
18236 return true;
18237 };
18238
18239 if (HasMaskOperation(V))
18240 return true;
18241
18242 return false;
18243}
18244
18245// Forward declaration.
18246 static SDValue canonicalizeShuffleMaskWithHorizOp(
18247 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18248 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18249 const X86Subtarget &Subtarget);
18250
18251 /// Top-level lowering for x86 vector shuffles.
18252///
18253/// This handles decomposition, canonicalization, and lowering of all x86
18254/// vector shuffles. Most of the specific lowering strategies are encapsulated
18255/// above in helper routines. The canonicalization attempts to widen shuffles
18256/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18257/// s.t. only one of the two inputs needs to be tested, etc.
18258 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18259 SelectionDAG &DAG) {
18260 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18261 ArrayRef<int> OrigMask = SVOp->getMask();
18262 SDValue V1 = Op.getOperand(0);
18263 SDValue V2 = Op.getOperand(1);
18264 MVT VT = Op.getSimpleValueType();
18265 int NumElements = VT.getVectorNumElements();
18266 SDLoc DL(Op);
18267 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18268
18269 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18270 "Can't lower MMX shuffles");
18271
18272 bool V1IsUndef = V1.isUndef();
18273 bool V2IsUndef = V2.isUndef();
18274 if (V1IsUndef && V2IsUndef)
18275 return DAG.getUNDEF(VT);
18276
18277 // When we create a shuffle node we put the UNDEF node to second operand,
18278 // but in some cases the first operand may be transformed to UNDEF.
18279 // In this case we should just commute the node.
18280 if (V1IsUndef)
18281 return DAG.getCommutedVectorShuffle(*SVOp);
18282
18283 // Check for non-undef masks pointing at an undef vector and make the masks
18284 // undef as well. This makes it easier to match the shuffle based solely on
18285 // the mask.
18286 if (V2IsUndef &&
18287 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18288 SmallVector<int, 8> NewMask(OrigMask);
18289 for (int &M : NewMask)
18290 if (M >= NumElements)
18291 M = -1;
18292 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18293 }
18294
18295 // Check for illegal shuffle mask element index values.
18296 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18297 (void)MaskUpperLimit;
18298 assert(llvm::all_of(OrigMask,
18299 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18300 "Out of bounds shuffle index");
18301
18302 // We actually see shuffles that are entirely re-arrangements of a set of
18303 // zero inputs. This mostly happens while decomposing complex shuffles into
18304 // simple ones. Directly lower these as a buildvector of zeros.
18305 APInt KnownUndef, KnownZero;
18306 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18307
18308 APInt Zeroable = KnownUndef | KnownZero;
18309 if (Zeroable.isAllOnes())
18310 return getZeroVector(VT, Subtarget, DAG, DL);
18311
18312 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18313
18314 // Try to collapse shuffles into using a vector type with fewer elements but
18315 // wider element types. We cap this to not form integers or floating point
18316 // elements wider than 64 bits. It does not seem beneficial to form i128
18317 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18318 SmallVector<int, 16> WidenedMask;
18319 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18320 !canCombineAsMaskOperation(V1, Subtarget) &&
18321 !canCombineAsMaskOperation(V2, Subtarget) &&
18322 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18323 // Shuffle mask widening should not interfere with a broadcast opportunity
18324 // by obfuscating the operands with bitcasts.
18325 // TODO: Avoid lowering directly from this top-level function: make this
18326 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18327 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18328 Subtarget, DAG))
18329 return Broadcast;
18330
18331 MVT NewEltVT = VT.isFloatingPoint()
18332 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18333 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18334 int NewNumElts = NumElements / 2;
18335 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18336 // Make sure that the new vector type is legal. For example, v2f64 isn't
18337 // legal on SSE1.
18338 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18339 if (V2IsZero) {
18340 // Modify the new Mask to take all zeros from the all-zero vector.
18341 // Choose indices that are blend-friendly.
18342 bool UsedZeroVector = false;
18343 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18344 "V2's non-undef elements are used?!");
18345 for (int i = 0; i != NewNumElts; ++i)
18346 if (WidenedMask[i] == SM_SentinelZero) {
18347 WidenedMask[i] = i + NewNumElts;
18348 UsedZeroVector = true;
18349 }
18350 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18351 // some elements to be undef.
18352 if (UsedZeroVector)
18353 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18354 }
18355 V1 = DAG.getBitcast(NewVT, V1);
18356 V2 = DAG.getBitcast(NewVT, V2);
18357 return DAG.getBitcast(
18358 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18359 }
18360 }
18361
18362 SmallVector<SDValue> Ops = {V1, V2};
18363 SmallVector<int> Mask(OrigMask);
18364
18365 // Canonicalize the shuffle with any horizontal ops inputs.
18366 // NOTE: This may update Ops and Mask.
18367 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18368 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18369 return DAG.getBitcast(VT, HOp);
18370
18371 V1 = DAG.getBitcast(VT, Ops[0]);
18372 V2 = DAG.getBitcast(VT, Ops[1]);
18373 assert(NumElements == (int)Mask.size() &&
18374 "canonicalizeShuffleMaskWithHorizOp "
18375 "shouldn't alter the shuffle mask size");
18376
18377 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18378 // These will be materialized uniformly anyway, so make splat matching easier.
18379 // TODO: Allow all int constants?
18380 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18381 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18382 BitVector Undefs;
18383 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18384 if (Undefs.any() &&
18385 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18386 isa<ConstantFPSDNode>(Splat))) {
18387 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18388 }
18389 }
18390 }
18391 return V;
18392 };
18393 V1 = CanonicalizeConstant(V1);
18394 V2 = CanonicalizeConstant(V2);
18395
18396 // Commute the shuffle if it will improve canonicalization.
18397 if (canonicalizeShuffleMaskWithCommute(V1, V2, Mask)) {
18398 ShuffleVectorSDNode::commuteMask(Mask);
18399 std::swap(V1, V2);
18400 }
18401
18402 // For each vector width, delegate to a specialized lowering routine.
18403 if (VT.is128BitVector())
18404 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18405
18406 if (VT.is256BitVector())
18407 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18408
18409 if (VT.is512BitVector())
18410 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18411
18412 if (Is1BitVector)
18413 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18414
18415 llvm_unreachable("Unimplemented!");
18416}
18417
18418// As legal vpcompress instructions depend on various AVX512 extensions, try to
18419// convert illegal vector sizes to legal ones to avoid expansion.
18420 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18421 SelectionDAG &DAG) {
18422 assert(Subtarget.hasAVX512() &&
18423 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18424
18425 SDLoc DL(Op);
18426 SDValue Vec = Op.getOperand(0);
18427 SDValue Mask = Op.getOperand(1);
18428 SDValue Passthru = Op.getOperand(2);
18429
18430 EVT VecVT = Vec.getValueType();
18431 EVT ElementVT = VecVT.getVectorElementType();
18432 unsigned NumElements = VecVT.getVectorNumElements();
18433 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18434 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18435
18436 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18437 // compressed as 512-bit vectors in AVX512F.
18438 if (NumVecBits != 128 && NumVecBits != 256)
18439 return SDValue();
18440
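// For 32/64-bit elements, widen the source, mask and passthru to 512 bits,
// compress with the AVX512F instruction, then take the original-width
// subvector from the low elements of the result.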
18441 if (NumElementBits == 32 || NumElementBits == 64) {
18442 unsigned NumLargeElements = 512 / NumElementBits;
18443 MVT LargeVecVT =
18444 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18445 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18446
18447 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18448 DAG, DL);
18449 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18450 Subtarget, DAG, DL);
18451 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18452 : widenSubVector(LargeVecVT, Passthru,
18453 /*ZeroNewElements=*/false,
18454 Subtarget, DAG, DL);
18455
18456 SDValue Compressed =
18457 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18458 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18459 DAG.getConstant(0, DL, MVT::i64));
18460 }
18461
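// For 8/16-bit elements, any-extend each element so the whole vector becomes
// 512 bits wide (keeping the element count), compress in the wider type, and
// truncate the result back to the original type.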
18462 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18463 VecVT == MVT::v16i16) {
18464 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18465 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18466
18467 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18468 Passthru = Passthru.isUndef()
18469 ? DAG.getUNDEF(LargeVecVT)
18470 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18471
18472 SDValue Compressed =
18473 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18474 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18475 }
18476
18477 return SDValue();
18478}
18479
18480/// Try to lower a VSELECT instruction to a vector shuffle.
18481 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18482 const X86Subtarget &Subtarget,
18483 SelectionDAG &DAG) {
18484 SDValue Cond = Op.getOperand(0);
18485 SDValue LHS = Op.getOperand(1);
18486 SDValue RHS = Op.getOperand(2);
18487 MVT VT = Op.getSimpleValueType();
18488
18489 // Only non-legal VSELECTs reach this lowering; convert those into generic
18490 // shuffles and re-use the shuffle lowering path for blends.
18491 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18492 SmallVector<int, 32> Mask;
18493 if (createShuffleMaskFromVSELECT(Mask, Cond))
18494 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18495 }
18496
18497 return SDValue();
18498}
18499
18500SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18501 SDValue Cond = Op.getOperand(0);
18502 SDValue LHS = Op.getOperand(1);
18503 SDValue RHS = Op.getOperand(2);
18504
18505 SDLoc dl(Op);
18506 MVT VT = Op.getSimpleValueType();
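// When f16 is only softly supported, perform the select on the equivalent
// integer vector type and bitcast the result back.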
18507 if (isSoftF16(VT, Subtarget)) {
18508 MVT NVT = VT.changeVectorElementTypeToInteger();
18509 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18510 DAG.getBitcast(NVT, LHS),
18511 DAG.getBitcast(NVT, RHS)));
18512 }
18513
18514 // A vselect where all conditions and data are constants can be optimized into
18515 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18516 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18517 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18518 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18519 return SDValue();
18520
18521 // Try to lower this to a blend-style vector shuffle. This can handle all
18522 // constant condition cases.
18523 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18524 return BlendOp;
18525
18526 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18527 // with patterns on the mask registers on AVX-512.
18528 MVT CondVT = Cond.getSimpleValueType();
18529 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18530 if (CondEltSize == 1)
18531 return Op;
18532
18533 // Variable blends are only legal from SSE4.1 onward.
18534 if (!Subtarget.hasSSE41())
18535 return SDValue();
18536
18537 unsigned EltSize = VT.getScalarSizeInBits();
18538 unsigned NumElts = VT.getVectorNumElements();
18539
18540 // Expand v32i16/v64i8 without BWI.
18541 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18542 return SDValue();
18543
18544 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18545 // into an i1 condition so that we can use the mask-based 512-bit blend
18546 // instructions.
18547 if (VT.getSizeInBits() == 512) {
18548 // Build a mask by testing the condition against zero.
18549 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18550 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18551 DAG.getConstant(0, dl, CondVT),
18552 ISD::SETNE);
18553 // Now return a new VSELECT using the mask.
18554 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18555 }
18556
18557 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18558 if (CondEltSize != EltSize) {
18559 // If we don't have a sign splat, rely on the expansion.
18560 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18561 return SDValue();
18562
18563 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18564 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18565 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18566 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18567 }
18568
18569 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18570 // are free to split, then it is better to split before expanding the
18571 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18572 // TODO: This is very similar to narrowVectorSelect.
18573 // TODO: Add Load splitting to isFreeToSplitVector ?
18574 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18575 !Subtarget.hasXOP()) {
18576 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18577 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18578 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18579 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18580 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18581 if (FreeCond && (FreeLHS || FreeRHS))
18582 return splitVectorOp(Op, DAG, dl);
18583 }
18584
18585 // Only some types will be legal on some subtargets. If we can emit a legal
18586 // VSELECT-matching blend, return Op; but if we need to expand, return
18587 // a null value.
18588 switch (VT.SimpleTy) {
18589 default:
18590 // Most of the vector types have blends past SSE4.1.
18591 return Op;
18592
18593 case MVT::v32i8:
18594 // The byte blends for AVX vectors were introduced only in AVX2.
18595 if (Subtarget.hasAVX2())
18596 return Op;
18597
18598 return SDValue();
18599
18600 case MVT::v8i16:
18601 case MVT::v16i16:
18602 case MVT::v8f16:
18603 case MVT::v16f16: {
18604 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18605 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18606 Cond = DAG.getBitcast(CastVT, Cond);
18607 LHS = DAG.getBitcast(CastVT, LHS);
18608 RHS = DAG.getBitcast(CastVT, RHS);
18609 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18610 return DAG.getBitcast(VT, Select);
18611 }
18612 }
18613}
18614
18615 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18616 MVT VT = Op.getSimpleValueType();
18617 SDValue Vec = Op.getOperand(0);
18618 SDValue Idx = Op.getOperand(1);
18619 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18620 SDLoc dl(Op);
18621
18622 if (!Vec.getSimpleValueType().is128BitVector())
18623 return SDValue();
18624
18625 if (VT.getSizeInBits() == 8) {
18626 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18627 // we're going to zero extend the register or fold the store.
18628 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18629 !X86::mayFoldIntoStore(Op))
18630 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18631 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18632 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18633
18634 unsigned IdxVal = Idx->getAsZExtVal();
18635 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18636 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18637 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18638 }
18639
18640 if (VT == MVT::f32) {
18641 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18642 // the result back to FR32 register. It's only worth matching if the
18643 // result has a single use which is a store or a bitcast to i32. And in
18644 // the case of a store, it's not worth it if the index is a constant 0,
18645 // because a MOVSSmr can be used instead, which is smaller and faster.
18646 if (!Op.hasOneUse())
18647 return SDValue();
18648 SDNode *User = *Op.getNode()->user_begin();
18649 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18650 (User->getOpcode() != ISD::BITCAST ||
18651 User->getValueType(0) != MVT::i32))
18652 return SDValue();
18653 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18654 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18655 return DAG.getBitcast(MVT::f32, Extract);
18656 }
18657
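// i32/i64 extracts can be left as-is; instruction selection matches them
// directly (e.g. PEXTRD/PEXTRQ on SSE4.1).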
18658 if (VT == MVT::i32 || VT == MVT::i64)
18659 return Op;
18660
18661 return SDValue();
18662}
18663
18664/// Extract one bit from mask vector, like v16i1 or v8i1.
18665/// AVX-512 feature.
18666 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18667 const X86Subtarget &Subtarget) {
18668 SDValue Vec = Op.getOperand(0);
18669 SDLoc dl(Vec);
18670 MVT VecVT = Vec.getSimpleValueType();
18671 SDValue Idx = Op.getOperand(1);
18672 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18673 MVT EltVT = Op.getSimpleValueType();
18674
18675 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18676 "Unexpected vector type in ExtractBitFromMaskVector");
18677
18678 // A variable index can't be handled in mask registers,
18679 // so extend the vector to VR512/VR128.
18680 if (!IdxC) {
18681 unsigned NumElts = VecVT.getVectorNumElements();
18682 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18683 // than extending to 128/256-bit.
18684 if (NumElts == 1) {
18685 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18686 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18687 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18688 }
18689 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18690 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18691 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18692 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18693 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18694 }
18695
18696 unsigned IdxVal = IdxC->getZExtValue();
18697 if (IdxVal == 0) // the operation is legal
18698 return Op;
18699
18700 // Extend to natively supported kshift.
18701 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18702
18703 // Use kshiftr instruction to move to the lower element.
18704 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18705 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18706
18707 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18708 DAG.getVectorIdxConstant(0, dl));
18709}
18710
18711// Helper to find all the extracted elements from a vector.
18712 static APInt getExtractedDemandedElts(SDNode *N) {
18713 MVT VT = N->getSimpleValueType(0);
18714 unsigned NumElts = VT.getVectorNumElements();
18715 APInt DemandedElts = APInt::getZero(NumElts);
18716 for (SDNode *User : N->users()) {
18717 switch (User->getOpcode()) {
18718 case X86ISD::PEXTRB:
18719 case X86ISD::PEXTRW:
18720 case ISD::EXTRACT_VECTOR_ELT:
18721 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18722 DemandedElts.setAllBits();
18723 return DemandedElts;
18724 }
18725 DemandedElts.setBit(User->getConstantOperandVal(1));
18726 break;
18727 case ISD::BITCAST: {
18728 if (!User->getValueType(0).isSimple() ||
18729 !User->getValueType(0).isVector()) {
18730 DemandedElts.setAllBits();
18731 return DemandedElts;
18732 }
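// Recurse into the bitcast and rescale its demanded elements to this
// vector's element count.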
18733 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18734 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18735 break;
18736 }
18737 default:
18738 DemandedElts.setAllBits();
18739 return DemandedElts;
18740 }
18741 }
18742 return DemandedElts;
18743}
18744
18745SDValue
18746X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18747 SelectionDAG &DAG) const {
18748 SDLoc dl(Op);
18749 SDValue Vec = Op.getOperand(0);
18750 MVT VecVT = Vec.getSimpleValueType();
18751 SDValue Idx = Op.getOperand(1);
18752 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18753
18754 if (VecVT.getVectorElementType() == MVT::i1)
18755 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18756
18757 if (!IdxC) {
18758 // It's more profitable to go through memory (1 cycle throughput)
18759 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18760 // The IACA tool was used to get the performance estimation
18761 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18762 //
18763 // example : extractelement <16 x i8> %a, i32 %i
18764 //
18765 // Block Throughput: 3.00 Cycles
18766 // Throughput Bottleneck: Port5
18767 //
18768 // | Num Of | Ports pressure in cycles | |
18769 // | Uops | 0 - DV | 5 | 6 | 7 | |
18770 // ---------------------------------------------
18771 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18772 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18773 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18774 // Total Num Of Uops: 4
18775 //
18776 //
18777 // Block Throughput: 1.00 Cycles
18778 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18779 //
18780 // | | Ports pressure in cycles | |
18781 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18782 // ---------------------------------------------------------
18783 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18784 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18785 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18786 // Total Num Of Uops: 4
18787
18788 return SDValue();
18789 }
18790
18791 unsigned IdxVal = IdxC->getZExtValue();
18792
18793 // If this is a 256-bit vector result, first extract the 128-bit vector and
18794 // then extract the element from the 128-bit vector.
18795 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18796 // Get the 128-bit vector.
18797 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18798 MVT EltVT = VecVT.getVectorElementType();
18799
18800 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18801 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18802
18803 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18804 // this can be done with a mask.
18805 IdxVal &= ElemsPerChunk - 1;
18806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18807 DAG.getVectorIdxConstant(IdxVal, dl));
18808 }
18809
18810 assert(VecVT.is128BitVector() && "Unexpected vector length");
18811
18812 MVT VT = Op.getSimpleValueType();
18813
18814 if (VT == MVT::i16) {
18815 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18816 // we're going to zero extend the register or fold the store (SSE41 only).
18817 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18818 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18819 if (Subtarget.hasFP16())
18820 return Op;
18821
18822 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18823 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18824 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18825 }
18826
18827 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18828 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18829 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18830 }
18831
18832 if (Subtarget.hasSSE41())
18833 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18834 return Res;
18835
18836 // Only extract a single element from a v16i8 source - determine the common
18837 // DWORD/WORD that all extractions share, and extract the sub-byte.
18838 // TODO: Add QWORD MOVQ extraction?
18839 if (VT == MVT::i8) {
18840 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18841 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18842
18843 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18844 int DWordIdx = IdxVal / 4;
18845 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18846 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18847 DAG.getBitcast(MVT::v4i32, Vec),
18848 DAG.getVectorIdxConstant(DWordIdx, dl));
18849 int ShiftVal = (IdxVal % 4) * 8;
18850 if (ShiftVal != 0)
18851 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18852 DAG.getConstant(ShiftVal, dl, MVT::i8));
18853 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18854 }
18855
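// Otherwise, if every demanded byte lies within the same 16-bit word as the
// requested element, extract that word and shift/truncate out the byte.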
18856 int WordIdx = IdxVal / 2;
18857 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18858 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18859 DAG.getBitcast(MVT::v8i16, Vec),
18860 DAG.getVectorIdxConstant(WordIdx, dl));
18861 int ShiftVal = (IdxVal % 2) * 8;
18862 if (ShiftVal != 0)
18863 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18864 DAG.getConstant(ShiftVal, dl, MVT::i8));
18865 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18866 }
18867 }
18868
18869 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18870 if (IdxVal == 0)
18871 return Op;
18872
18873 // Shuffle the element to the lowest element, then movss or movsh.
18874 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18875 Mask[0] = static_cast<int>(IdxVal);
18876 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18877 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18878 DAG.getVectorIdxConstant(0, dl));
18879 }
18880
18881 if (VT.getSizeInBits() == 64) {
18882 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18883 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18884 // to match extract_elt for f64.
18885 if (IdxVal == 0)
18886 return Op;
18887
18888 // UNPCKHPD the element to the lowest double word, then movsd.
18889 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18890 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18891 int Mask[2] = { 1, -1 };
18892 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18893 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18894 DAG.getVectorIdxConstant(0, dl));
18895 }
18896
18897 return SDValue();
18898}
18899
18900/// Insert one bit to mask vector, like v16i1 or v8i1.
18901/// AVX-512 feature.
18902 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18903 const X86Subtarget &Subtarget) {
18904 SDLoc dl(Op);
18905 SDValue Vec = Op.getOperand(0);
18906 SDValue Elt = Op.getOperand(1);
18907 SDValue Idx = Op.getOperand(2);
18908 MVT VecVT = Vec.getSimpleValueType();
18909
18910 if (!isa<ConstantSDNode>(Idx)) {
18911 // Non-constant index. Extend the source and destination,
18912 // insert the element, and then truncate the result.
18913 unsigned NumElts = VecVT.getVectorNumElements();
18914 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18915 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18916 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18917 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18918 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18919 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18920 }
18921
18922 // Copy into a k-register, extract to v1i1 and insert_subvector.
18923 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18924 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18925}
18926
18927SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18928 SelectionDAG &DAG) const {
18929 MVT VT = Op.getSimpleValueType();
18930 MVT EltVT = VT.getVectorElementType();
18931 unsigned NumElts = VT.getVectorNumElements();
18932 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18933
18934 if (EltVT == MVT::i1)
18935 return InsertBitToMaskVector(Op, DAG, Subtarget);
18936
18937 SDLoc dl(Op);
18938 SDValue N0 = Op.getOperand(0);
18939 SDValue N1 = Op.getOperand(1);
18940 SDValue N2 = Op.getOperand(2);
18941 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18942
18943 if (EltVT == MVT::bf16) {
18944 MVT IVT = VT.changeVectorElementTypeToInteger();
18945 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18946 DAG.getBitcast(IVT, N0),
18947 DAG.getBitcast(MVT::i16, N1), N2);
18948 return DAG.getBitcast(VT, Res);
18949 }
18950
18951 if (!N2C) {
18952 // For variable insertion indices we're usually better off spilling to stack,
18953 // but AVX512 can use a variable compare+select by comparing against all
18954 // possible vector indices, and FP insertion has less gpr->simd traffic.
18955 if (!(Subtarget.hasBWI() ||
18956 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18957 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18958 return SDValue();
18959
18960 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18961 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18962 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18963 return SDValue();
18964
18965 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18966 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18967 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18968
18969 SmallVector<SDValue, 16> RawIndices;
18970 for (unsigned I = 0; I != NumElts; ++I)
18971 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18972 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18973
18974 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18975 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18976 ISD::CondCode::SETEQ);
18977 }
18978
18979 if (N2C->getAPIntValue().uge(NumElts))
18980 return SDValue();
18981 uint64_t IdxVal = N2C->getZExtValue();
18982
18983 bool IsZeroElt = X86::isZeroNode(N1);
18984 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18985
18986 if (IsZeroElt || IsAllOnesElt) {
18987 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18988 // We don't deal with i8 0 since it appears to be handled elsewhere.
18989 if (IsAllOnesElt &&
18990 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18991 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18992 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18993 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18994 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18995 CstVectorElts[IdxVal] = OnesCst;
18996 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18997 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18998 }
18999 // See if we can do this more efficiently with a blend shuffle with a
19000 // rematerializable vector.
19001 if (Subtarget.hasSSE41() &&
19002 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19003 SmallVector<int, 8> BlendMask;
19004 for (unsigned i = 0; i != NumElts; ++i)
19005 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19006 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19007 : getOnesVector(VT, DAG, dl);
19008 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19009 }
19010 }
19011
19012 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19013 // into that, and then insert the subvector back into the result.
19014 if (VT.is256BitVector() || VT.is512BitVector()) {
19015 // With a 256-bit vector, we can insert into the zero element efficiently
19016 // using a blend if we have AVX or AVX2 and the right data type.
19017 if (VT.is256BitVector() && IdxVal == 0) {
19018 // TODO: It is worthwhile to cast integer to floating point and back
19019 // and incur a domain crossing penalty if that's what we'll end up
19020 // doing anyway after extracting to a 128-bit vector.
19021 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19022 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19023 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19024 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19025 DAG.getTargetConstant(1, dl, MVT::i8));
19026 }
19027 }
19028
19029 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19030 assert(isPowerOf2_32(NumEltsIn128) &&
19031 "Vectors will always have power-of-two number of elements.");
19032
19033 // If we are not inserting into the low 128-bit vector chunk,
19034 // then prefer the broadcast+blend sequence.
19035 // FIXME: relax the profitability check iff all N1 uses are insertions.
19036 if (IdxVal >= NumEltsIn128 &&
19037 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19038 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19039 X86::mayFoldLoad(N1, Subtarget)))) {
19040 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19041 SmallVector<int, 8> BlendMask;
19042 for (unsigned i = 0; i != NumElts; ++i)
19043 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19044 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19045 }
19046
19047 // Get the desired 128-bit vector chunk.
19048 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19049
19050 // Insert the element into the desired chunk.
19051 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19052 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19053
19054 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19055 DAG.getVectorIdxConstant(IdxIn128, dl));
19056
19057 // Insert the changed part back into the bigger vector
19058 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19059 }
19060 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19061
19062 // This will be just movw/movd/movq/movsh/movss/movsd.
19063 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19064 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19065 EltVT == MVT::f16 || EltVT == MVT::i64) {
19066 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19067 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19068 }
19069
19070 // We can't directly insert an i8 or i16 into a vector, so zero extend
19071 // it to i32 first.
19072 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19073 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19074 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19075 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19076 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19077 return DAG.getBitcast(VT, N1);
19078 }
19079 }
19080
19081 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19082 // argument. SSE41 is required for pinsrb.
19083 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19084 unsigned Opc;
19085 if (VT == MVT::v8i16) {
19086 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19087 Opc = X86ISD::PINSRW;
19088 } else {
19089 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19090 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19091 Opc = X86ISD::PINSRB;
19092 }
19093
19094 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19095 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19096 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19097 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19098 }
19099
19100 if (Subtarget.hasSSE41()) {
19101 if (EltVT == MVT::f32) {
19102 // Bits [7:6] of the constant are the source select. This will always be
19103 // zero here. The DAG Combiner may combine an extract_elt index into
19104 // these bits. For example (insert (extract, 3), 2) could be matched by
19105 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19106 // Bits [5:4] of the constant are the destination select. This is the
19107 // value of the incoming immediate.
19108 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19109 // combine either bitwise AND or insert of float 0.0 to set these bits.
19110
19111 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19112 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19113 // If this is an insertion of 32-bits into the low 32-bits of
19114 // a vector, we prefer to generate a blend with immediate rather
19115 // than an insertps. Blends are simpler operations in hardware and so
19116 // will always have equal or better performance than insertps.
19117 // But if optimizing for size and there's a load folding opportunity,
19118 // generate insertps because blendps does not have a 32-bit memory
19119 // operand form.
19120 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19121 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19122 DAG.getTargetConstant(1, dl, MVT::i8));
19123 }
19124 // Create this as a scalar to vector.
19125 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19126 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19127 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19128 }
19129
19130 // PINSR* works with constant index.
19131 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19132 return Op;
19133 }
19134
19135 return SDValue();
19136}
19137
19138 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19139 SelectionDAG &DAG) {
19140 SDLoc dl(Op);
19141 MVT OpVT = Op.getSimpleValueType();
19142
19143 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19144 // further combines.
19145 if (X86::isZeroNode(Op.getOperand(0)))
19146 return getZeroVector(OpVT, Subtarget, DAG, dl);
19147
19148 // If this is a 256-bit vector result, first insert into a 128-bit
19149 // vector and then insert into the 256-bit vector.
19150 if (!OpVT.is128BitVector()) {
19151 // Insert into a 128-bit vector.
19152 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19153 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19154 OpVT.getVectorNumElements() / SizeFactor);
19155
19156 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19157
19158 // Insert the 128-bit vector.
19159 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19160 }
19161 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19162 "Expected an SSE type!");
19163
19164 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19165 // tblgen.
19166 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19167 return Op;
19168
19169 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19170 return DAG.getBitcast(
19171 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19172}
19173
19174// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19175// simple superregister reference or explicit instructions to insert
19176// the upper bits of a vector.
19177 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19178 SelectionDAG &DAG) {
19179 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19180
19181 return insert1BitVector(Op, DAG, Subtarget);
19182}
19183
19184 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19185 SelectionDAG &DAG) {
19186 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19187 "Only vXi1 extract_subvectors need custom lowering");
19188
19189 SDLoc dl(Op);
19190 SDValue Vec = Op.getOperand(0);
19191 uint64_t IdxVal = Op.getConstantOperandVal(1);
19192
19193 if (IdxVal == 0) // the operation is legal
19194 return Op;
19195
19196 // Extend to natively supported kshift.
19197 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19198
19199 // Shift to the LSB.
19200 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19201 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19202
19203 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19204 DAG.getVectorIdxConstant(0, dl));
19205}
19206
19207// Returns the appropriate wrapper opcode for a global reference.
19208unsigned X86TargetLowering::getGlobalWrapperKind(
19209 const GlobalValue *GV, const unsigned char OpFlags) const {
19210 // References to absolute symbols are never PC-relative.
19211 if (GV && GV->isAbsoluteSymbolRef())
19212 return X86ISD::Wrapper;
19213
19214 // The following OpFlags under RIP-rel PIC use RIP.
19215 if (Subtarget.isPICStyleRIPRel() &&
19216 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19217 OpFlags == X86II::MO_DLLIMPORT))
19218 return X86ISD::WrapperRIP;
19219
19220 // GOTPCREL references must always use RIP.
19221 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19222 return X86ISD::WrapperRIP;
19223
19224 return X86ISD::Wrapper;
19225}
19226
19227// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19228// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19229// one of the above mentioned nodes. It has to be wrapped because otherwise
19230// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19231 // be used to form addressing modes. These wrapped nodes will be selected
19232// into MOV32ri.
19233SDValue
19234X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19235 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19236
19237 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19238 // global base reg.
19239 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19240
19241 auto PtrVT = getPointerTy(DAG.getDataLayout());
19242 SDValue Result = DAG.getTargetConstantPool(
19243 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19244 SDLoc DL(CP);
19245 Result =
19246 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19247 // With PIC, the address is actually $g + Offset.
19248 if (OpFlag) {
19249 Result =
19250 DAG.getNode(ISD::ADD, DL, PtrVT,
19251 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19252 }
19253
19254 return Result;
19255}
19256
19257SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19258 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19259
19260 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19261 // global base reg.
19262 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19263
19264 EVT PtrVT = Op.getValueType();
19265 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19266 SDLoc DL(JT);
19267 Result =
19268 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19269
19270 // With PIC, the address is actually $g + Offset.
19271 if (OpFlag)
19272 Result =
19273 DAG.getNode(ISD::ADD, DL, PtrVT,
19274 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19275
19276 return Result;
19277}
19278
19279SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19280 SelectionDAG &DAG) const {
19281 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19282}
19283
19284SDValue
19285X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19286 // Create the TargetBlockAddressAddress node.
19287 unsigned char OpFlags =
19288 Subtarget.classifyBlockAddressReference();
19289 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19290 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19291 SDLoc dl(Op);
19292 EVT PtrVT = Op.getValueType();
19293 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19294 Result =
19295 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19296
19297 // With PIC, the address is actually $g + Offset.
19298 if (isGlobalRelativeToPICBase(OpFlags)) {
19299 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19300 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19301 }
19302
19303 return Result;
19304}
19305
19306/// Creates target global address or external symbol nodes for calls or
19307/// other uses.
19308SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19309 bool ForCall,
19310 bool *IsImpCall) const {
19311 // Unpack the global address or external symbol.
19312 SDLoc dl(Op);
19313 const GlobalValue *GV = nullptr;
19314 int64_t Offset = 0;
19315 const char *ExternalSym = nullptr;
19316 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19317 GV = G->getGlobal();
19318 Offset = G->getOffset();
19319 } else {
19320 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19321 ExternalSym = ES->getSymbol();
19322 }
19323
19324 // Calculate some flags for address lowering.
19325 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19326 unsigned char OpFlags;
19327 if (ForCall)
19328 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19329 else
19330 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19331 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19332 bool NeedsLoad = isGlobalStubReference(OpFlags);
19333
19335 EVT PtrVT = Op.getValueType();
19336 SDValue Result;
19337
19338 if (GV) {
19339 // Create a target global address if this is a global. If possible, fold the
19340 // offset into the global address reference. Otherwise, ADD it on later.
19341 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19342 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19343 // relocation will compute to a negative value, which is invalid.
19344 int64_t GlobalOffset = 0;
19345 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19347 std::swap(GlobalOffset, Offset);
19348 }
19349 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19350 } else {
19351 // If this is not a global address, this must be an external symbol.
19352 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19353 }
19354
19355 // If this is a direct call, avoid the wrapper if we don't need to do any
19356 // loads or adds. This allows SDAG ISel to match direct calls.
19357 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19358 return Result;
19359
19360 // If Import Call Optimization is enabled and this is an imported function
19361 // then make a note of it and return the global address without wrapping.
19362 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19363 Mod.getModuleFlag("import-call-optimization")) {
19364 assert(ForCall && "Should only enable import call optimization if we are "
19365 "lowering a call");
19366 *IsImpCall = true;
19367 return Result;
19368 }
19369
19370 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19371
19372 // With PIC, the address is actually $g + Offset.
19373 if (HasPICReg) {
19374 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19375 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19376 }
19377
19378 // For globals that require a load from a stub to get the address, emit the
19379 // load.
19380 if (NeedsLoad)
19381 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19382 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19383
19384 // If there was a non-zero offset that we didn't fold, create an explicit
19385 // addition for it.
19386 if (Offset != 0)
19387 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19388 DAG.getSignedConstant(Offset, dl, PtrVT));
19389
19390 return Result;
19391}
19392
19393SDValue
19394X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19395 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19396}
19397
19398 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19399 const EVT PtrVT, unsigned ReturnReg,
19400 unsigned char OperandFlags,
19401 bool LoadGlobalBaseReg = false,
19402 bool LocalDynamic = false) {
19403 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19404 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19405 SDLoc dl(GA);
19406 SDValue TGA;
19407 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19408 SDValue Chain = DAG.getEntryNode();
19409 SDValue Ret;
19410 if (LocalDynamic && UseTLSDESC) {
19411 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19412 // Reuse existing GetTLSADDR node if we can find it.
19413 if (TGA->hasOneUse()) {
19414 // TLSDESC uses TGA.
19415 SDNode *TLSDescOp = *TGA->user_begin();
19416 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19417 "Unexpected TLSDESC DAG");
19418 // CALLSEQ_END uses TGA via a chain and glue.
19419 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19420 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19421 "Unexpected TLSDESC DAG");
19422 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19423 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19424 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19425 "Unexpected TLSDESC DAG");
19426 Ret = SDValue(CopyFromRegOp, 0);
19427 }
19428 } else {
19429 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19430 GA->getOffset(), OperandFlags);
19431 }
19432
19433 if (!Ret) {
19434 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19435 : LocalDynamic ? X86ISD::TLSBASEADDR
19436 : X86ISD::TLSADDR;
19437
19438 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19439 if (LoadGlobalBaseReg) {
19440 SDValue InGlue;
19441 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19442 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19443 InGlue);
19444 InGlue = Chain.getValue(1);
19445 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19446 } else {
19447 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19448 }
19449 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19450
19451 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19452 MFI.setHasCalls(true);
19453
19454 SDValue Glue = Chain.getValue(1);
19455 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19456 }
19457
19458 if (!UseTLSDESC)
19459 return Ret;
19460
19461 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19462 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
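// The TLSDESC sequence yields the symbol's offset from the thread pointer;
// load the thread pointer from offset 0 of the %fs (64-bit) or %gs (32-bit)
// segment and add it to form the final address.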
19463
19465 SDValue Offset =
19466 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19467 MachinePointerInfo(Ptr));
19468 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19469}
19470
19471// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19472static SDValue
19473 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19474 const EVT PtrVT) {
19475 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19476 /*LoadGlobalBaseReg=*/true);
19477}
19478
19479// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19480static SDValue
19481 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19482 const EVT PtrVT) {
19483 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19484}
19485
19486// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19487static SDValue
19488 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19489 const EVT PtrVT) {
19490 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19491}
19492
19493 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19494 SelectionDAG &DAG, const EVT PtrVT,
19495 bool Is64Bit, bool Is64BitLP64) {
19496 SDLoc dl(GA);
19497
19498 // Get the start address of the TLS block for this module.
19499 X86MachineFunctionInfo *MFI =
19500 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19501 MFI->incNumLocalDynamicTLSAccesses();
19502
19503 SDValue Base;
19504 if (Is64Bit) {
19505 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19506 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19507 /*LoadGlobalBaseReg=*/false,
19508 /*LocalDynamic=*/true);
19509 } else {
19510 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19511 /*LoadGlobalBaseReg=*/true,
19512 /*LocalDynamic=*/true);
19513 }
19514
19515 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19516 // of Base.
19517
19518 // Build x@dtpoff.
19519 unsigned char OperandFlags = X86II::MO_DTPOFF;
19520 unsigned WrapperKind = X86ISD::Wrapper;
19521 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19522 GA->getValueType(0),
19523 GA->getOffset(), OperandFlags);
19524 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19525
19526 // Add x@dtpoff with the base.
19527 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19528}
19529
19530// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19531 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19532 const EVT PtrVT, TLSModel::Model model,
19533 bool is64Bit, bool isPIC) {
19534 SDLoc dl(GA);
19535
19536 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19537 Value *Ptr = Constant::getNullValue(
19538 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19539
19540 SDValue ThreadPointer =
19541 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19542 MachinePointerInfo(Ptr));
19543
19544 unsigned char OperandFlags = 0;
19545 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19546 // initialexec.
19547 unsigned WrapperKind = X86ISD::Wrapper;
19548 if (model == TLSModel::LocalExec) {
19549 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19550 } else if (model == TLSModel::InitialExec) {
19551 if (is64Bit) {
19552 OperandFlags = X86II::MO_GOTTPOFF;
19553 WrapperKind = X86ISD::WrapperRIP;
19554 } else {
19555 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19556 }
19557 } else {
19558 llvm_unreachable("Unexpected model");
19559 }
19560
19561 // emit "addl x@ntpoff,%eax" (local exec)
19562 // or "addl x@indntpoff,%eax" (initial exec)
19563 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19564 SDValue TGA =
19565 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19566 GA->getOffset(), OperandFlags);
19567 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19568
19569 if (model == TLSModel::InitialExec) {
19570 if (isPIC && !is64Bit) {
19571 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19572 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19573 Offset);
19574 }
19575
19576 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19577 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19578 }
19579
19580 // The address of the thread local variable is the add of the thread
19581 // pointer with the offset of the variable.
19582 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19583}
19584
19585SDValue
19586X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19587
19588 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19589
19590 if (DAG.getTarget().useEmulatedTLS())
19591 return LowerToTLSEmulatedModel(GA, DAG);
19592
19593 const GlobalValue *GV = GA->getGlobal();
19594 EVT PtrVT = Op.getValueType();
19595 bool PositionIndependent = isPositionIndependent();
19596
19597 if (Subtarget.isTargetELF()) {
19598 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19599 switch (model) {
19600 case TLSModel::GeneralDynamic:
19601 if (Subtarget.is64Bit()) {
19602 if (Subtarget.isTarget64BitLP64())
19603 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19604 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19605 }
19606 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19607 case TLSModel::LocalDynamic:
19608 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19609 Subtarget.isTarget64BitLP64());
19610 case TLSModel::InitialExec:
19611 case TLSModel::LocalExec:
19612 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19613 PositionIndependent);
19614 }
19615 llvm_unreachable("Unknown TLS model.");
19616 }
19617
19618 if (Subtarget.isTargetDarwin()) {
19619 // Darwin only has one model of TLS. Lower to that.
19620 unsigned char OpFlag = 0;
19621 unsigned WrapperKind = 0;
19622
19623 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19624 // global base reg.
19625 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19626 if (PIC32) {
19627 OpFlag = X86II::MO_TLVP_PIC_BASE;
19628 WrapperKind = X86ISD::Wrapper;
19629 } else {
19630 OpFlag = X86II::MO_TLVP;
19631 WrapperKind = X86ISD::WrapperRIP;
19632 }
19633 SDLoc DL(Op);
19634 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19635 GA->getValueType(0),
19636 GA->getOffset(), OpFlag);
19637 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19638
19639 // With PIC32, the address is actually $g + Offset.
19640 if (PIC32)
19641 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19642 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19643 Offset);
19644
19645 // Lowering the machine isd will make sure everything is in the right
19646 // location.
19647 SDValue Chain = DAG.getEntryNode();
19648 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19649 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19650 SDValue Args[] = { Chain, Offset };
19651 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19652 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19653
19654 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19655 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19656 MFI.setAdjustsStack(true);
19657
19658 // And our return value (tls address) is in the standard call return value
19659 // location.
19660 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19661 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19662 }
19663
19664 if (Subtarget.isOSWindows()) {
19665 // Just use the implicit TLS architecture
19666 // Need to generate something similar to:
19667 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19668 // ; from TEB
19669 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19670 // mov rcx, qword [rdx+rcx*8]
19671 // mov eax, .tls$:tlsvar
19672 // [rax+rcx] contains the address
19673 // Windows 64bit: gs:0x58
19674 // Windows 32bit: fs:__tls_array
19675
19676 SDLoc dl(GA);
19677 SDValue Chain = DAG.getEntryNode();
19678
19679 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19680 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19681 // use its literal value of 0x2C.
19682 Value *Ptr = Constant::getNullValue(
19683 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19684 : PointerType::get(*DAG.getContext(), X86AS::FS));
19685
19686 SDValue TlsArray = Subtarget.is64Bit()
19687 ? DAG.getIntPtrConstant(0x58, dl)
19688 : (Subtarget.isTargetWindowsGNU()
19689 ? DAG.getIntPtrConstant(0x2C, dl)
19690 : DAG.getExternalSymbol("_tls_array", PtrVT));
19691
19692 SDValue ThreadPointer =
19693 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19694
19695 SDValue res;
19696 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19697 res = ThreadPointer;
19698 } else {
19699 // Load the _tls_index variable
19700 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19701 if (Subtarget.is64Bit())
19702 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19703 MachinePointerInfo(), MVT::i32);
19704 else
19705 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19706
19707 const DataLayout &DL = DAG.getDataLayout();
19708 SDValue Scale =
19709 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19710 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19711
19712 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19713 }
19714
19715 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19716
19717 // Get the offset of start of .tls section
19718 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19719 GA->getValueType(0),
19720 GA->getOffset(), X86II::MO_SECREL);
19721 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19722
19723 // The address of the thread local variable is the add of the thread
19724 // pointer with the offset of the variable.
19725 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19726 }
19727
19728 llvm_unreachable("TLS not implemented for this target.");
19729}
19730
19731 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19732 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19733 const TargetMachine &TM = getTargetMachine();
19734 TLSModel::Model Model = TM.getTLSModel(&GV);
19735 switch (Model) {
19736 case TLSModel::LocalExec:
19737 case TLSModel::InitialExec:
19738 // We can include the %fs segment register in addressing modes.
19739 return true;
19740 case TLSModel::GeneralDynamic:
19741 case TLSModel::LocalDynamic:
19742 // These models do not result in %fs relative addresses unless
19743 // TLS descriptors are used.
19744 //
19745 // Even in the case of TLS descriptors we currently have no way to model
19746 // the difference between %fs access and the computations needed for the
19747 // offset, and returning `true` for TLS-desc currently duplicates both,
19748 // which is detrimental :-/
19749 return false;
19750 }
19751 }
19752 return false;
19753}
19754
19755/// Lower SRA_PARTS and friends, which return two i32 values
19756/// and take a 2 x i32 value to shift plus a shift amount.
19757/// TODO: Can this be moved to general expansion code?
19758 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19759 SDValue Lo, Hi;
19760 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19761 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19762}
19763
19764// Try to use a packed vector operation to handle i64 on 32-bit targets when
19765// AVX512DQ is enabled.
19766 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19767 SelectionDAG &DAG,
19768 const X86Subtarget &Subtarget) {
19769 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19770 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19771 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19772 Op.getOpcode() == ISD::UINT_TO_FP) &&
19773 "Unexpected opcode!");
19774 bool IsStrict = Op->isStrictFPOpcode();
19775 unsigned OpNo = IsStrict ? 1 : 0;
19776 SDValue Src = Op.getOperand(OpNo);
19777 MVT SrcVT = Src.getSimpleValueType();
19778 MVT VT = Op.getSimpleValueType();
19779
19780 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19781 (VT != MVT::f32 && VT != MVT::f64))
19782 return SDValue();
19783
19784 // Pack the i64 into a vector, do the operation and extract.
19785
19786 // Using 256-bit to ensure result is 128-bits for f32 case.
19787 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19788 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19789 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19790
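// For strict FP nodes, thread the chain through the vector conversion and
// return it together with the extracted scalar result.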
19791 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19792 if (IsStrict) {
19793 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19794 {Op.getOperand(0), InVec});
19795 SDValue Chain = CvtVec.getValue(1);
19796 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19797 DAG.getVectorIdxConstant(0, dl));
19798 return DAG.getMergeValues({Value, Chain}, dl);
19799 }
19800
19801 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19802
19803 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19804 DAG.getVectorIdxConstant(0, dl));
19805}
19806
19807// Try to use a packed vector operation to handle i64 on 32-bit targets.
19808 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19809 const X86Subtarget &Subtarget) {
19810 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19811 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19812 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19813 Op.getOpcode() == ISD::UINT_TO_FP) &&
19814 "Unexpected opcode!");
19815 bool IsStrict = Op->isStrictFPOpcode();
19816 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19817 MVT SrcVT = Src.getSimpleValueType();
19818 MVT VT = Op.getSimpleValueType();
19819
19820 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19821 return SDValue();
19822
19823 // Pack the i64 into a vector, do the operation and extract.
19824
19825 assert(Subtarget.hasFP16() && "Expected FP16");
19826
19827 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19828 if (IsStrict) {
19829 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19830 {Op.getOperand(0), InVec});
19831 SDValue Chain = CvtVec.getValue(1);
19832 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19833 DAG.getVectorIdxConstant(0, dl));
19834 return DAG.getMergeValues({Value, Chain}, dl);
19835 }
19836
19837 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19838
19839 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19840 DAG.getVectorIdxConstant(0, dl));
19841}
19842
19843static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19844 const X86Subtarget &Subtarget) {
19845 switch (Opcode) {
19846 case ISD::SINT_TO_FP:
19847 // TODO: Handle wider types with AVX/AVX512.
19848 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19849 return false;
19850 // CVTDQ2PS or (V)CVTDQ2PD
19851 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19852
19853 case ISD::UINT_TO_FP:
19854 // TODO: Handle wider types and i64 elements.
19855 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19856 return false;
19857 // VCVTUDQ2PS or VCVTUDQ2PD
19858 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19859
19860 default:
19861 return false;
19862 }
19863}
19864
19865/// Given a scalar cast operation that is extracted from a vector, try to
19866/// vectorize the cast op followed by extraction. This will avoid an expensive
19867/// round-trip between XMM and GPR.
19868 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19869 SelectionDAG &DAG,
19870 const X86Subtarget &Subtarget) {
19871 // TODO: This could be enhanced to handle smaller integer types by peeking
19872 // through an extend.
19873 SDValue Extract = Cast.getOperand(0);
19874 MVT DestVT = Cast.getSimpleValueType();
19875 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19876 !isa<ConstantSDNode>(Extract.getOperand(1)))
19877 return SDValue();
19878
19879 // See if we have a 128-bit vector cast op for this type of cast.
19880 SDValue VecOp = Extract.getOperand(0);
19881 MVT FromVT = VecOp.getSimpleValueType();
19882 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19883 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19884 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19885 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19886 return SDValue();
19887
19888 // If we are extracting from a non-zero element, first shuffle the source
19889 // vector to allow extracting from element zero.
19890 if (!isNullConstant(Extract.getOperand(1))) {
19891 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19892 Mask[0] = Extract.getConstantOperandVal(1);
19893 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19894 }
19895 // If the source vector is wider than 128-bits, extract the low part. Do not
19896 // create an unnecessarily wide vector cast op.
19897 if (FromVT != Vec128VT)
19898 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19899
19900 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19901 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19902 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19903 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19904 DAG.getVectorIdxConstant(0, DL));
19905}
19906
19907/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19908/// try to vectorize the cast ops. This will avoid an expensive round-trip
19909/// between XMM and GPR.
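/// For example (illustrative), a scalar f32 sequence like (float)(int)x can be
/// performed as CVTTPS2DQ + CVTDQ2PS on a vector holding x in lane 0, with the
/// scalar result taken from lane 0, instead of bouncing through a GPR.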
19910static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19911 SelectionDAG &DAG,
19912 const X86Subtarget &Subtarget) {
19913 // TODO: Allow FP_TO_UINT.
19914 SDValue CastToInt = CastToFP.getOperand(0);
19915 MVT VT = CastToFP.getSimpleValueType();
19916 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19917 return SDValue();
19918
19919 MVT IntVT = CastToInt.getSimpleValueType();
19920 SDValue X = CastToInt.getOperand(0);
19921 MVT SrcVT = X.getSimpleValueType();
19922 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19923 return SDValue();
19924
19925 // See if we have 128-bit vector cast instructions for this type of cast.
19926 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19927 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19928 IntVT != MVT::i32)
19929 return SDValue();
19930
19931 unsigned SrcSize = SrcVT.getSizeInBits();
19932 unsigned IntSize = IntVT.getSizeInBits();
19933 unsigned VTSize = VT.getSizeInBits();
19934 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19935 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19936 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19937
19938 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19939 unsigned ToIntOpcode =
19940 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19941 unsigned ToFPOpcode =
19942 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19943
19944 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19945 //
19946 // We are not defining the high elements (by zeroing them, for example) because
19947 // that could nullify any performance advantage that we hoped to gain from
19948 // this vector op hack. We do not expect any adverse effects (like denorm
19949 // penalties) with cast ops.
19950 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19951 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19952 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19953 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19954 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19955}
19956
19957static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19958 SelectionDAG &DAG,
19959 const X86Subtarget &Subtarget) {
19960 bool IsStrict = Op->isStrictFPOpcode();
19961 MVT VT = Op->getSimpleValueType(0);
19962 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19963
19964 if (Subtarget.hasDQI()) {
19965 assert(!Subtarget.hasVLX() && "Unexpected features");
19966
19967 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19968 Src.getSimpleValueType() == MVT::v4i64) &&
19969 "Unsupported custom type");
19970
19971 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19972 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19973 "Unexpected VT!");
19974 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19975
19976 // Need to concat with zero vector for strict fp to avoid spurious
19977 // exceptions.
19978 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19979 : DAG.getUNDEF(MVT::v8i64);
19980 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19981 DAG.getVectorIdxConstant(0, DL));
19982 SDValue Res, Chain;
19983 if (IsStrict) {
19984 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19985 {Op->getOperand(0), Src});
19986 Chain = Res.getValue(1);
19987 } else {
19988 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19989 }
19990
19991 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19992 DAG.getVectorIdxConstant(0, DL));
19993
19994 if (IsStrict)
19995 return DAG.getMergeValues({Res, Chain}, DL);
19996 return Res;
19997 }
19998
19999 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20000 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20001 if (VT != MVT::v4f32 || IsSigned)
20002 return SDValue();
20003
20004 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20005 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20006 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20007 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20008 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20009 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20010 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20011 SmallVector<SDValue, 4> SignCvts(4);
20012 SmallVector<SDValue, 4> Chains(4);
20013 for (int i = 0; i != 4; ++i) {
20014 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20015 DAG.getVectorIdxConstant(i, DL));
20016 if (IsStrict) {
20017 SignCvts[i] =
20018 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20019 {Op.getOperand(0), Elt});
20020 Chains[i] = SignCvts[i].getValue(1);
20021 } else {
20022 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20023 }
20024 }
20025 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20026
20027 SDValue Slow, Chain;
20028 if (IsStrict) {
20029 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20030 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20031 {Chain, SignCvt, SignCvt});
20032 Chain = Slow.getValue(1);
20033 } else {
20034 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20035 }
20036
20037 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20038 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20039
20040 if (IsStrict)
20041 return DAG.getMergeValues({Cvt, Chain}, DL);
20042
20043 return Cvt;
20044}
20045
20046static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20047 SelectionDAG &DAG) {
20048 bool IsStrict = Op->isStrictFPOpcode();
20049 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20050 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20051 MVT VT = Op.getSimpleValueType();
20052 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20053
20054 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20055 if (IsStrict)
20056 return DAG.getNode(
20057 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20058 {Chain,
20059 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20060 Rnd});
20061 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20062 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20063}
20064
20065static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20066 const X86Subtarget &Subtarget) {
20067 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20068 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20069 return true;
20070 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20071 return true;
20072 }
20073 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20074 return true;
20075 if (Subtarget.useAVX512Regs()) {
20076 if (VT == MVT::v16i32)
20077 return true;
20078 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20079 return true;
20080 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20081 return true;
20082 }
20083 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20084 (VT == MVT::v2i64 || VT == MVT::v4i64))
20085 return true;
20086 return false;
20087}
20088
20089SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20090 SelectionDAG &DAG) const {
20091 bool IsStrict = Op->isStrictFPOpcode();
20092 unsigned OpNo = IsStrict ? 1 : 0;
20093 SDValue Src = Op.getOperand(OpNo);
20094 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20095 MVT SrcVT = Src.getSimpleValueType();
20096 MVT VT = Op.getSimpleValueType();
20097 SDLoc dl(Op);
20098
20099 if (isSoftF16(VT, Subtarget))
20100 return promoteXINT_TO_FP(Op, dl, DAG);
20101 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20102 return Op;
20103
20104 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20105 return LowerWin64_INT128_TO_FP(Op, DAG);
20106
20107 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20108 return Extract;
20109
20110 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20111 return R;
20112
20113 if (SrcVT.isVector()) {
20114 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20115 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20116 // source for strict FP.
20117 if (IsStrict)
20118 return DAG.getNode(
20119 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20120 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20121 DAG.getUNDEF(SrcVT))});
20122 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20123 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20124 DAG.getUNDEF(SrcVT)));
20125 }
20126 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20127 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20128
20129 return SDValue();
20130 }
20131
20132 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20133 "Unknown SINT_TO_FP to lower!");
20134
20135 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20136
20137 // These are really Legal; return the operand so the caller accepts it as
20138 // Legal.
20139 if (SrcVT == MVT::i32 && UseSSEReg)
20140 return Op;
20141 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20142 return Op;
20143
20144 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20145 return V;
20146 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20147 return V;
20148
20149 // SSE doesn't have an i16 conversion so we need to promote.
20150 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20151 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20152 if (IsStrict)
20153 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20154 {Chain, Ext});
20155
20156 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20157 }
20158
20159 if (VT == MVT::f128 || !Subtarget.hasX87())
20160 return SDValue();
20161
20162 SDValue ValueToStore = Src;
20163 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20164 // Bitcasting to f64 here allows us to do a single 64-bit store from
20165 // an SSE register, avoiding the store forwarding penalty that would come
20166 // with two 32-bit stores.
20167 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20168
20169 unsigned Size = SrcVT.getStoreSize();
20170 Align Alignment(Size);
20171 MachineFunction &MF = DAG.getMachineFunction();
20172 auto PtrVT = getPointerTy(MF.getDataLayout());
20173 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20174 MachinePointerInfo MPI =
20175 MachinePointerInfo::getFixedStack(MF, SSFI);
20176 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20177 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20178 std::pair<SDValue, SDValue> Tmp =
20179 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20180
20181 if (IsStrict)
20182 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20183
20184 return Tmp.first;
20185}
20186
20187std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20188 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20189 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20190 // Build the FILD
20191 SDVTList Tys;
20192 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20193 if (useSSE)
20194 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20195 else
20196 Tys = DAG.getVTList(DstVT, MVT::Other);
20197
20198 SDValue FILDOps[] = {Chain, Pointer};
20199 SDValue Result =
20200 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20201 Alignment, MachineMemOperand::MOLoad);
20202 Chain = Result.getValue(1);
20203
20204 if (useSSE) {
20205 MachineFunction &MF = DAG.getMachineFunction();
20206 unsigned SSFISize = DstVT.getStoreSize();
20207 int SSFI =
20208 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20209 auto PtrVT = getPointerTy(MF.getDataLayout());
20210 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20211 Tys = DAG.getVTList(MVT::Other);
20212 SDValue FSTOps[] = {Chain, Result, StackSlot};
20213 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20214 MachinePointerInfo::getFixedStack(MF, SSFI),
20215 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20216
20217 Chain =
20218 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20219 Result = DAG.getLoad(
20220 DstVT, DL, Chain, StackSlot,
20221 MachinePointerInfo::getFixedStack(MF, SSFI));
20222 Chain = Result.getValue(1);
20223 }
20224
20225 return { Result, Chain };
20226}
20227
20228/// Horizontal vector math instructions may be slower than normal math with
20229/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20230/// implementation, and likely shuffle complexity of the alternate sequence.
20231static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20232 const X86Subtarget &Subtarget) {
20233 bool IsOptimizingSize = DAG.shouldOptForSize();
20234 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20235 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20236}
20237
20238/// 64-bit unsigned integer to double expansion.
20239static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20240 SelectionDAG &DAG,
20241 const X86Subtarget &Subtarget) {
20242 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20243 // when converting 0 while rounding toward negative infinity. The caller will
20244 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20245 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20246 // This algorithm is not obvious. Here is what we're trying to output:
20247 /*
20248 movq %rax, %xmm0
20249 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20250 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20251 #ifdef __SSE3__
20252 haddpd %xmm0, %xmm0
20253 #else
20254 pshufd $0x4e, %xmm0, %xmm1
20255 addpd %xmm1, %xmm0
20256 #endif
20257 */
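// Why this works (illustrative note): after the punpckldq, the two doubles in
// %xmm0 have the bit patterns 0x43300000'xxxxxxxx and 0x45300000'yyyyyyyy,
// where x/y are the low/high 32 bits of the input. Those patterns are exactly
// the doubles 2^52 + lo and 2^84 + hi * 2^32 (both exact, since each 32-bit
// half fits in the 52-bit mantissa). Subtracting c1 = { 2^52, 2^84 } leaves
// { lo, hi * 2^32 }, and the horizontal add produces hi * 2^32 + lo, i.e. the
// original u64 rounded once to double.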
20258
20259 LLVMContext *Context = DAG.getContext();
20260
20261 // Build some magic constants.
20262 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20263 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20264 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20265 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20266
20268 CV1.push_back(
20269 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20270 APInt(64, 0x4330000000000000ULL))));
20271 CV1.push_back(
20272 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20273 APInt(64, 0x4530000000000000ULL))));
20274 Constant *C1 = ConstantVector::get(CV1);
20275 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20276
20277 // Load the 64-bit value into an XMM register.
20278 SDValue XR1 =
20279 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20280 SDValue CLod0 = DAG.getLoad(
20281 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20282 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20283 SDValue Unpck1 =
20284 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20285
20286 SDValue CLod1 = DAG.getLoad(
20287 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20288 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20289 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20290 // TODO: Are there any fast-math-flags to propagate here?
20291 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20292 SDValue Result;
20293
20294 if (Subtarget.hasSSE3() &&
20295 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20296 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20297 } else {
20298 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20299 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20300 }
20301 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20302 DAG.getVectorIdxConstant(0, dl));
20303 return Result;
20304}
20305
20306/// 32-bit unsigned integer to float expansion.
20307static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20308 SelectionDAG &DAG,
20309 const X86Subtarget &Subtarget) {
20310 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20311 // FP constant to bias correct the final result.
20312 SDValue Bias = DAG.getConstantFP(
20313 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
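// Illustrative note: 0x4330000000000000 is the bit pattern of 2^52 as a
// double. OR'ing a 32-bit value into its low mantissa bits yields the double
// 2^52 + x exactly, so subtracting the bias below recovers x.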
20314
20315 // Load the 32-bit value into an XMM register.
20316 SDValue Load =
20317 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20318
20319 // Zero out the upper parts of the register.
20320 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20321
20322 // Or the load with the bias.
20323 SDValue Or = DAG.getNode(
20324 ISD::OR, dl, MVT::v2i64,
20325 DAG.getBitcast(MVT::v2i64, Load),
20326 DAG.getBitcast(MVT::v2i64,
20327 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20328 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20329 DAG.getBitcast(MVT::v2f64, Or),
20330 DAG.getVectorIdxConstant(0, dl));
20331
20332 if (Op.getNode()->isStrictFPOpcode()) {
20333 // Subtract the bias.
20334 // TODO: Are there any fast-math-flags to propagate here?
20335 SDValue Chain = Op.getOperand(0);
20336 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20337 {Chain, Or, Bias});
20338
20339 if (Op.getValueType() == Sub.getValueType())
20340 return Sub;
20341
20342 // Handle final rounding.
20343 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20344 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20345
20346 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20347 }
20348
20349 // Subtract the bias.
20350 // TODO: Are there any fast-math-flags to propagate here?
20351 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20352
20353 // Handle final rounding.
20354 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20355}
20356
20357static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20358 SelectionDAG &DAG,
20359 const X86Subtarget &Subtarget) {
20360 if (Op.getSimpleValueType() != MVT::v2f64)
20361 return SDValue();
20362
20363 bool IsStrict = Op->isStrictFPOpcode();
20364
20365 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20366 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20367
20368 if (Subtarget.hasAVX512()) {
20369 if (!Subtarget.hasVLX()) {
20370 // Let generic type legalization widen this.
20371 if (!IsStrict)
20372 return SDValue();
20373 // Otherwise pad the integer input with 0s and widen the operation.
20374 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20375 DAG.getConstant(0, DL, MVT::v2i32));
20376 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20377 {Op.getOperand(0), N0});
20378 SDValue Chain = Res.getValue(1);
20379 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20380 DAG.getVectorIdxConstant(0, DL));
20381 return DAG.getMergeValues({Res, Chain}, DL);
20382 }
20383
20384 // Legalize to v4i32 type.
20385 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20386 DAG.getUNDEF(MVT::v2i32));
20387 if (IsStrict)
20388 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20389 {Op.getOperand(0), N0});
20390 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20391 }
20392
20393 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20394 // This gives us the floating point equivalent of 2^52 + the i32 integer
20395 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20396 // point leaving just our i32 integers in double format.
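// Worked example (illustrative): for the lane value x = 7, the zero-extended
// i64 is 0x0000000000000007; OR'ing with 0x4330000000000000 (the bits of 2^52)
// gives 0x4330000000000007, which is the double 2^52 + 7. Subtracting 2^52
// yields exactly 7.0. This is exact for every u32 value, since all of them fit
// in the 52-bit mantissa.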
20397 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20398 SDValue VBias = DAG.getConstantFP(
20399 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20400 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20401 DAG.getBitcast(MVT::v2i64, VBias));
20402 Or = DAG.getBitcast(MVT::v2f64, Or);
20403
20404 if (IsStrict)
20405 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20406 {Op.getOperand(0), Or, VBias});
20407 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20408}
20409
20410static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20411 SelectionDAG &DAG,
20412 const X86Subtarget &Subtarget) {
20413 bool IsStrict = Op->isStrictFPOpcode();
20414 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20415 MVT VecIntVT = V.getSimpleValueType();
20416 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20417 "Unsupported custom type");
20418
20419 if (Subtarget.hasAVX512()) {
20420 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20421 assert(!Subtarget.hasVLX() && "Unexpected features");
20422 MVT VT = Op->getSimpleValueType(0);
20423
20424 // v8i32->v8f64 is legal with AVX512 so just return it.
20425 if (VT == MVT::v8f64)
20426 return Op;
20427
20428 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20429 VT == MVT::v8f16) &&
20430 "Unexpected VT!");
20431 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20432 MVT WideIntVT = MVT::v16i32;
20433 if (VT == MVT::v4f64) {
20434 WideVT = MVT::v8f64;
20435 WideIntVT = MVT::v8i32;
20436 }
20437
20438 // Need to concat with zero vector for strict fp to avoid spurious
20439 // exceptions.
20440 SDValue Tmp =
20441 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20442 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20443 DAG.getVectorIdxConstant(0, DL));
20444 SDValue Res, Chain;
20445 if (IsStrict) {
20446 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20447 {Op->getOperand(0), V});
20448 Chain = Res.getValue(1);
20449 } else {
20450 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20451 }
20452
20453 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20454 DAG.getVectorIdxConstant(0, DL));
20455
20456 if (IsStrict)
20457 return DAG.getMergeValues({Res, Chain}, DL);
20458 return Res;
20459 }
20460
20461 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20462 Op->getSimpleValueType(0) == MVT::v4f64) {
20463 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20464 Constant *Bias = ConstantFP::get(
20465 *DAG.getContext(),
20466 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20467 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20468 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20469 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20470 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20471 SDValue VBias = DAG.getMemIntrinsicNode(
20472 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20473 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20474 MachineMemOperand::MOLoad);
20475
20476 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20477 DAG.getBitcast(MVT::v4i64, VBias));
20478 Or = DAG.getBitcast(MVT::v4f64, Or);
20479
20480 if (IsStrict)
20481 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20482 {Op.getOperand(0), Or, VBias});
20483 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20484 }
20485
20486 // The algorithm is the following:
20487 // #ifdef __SSE4_1__
20488 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20489 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20490 // (uint4) 0x53000000, 0xaa);
20491 // #else
20492 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20493 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20494 // #endif
20495 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20496 // return (float4) lo + fhi;
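// Illustrative note on the constants: as float bit patterns, 0x4b000000 is
// 2^23 and 0x53000000 is 2^39. So 'lo' reinterpreted as float equals
// 2^23 + (v & 0xffff), and 'hi' equals 2^39 + (v >> 16) * 2^16 (both exact).
// The fsub removes the combined bias (2^39 from hi plus the 2^23 carried by
// lo), and the final add joins the halves with a single rounding, producing
// the float conversion of v.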
20497
20498 bool Is128 = VecIntVT == MVT::v4i32;
20499 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20500 // If we convert to something else than the supported type, e.g., to v4f64,
20501 // abort early.
20502 if (VecFloatVT != Op->getSimpleValueType(0))
20503 return SDValue();
20504
20505 // In the #ifdef/#else code, we have in common:
20506 // - The vector of constants:
20507 // -- 0x4b000000
20508 // -- 0x53000000
20509 // - A shift:
20510 // -- v >> 16
20511
20512 // Create the splat vector for 0x4b000000.
20513 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20514 // Create the splat vector for 0x53000000.
20515 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20516
20517 // Create the right shift.
20518 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20519 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20520
20521 SDValue Low, High;
20522 if (Subtarget.hasSSE41()) {
20523 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20524 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20525 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20526 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20527 // Low will be bitcasted right away, so do not bother bitcasting back to its
20528 // original type.
20529 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20530 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20531 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20532 // (uint4) 0x53000000, 0xaa);
20533 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20534 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20535 // High will be bitcasted right away, so do not bother bitcasting back to
20536 // its original type.
20537 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20538 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20539 } else {
20540 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20541 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20542 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20543 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20544
20545 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20546 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20547 }
20548
20549 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20550 SDValue VecCstFSub = DAG.getConstantFP(
20551 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20552
20553 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20554 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20555 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20556 // enabled. See PR24512.
20557 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20558 // TODO: Are there any fast-math-flags to propagate here?
20559 // (float4) lo;
20560 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20561 // return (float4) lo + fhi;
20562 if (IsStrict) {
20563 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20564 {Op.getOperand(0), HighBitcast, VecCstFSub});
20565 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20566 {FHigh.getValue(1), LowBitcast, FHigh});
20567 }
20568
20569 SDValue FHigh =
20570 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20571 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20572}
20573
20574static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20575 const X86Subtarget &Subtarget) {
20576 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20577 SDValue N0 = Op.getOperand(OpNo);
20578 MVT SrcVT = N0.getSimpleValueType();
20579
20580 switch (SrcVT.SimpleTy) {
20581 default:
20582 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20583 case MVT::v2i32:
20584 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20585 case MVT::v4i32:
20586 case MVT::v8i32:
20587 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20588 case MVT::v2i64:
20589 case MVT::v4i64:
20590 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20591 }
20592}
20593
20594SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20595 SelectionDAG &DAG) const {
20596 bool IsStrict = Op->isStrictFPOpcode();
20597 unsigned OpNo = IsStrict ? 1 : 0;
20598 SDValue Src = Op.getOperand(OpNo);
20599 SDLoc dl(Op);
20600 auto PtrVT = getPointerTy(DAG.getDataLayout());
20601 MVT SrcVT = Src.getSimpleValueType();
20602 MVT DstVT = Op->getSimpleValueType(0);
20603 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20604
20605 // Bail out when we don't have native conversion instructions.
20606 if (DstVT == MVT::f128)
20607 return SDValue();
20608
20609 if (isSoftF16(DstVT, Subtarget))
20610 return promoteXINT_TO_FP(Op, dl, DAG);
20611 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20612 return Op;
20613
20614 if (DstVT.isVector())
20615 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20616
20617 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20618 return LowerWin64_INT128_TO_FP(Op, DAG);
20619
20620 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20621 return Extract;
20622
20623 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20624 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20625 // Conversions from unsigned i32 to f32/f64 are legal,
20626 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20627 return Op;
20628 }
20629
20630 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20631 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20632 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20633 if (IsStrict)
20634 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20635 {Chain, Src});
20636 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20637 }
20638
20639 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20640 return V;
20641 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20642 return V;
20643
20644 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20645 // infinity. It produces -0.0, so disable under strictfp.
20646 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20647 !IsStrict)
20648 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20649 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20650 // negative infinity, so disable it under strictfp and use FILD instead.
20651 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20652 !IsStrict)
20653 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20654 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20655 (DstVT == MVT::f32 || DstVT == MVT::f64))
20656 return SDValue();
20657
20658 // Make a 64-bit buffer, and use it to build an FILD.
20659 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20660 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20661 Align SlotAlign(8);
20662 MachinePointerInfo MPI =
20663 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20664 if (SrcVT == MVT::i32) {
20665 SDValue OffsetSlot =
20666 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20667 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20668 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20669 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20670 std::pair<SDValue, SDValue> Tmp =
20671 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20672 if (IsStrict)
20673 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20674
20675 return Tmp.first;
20676 }
20677
20678 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20679 SDValue ValueToStore = Src;
20680 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20681 // Bitcasting to f64 here allows us to do a single 64-bit store from
20682 // an SSE register, avoiding the store forwarding penalty that would come
20683 // with two 32-bit stores.
20684 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20685 }
20686 SDValue Store =
20687 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20688 // For i64 source, we need to add the appropriate power of 2 if the input
20689 // was negative. We must be careful to do the computation in x87 extended
20690 // precision, not in SSE.
20691 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20692 SDValue Ops[] = {Store, StackSlot};
20693 SDValue Fild =
20694 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20695 SlotAlign, MachineMemOperand::MOLoad);
20696 Chain = Fild.getValue(1);
20697
20698 // Check whether the sign bit is set.
20699 SDValue SignSet = DAG.getSetCC(
20700 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20701 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20702
20703 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20704 APInt FF(64, 0x5F80000000000000ULL);
20705 SDValue FudgePtr =
20706 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20707 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20708
20709 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20710 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20711 SDValue Four = DAG.getIntPtrConstant(4, dl);
20712 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20713 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20714
20715 // Load the value out, extending it from f32 to f80.
20716 SDValue Fudge = DAG.getExtLoad(
20717 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20718 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20719 CPAlignment);
20720 Chain = Fudge.getValue(1);
20721 // Extend everything to 80 bits to force it to be done on x87.
20722 // TODO: Are there any fast-math-flags to propagate here?
20723 if (IsStrict) {
20724 unsigned Opc = ISD::STRICT_FADD;
20725 // Windows needs the precision control changed to 80bits around this add.
20726 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20727 Opc = X86ISD::STRICT_FP80_ADD;
20728
20729 SDValue Add =
20730 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20731 // STRICT_FP_ROUND can't handle equal types.
20732 if (DstVT == MVT::f80)
20733 return Add;
20734 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20735 {Add.getValue(1), Add,
20736 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20737 }
20738 unsigned Opc = ISD::FADD;
20739 // Windows needs the precision control changed to 80bits around this add.
20740 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20741 Opc = X86ISD::FP80_ADD;
20742
20743 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20744 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20745 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20746}
20747
20748// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20749// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20750// just return an SDValue().
20751// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20752// to i16, i32 or i64, and we lower it to a legal sequence and return the
20753// result.
20754SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20755 bool IsSigned,
20756 SDValue &Chain) const {
20757 bool IsStrict = Op->isStrictFPOpcode();
20758 SDLoc DL(Op);
20759
20760 EVT DstTy = Op.getValueType();
20761 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20762 EVT TheVT = Value.getValueType();
20763 auto PtrVT = getPointerTy(DAG.getDataLayout());
20764
20765 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20766 // f16 must be promoted before using the lowering in this routine.
20767 // fp128 does not use this lowering.
20768 return SDValue();
20769 }
20770
20771 // If using FIST to compute an unsigned i64, we'll need some fixup
20772 // to handle values above the maximum signed i64. A FIST is always
20773 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20774 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20775
20776 // FIXME: This does not generate an invalid exception if the input does not
20777 // fit in i32. PR44019
20778 if (!IsSigned && DstTy != MVT::i64) {
20779 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20780 // The low 32 bits of the fist result will have the correct uint32 result.
20781 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20782 DstTy = MVT::i64;
20783 }
20784
20785 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20786 DstTy.getSimpleVT() >= MVT::i16 &&
20787 "Unknown FP_TO_INT to lower!");
20788
20789 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20790 // stack slot.
20791 MachineFunction &MF = DAG.getMachineFunction();
20792 unsigned MemSize = DstTy.getStoreSize();
20793 int SSFI =
20794 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20795 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20796
20797 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20798
20799 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20800
20801 if (UnsignedFixup) {
20802 //
20803 // Conversion to unsigned i64 is implemented with a select,
20804 // depending on whether the source value fits in the range
20805 // of a signed i64. Let Thresh be the FP equivalent of
20806 // 0x8000000000000000ULL.
20807 //
20808 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20809 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20810 // FistSrc = (Value - FltOfs);
20811 // Fist-to-mem64 FistSrc
20812 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20813 // to XOR'ing the high 32 bits with Adjust.
20814 //
20815 // Being a power of 2, Thresh is exactly representable in all FP formats.
20816 // For X87 we'd like to use the smallest FP type for this constant, but
20817 // for DAG type consistency we have to match the FP operand type.
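// Worked example (illustrative): for Value = 2^63 + 4096, Value >= Thresh, so
// FltOfs = 2^63 and Adjust = 0x8000000000000000. The FIST converts
// Value - 2^63 = 4096 to the signed result 4096, and XOR'ing with Adjust at
// the end sets the top bit, giving the unsigned result 2^63 + 4096.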
20818
20819 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20820 APFloat::opStatus Status = APFloat::opOK;
20821 bool LosesInfo = false;
20822 if (TheVT == MVT::f64)
20823 // The rounding mode is irrelevant as the conversion should be exact.
20824 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20825 &LosesInfo);
20826 else if (TheVT == MVT::f80)
20827 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20828 APFloat::rmNearestTiesToEven, &LosesInfo);
20829
20830 assert(Status == APFloat::opOK && !LosesInfo &&
20831 "FP conversion should have been exact");
20832
20833 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20834
20835 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20836 *DAG.getContext(), TheVT);
20837 SDValue Cmp;
20838 if (IsStrict) {
20839 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20840 /*IsSignaling*/ true);
20841 Chain = Cmp.getValue(1);
20842 } else {
20843 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20844 }
20845
20846 // Our preferred lowering of
20847 //
20848 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20849 //
20850 // is
20851 //
20852 // (Value >= Thresh) << 63
20853 //
20854 // but since we can get here after LegalOperations, DAGCombine might do the
20855 // wrong thing if we create a select. So, directly create the preferred
20856 // version.
20857 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20858 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20859 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20860
20861 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20862 DAG.getConstantFP(0.0, DL, TheVT));
20863
20864 if (IsStrict) {
20865 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20866 { Chain, Value, FltOfs });
20867 Chain = Value.getValue(1);
20868 } else
20869 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20870 }
20871
20872 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20873
20874 // FIXME This causes a redundant load/store if the SSE-class value is already
20875 // in memory, such as if it is on the callstack.
20876 if (isScalarFPTypeInSSEReg(TheVT)) {
20877 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20878 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20879 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20880 SDValue Ops[] = { Chain, StackSlot };
20881
20882 unsigned FLDSize = TheVT.getStoreSize();
20883 assert(FLDSize <= MemSize && "Stack slot not big enough");
20884 MachineMemOperand *MMO = MF.getMachineMemOperand(
20885 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20886 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20887 Chain = Value.getValue(1);
20888 }
20889
20890 // Build the FP_TO_INT*_IN_MEM
20891 MachineMemOperand *MMO = MF.getMachineMemOperand(
20892 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20893 SDValue Ops[] = { Chain, Value, StackSlot };
20894 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20895 DAG.getVTList(MVT::Other),
20896 Ops, DstTy, MMO);
20897
20898 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20899 Chain = Res.getValue(1);
20900
20901 // If we need an unsigned fixup, XOR the result with adjust.
20902 if (UnsignedFixup)
20903 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20904
20905 return Res;
20906}
20907
20908static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20909 const X86Subtarget &Subtarget) {
20910 MVT VT = Op.getSimpleValueType();
20911 SDValue In = Op.getOperand(0);
20912 MVT InVT = In.getSimpleValueType();
20913 unsigned Opc = Op.getOpcode();
20914
20915 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20916 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20917 "Unexpected extension opcode");
20918 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20919 "Expected same number of elements");
20920 assert((VT.getVectorElementType() == MVT::i16 ||
20921 VT.getVectorElementType() == MVT::i32 ||
20922 VT.getVectorElementType() == MVT::i64) &&
20923 "Unexpected element type");
20924 assert((InVT.getVectorElementType() == MVT::i8 ||
20925 InVT.getVectorElementType() == MVT::i16 ||
20926 InVT.getVectorElementType() == MVT::i32) &&
20927 "Unexpected element type");
20928
20929 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20930
20931 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20932 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20933 return splitVectorIntUnary(Op, DAG, dl);
20934 }
20935
20936 if (Subtarget.hasInt256())
20937 return Op;
20938
20939 // Optimize vectors in AVX mode:
20940 //
20941 // v8i16 -> v8i32
20942 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
20943 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20944 // Concat upper and lower parts.
20945 //
20946 // v4i32 -> v4i64
20947 // Use vpmovzxdq for 4 lower elements v4i32 -> v2i64.
20948 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20949 // Concat upper and lower parts.
20950 //
20951 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20952 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20953
20954 // Short-circuit if we can determine that each 128-bit half is the same value.
20955 // Otherwise, this is difficult to match and optimize.
20956 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20957 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20958 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20959
20960 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20961 SDValue Undef = DAG.getUNDEF(InVT);
20962 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20963 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20964 OpHi = DAG.getBitcast(HalfVT, OpHi);
20965
20966 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20967}
20968
20969// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20970static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20971 const SDLoc &dl, SelectionDAG &DAG) {
20972 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20973 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20974 DAG.getVectorIdxConstant(0, dl));
20975 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20976 DAG.getVectorIdxConstant(8, dl));
20977 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20978 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20979 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20980 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20981}
20982
20983static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20984 const X86Subtarget &Subtarget,
20985 SelectionDAG &DAG) {
20986 MVT VT = Op->getSimpleValueType(0);
20987 SDValue In = Op->getOperand(0);
20988 MVT InVT = In.getSimpleValueType();
20989 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20990 unsigned NumElts = VT.getVectorNumElements();
20991
20992 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20993 // avoids a constant pool load.
20994 if (VT.getVectorElementType() != MVT::i8) {
20995 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20996 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20997 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20998 }
20999
21000 // Extend VT if BWI is not supported.
21001 MVT ExtVT = VT;
21002 if (!Subtarget.hasBWI()) {
21003 // If v16i32 is to be avoided, we'll need to split and concatenate.
21004 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21005 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21006
21007 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21008 }
21009
21010 // Widen to 512-bits if VLX is not supported.
21011 MVT WideVT = ExtVT;
21012 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21013 NumElts *= 512 / ExtVT.getSizeInBits();
21014 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21015 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21016 DAG.getVectorIdxConstant(0, DL));
21017 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21018 }
21019
21020 SDValue One = DAG.getConstant(1, DL, WideVT);
21021 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21022
21023 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21024
21025 // Truncate if we had to extend above.
21026 if (VT != ExtVT) {
21027 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21028 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21029 }
21030
21031 // Extract back to 128/256-bit if we widened.
21032 if (WideVT != VT)
21033 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21034 DAG.getVectorIdxConstant(0, DL));
21035
21036 return SelectedVal;
21037}
21038
21039static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21040 SelectionDAG &DAG) {
21041 SDValue In = Op.getOperand(0);
21042 MVT SVT = In.getSimpleValueType();
21043 SDLoc DL(Op);
21044
21045 if (SVT.getVectorElementType() == MVT::i1)
21046 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21047
21048 assert(Subtarget.hasAVX() && "Expected AVX support");
21049 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21050}
21051
21052/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21053/// It makes use of the fact that vectors with enough leading sign/zero bits
21054/// prevent the PACKSS/PACKUS from saturating the results.
21055/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21056/// within each 128-bit lane.
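/// For example (illustrative), a v8i32 -> v8i8 truncation is performed in two
/// stages: PACK*SDW narrows the two v4i32 halves to v8i16, and PACK*SWB then
/// narrows that to v8i8 in the low half of an XMM register.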
21057static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21058 const SDLoc &DL, SelectionDAG &DAG,
21059 const X86Subtarget &Subtarget) {
21060 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21061 "Unexpected PACK opcode");
21062 assert(DstVT.isVector() && "VT not a vector?");
21063
21064 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21065 if (!Subtarget.hasSSE2())
21066 return SDValue();
21067
21068 EVT SrcVT = In.getValueType();
21069
21070 // No truncation required, we might get here due to recursive calls.
21071 if (SrcVT == DstVT)
21072 return In;
21073
21074 unsigned NumElems = SrcVT.getVectorNumElements();
21075 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21076 return SDValue();
21077
21078 unsigned DstSizeInBits = DstVT.getSizeInBits();
21079 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21080 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21081 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21082
21083 LLVMContext &Ctx = *DAG.getContext();
21084 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21085 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21086
21087 // Pack to the largest type possible:
21088 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21089 EVT InVT = MVT::i16, OutVT = MVT::i8;
21090 if (SrcVT.getScalarSizeInBits() > 16 &&
21091 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21092 InVT = MVT::i32;
21093 OutVT = MVT::i16;
21094 }
21095
21096 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21097 // On pre-AVX512, pack the src in both halves to help value tracking.
21098 if (SrcSizeInBits <= 128) {
21099 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21100 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21101 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21102 SDValue LHS = DAG.getBitcast(InVT, In);
21103 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21104 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21105 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21106 Res = DAG.getBitcast(PackedVT, Res);
21107 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21108 }
21109
21110 // Split lower/upper subvectors.
21111 SDValue Lo, Hi;
21112 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21113
21114 // If Hi is undef, then don't bother packing it and widen the result instead.
21115 if (Hi.isUndef()) {
21116 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21117 if (SDValue Res =
21118 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21119 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21120 }
21121
21122 unsigned SubSizeInBits = SrcSizeInBits / 2;
21123 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21124 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21125
21126 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21127 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21128 Lo = DAG.getBitcast(InVT, Lo);
21129 Hi = DAG.getBitcast(InVT, Hi);
21130 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21131 return DAG.getBitcast(DstVT, Res);
21132 }
21133
21134 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21135 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21136 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21137 Lo = DAG.getBitcast(InVT, Lo);
21138 Hi = DAG.getBitcast(InVT, Hi);
21139 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21140
21141 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21142 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21143 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21144 SmallVector<int, 64> Mask;
21145 int Scale = 64 / OutVT.getScalarSizeInBits();
21146 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21147 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21148
21149 if (DstVT.is256BitVector())
21150 return DAG.getBitcast(DstVT, Res);
21151
21152 // If 512bit -> 128bit truncate another stage.
21153 Res = DAG.getBitcast(PackedVT, Res);
21154 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21155 }
21156
21157 // Recursively pack lower/upper subvectors, concat result and pack again.
21158 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21159
21160 if (PackedVT.is128BitVector()) {
21161 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21162 // type legalization.
21163 SDValue Res =
21164 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21165 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21166 }
21167
21168 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21169 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21170 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21171 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21172 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21173}
21174
21175/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21176/// e.g. trunc <8 x i32> X to <8 x i16> -->
21177/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21178/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21179static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21180 const X86Subtarget &Subtarget,
21181 SelectionDAG &DAG) {
21182 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21183 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21184}
21185
21186/// Truncate using inreg sign extension and X86ISD::PACKSS.
21187static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21188 const X86Subtarget &Subtarget,
21189 SelectionDAG &DAG) {
21190 EVT SrcVT = In.getValueType();
21191 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21192 DAG.getValueType(DstVT));
21193 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21194}
21195
21196/// Helper to determine if \p In truncated to \p DstVT has the necessary
21197/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21198/// possibly by converting a SRL node to SRA for sign extension.
21199static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21200 SDValue In, const SDLoc &DL,
21201 SelectionDAG &DAG,
21202 const X86Subtarget &Subtarget,
21203 const SDNodeFlags Flags = SDNodeFlags()) {
21204 // Requires SSE2.
21205 if (!Subtarget.hasSSE2())
21206 return SDValue();
21207
21208 EVT SrcVT = In.getValueType();
21209 EVT DstSVT = DstVT.getVectorElementType();
21210 EVT SrcSVT = SrcVT.getVectorElementType();
21211 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21212 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21213
21214 // Check we have a truncation suited for PACKSS/PACKUS.
21215 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21216 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21217 return SDValue();
21218
21219 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21220 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21221
21222 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21223 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21224 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21225 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21226 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21227 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21228 return SDValue();
21229
21230 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21231 // split this for packing.
21232 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21233 !isFreeToSplitVector(In, DAG) &&
21234 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21235 return SDValue();
21236
21237 // Don't truncate on AVX512 targets using multiple PACK node stages.
21238 if (Subtarget.hasAVX512() && NumStages > 1)
21239 return SDValue();
21240
21241 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21242 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21243
21244 // Truncate with PACKUS if we are truncating a vector with leading zero
21245 // bits that extend all the way to the packed/truncated value.
21246 // e.g. Masks, zext_in_reg, etc.
21247 // Pre-SSE41 we can only use PACKUSWB.
21248 KnownBits Known = DAG.computeKnownBits(In);
21249 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21250 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21251 PackOpcode = X86ISD::PACKUS;
21252 return In;
21253 }
21254
21255 // Truncate with PACKSS if we are truncating a vector with sign-bits
21256 // that extend all the way to the packed/truncated value.
21257 // e.g. Comparison result, sext_in_reg, etc.
21258 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21259
21260 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21261 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21262 // see through BITCASTs later on and combines/simplifications can't then use
21263 // it.
21264 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21265 !Subtarget.hasAVX512())
21266 return SDValue();
21267
21268 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21269 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21270 MinSignBits < NumSignBits) {
21271 PackOpcode = X86ISD::PACKSS;
21272 return In;
21273 }
21274
21275 // If we have a srl that only generates signbits that we will discard in
21276 // the truncation then we can use PACKSS by converting the srl to a sra.
21277 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21278 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21279 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21280 if (*ShAmt == MinSignBits) {
21281 PackOpcode = X86ISD::PACKSS;
21282 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21283 }
21284 }
21285
21286 return SDValue();
21287}
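// For illustration (a rough sketch of the DAG shape, not code taken from this
// lowering): with SSE2, a v8i32 -> v8i16 truncation whose lanes all carry at
// least 17 sign bits can be emitted as a single PACKSSDW of the two 128-bit
// halves instead of a multi-shuffle sequence:
//
//   lo = extract_subvector In, 0        ; v4i32
//   hi = extract_subvector In, 4        ; v4i32
//   r  = X86ISD::PACKSS lo, hi          ; v8i16
//
// matchTruncateWithPACK proves the sign/zero-bit requirement; the PACK chain
// itself is then built by truncateVectorWithPACK.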
21288
21289/// This function lowers a vector truncation of 'extended sign-bits' or
21290/// 'extended zero-bits' values, from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
21291/// using X86ISD::PACKSS/PACKUS operations.
21292static SDValue LowerTruncateVecPackWithSignBits(
21293 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21294 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21295 MVT SrcVT = In.getSimpleValueType();
21296 MVT DstSVT = DstVT.getVectorElementType();
21297 MVT SrcSVT = SrcVT.getVectorElementType();
21298 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21299 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21300 return SDValue();
21301
21302 // If the upper half of the source is undef, then attempt to split and
21303 // only truncate the lower half.
21304 if (DstVT.getSizeInBits() >= 128) {
21305 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21306 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21307 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21308 Subtarget, DAG))
21309 return widenSubVector(Res, false, Subtarget, DAG, DL,
21310 DstVT.getSizeInBits());
21311 }
21312 }
21313
21314 unsigned PackOpcode;
21315 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21316 Subtarget, Flags))
21317 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21318
21319 return SDValue();
21320}
21321
21322/// This function lowers a vector truncation from vXi16/vXi32/vXi64 to
21323/// vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACKSS operations.
21324static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21325 const X86Subtarget &Subtarget,
21326 SelectionDAG &DAG) {
21327 MVT SrcVT = In.getSimpleValueType();
21328 MVT DstSVT = DstVT.getVectorElementType();
21329 MVT SrcSVT = SrcVT.getVectorElementType();
21330 unsigned NumElems = DstVT.getVectorNumElements();
21331 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21332 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21333 NumElems >= 8))
21334 return SDValue();
21335
21336 // SSSE3's pshufb results in fewer instructions in the cases below.
21337 if (Subtarget.hasSSSE3() && NumElems == 8) {
21338 if (SrcSVT == MVT::i16)
21339 return SDValue();
21340 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21341 return SDValue();
21342 }
21343
21344 // If the upper half of the source is undef, then attempt to split and
21345 // only truncate the lower half.
21346 if (DstVT.getSizeInBits() >= 128) {
21347 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21348 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21349 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21350 return widenSubVector(Res, false, Subtarget, DAG, DL,
21351 DstVT.getSizeInBits());
21352 }
21353 }
21354
21355 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21356 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21357 // truncate 2 x v4i32 to v8i16.
21358 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21359 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21360
21361 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21362 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21363
21364 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21365 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21366 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21367 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21368 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21369 }
21370
21371 return SDValue();
21372}
21373
21374static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21375 SelectionDAG &DAG,
21376 const X86Subtarget &Subtarget) {
21377 MVT VT = Op.getSimpleValueType();
21378 SDValue In = Op.getOperand(0);
21379 MVT InVT = In.getSimpleValueType();
21380 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21381
21382 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21383 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21384 if (InVT.getScalarSizeInBits() <= 16) {
21385 if (Subtarget.hasBWI()) {
21386 // Legal; this will be selected to VPMOVB2M/VPMOVW2M.
21387 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21388 // We need to shift to get the lsb into sign position.
21389 // Shifting packed bytes is not supported natively, so bitcast to words.
21390 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21391 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21392 DAG.getBitcast(ExtVT, In),
21393 DAG.getConstant(ShiftInx, DL, ExtVT));
21394 In = DAG.getBitcast(InVT, In);
21395 }
21396 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21397 In, ISD::SETGT);
21398 }
21399 // Use TESTD/Q: extend the vector to packed dword/qword.
21400 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21401 "Unexpected vector type.");
21402 unsigned NumElts = InVT.getVectorNumElements();
21403 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21404 // We need to change to a wider element type that we have support for.
21405 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21406 // For 16 element vectors we extend to v16i32 unless we are explicitly
21407 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21408 // we need to split into two 8 element vectors which we can extend to v8i32,
21409 // truncate and concat the results. There's an additional complication if
21410 // the original type is v16i8. In that case we can't split the v16i8
21411 // directly, so we need to shuffle high elements to low and use
21412 // sign_extend_vector_inreg.
21413 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21414 SDValue Lo, Hi;
21415 if (InVT == MVT::v16i8) {
21416 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21417 Hi = DAG.getVectorShuffle(
21418 InVT, DL, In, In,
21419 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21420 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21421 } else {
21422 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21423 Lo = extract128BitVector(In, 0, DAG, DL);
21424 Hi = extract128BitVector(In, 8, DAG, DL);
21425 }
21426 // We're split now, just emit two truncates and a concat. The two
21427 // truncates will trigger legalization to come back to this function.
21428 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21429 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21430 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21431 }
21432 // We either have 8 elements or we're allowed to use 512-bit vectors.
21433 // If we have VLX, we want to use the narrowest vector that can get the
21434 // job done so we use vXi32.
21435 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21436 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21437 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21438 InVT = ExtVT;
21439 ShiftInx = InVT.getScalarSizeInBits() - 1;
21440 }
21441
21442 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21443 // We need to shift to get the lsb into sign position.
21444 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21445 DAG.getConstant(ShiftInx, DL, InVT));
21446 }
21447 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21448 if (Subtarget.hasDQI())
21449 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21450 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21451}
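// For illustration (rough sketch, not code taken from this file): with
// AVX512BW, a v16i8 -> v16i1 truncation keeps only bit 0 of each byte, so the
// code above shifts that bit into the sign position and compares against zero:
//
//   t = shl In, 7     ; conceptually - packed byte shifts go via a v8i16 bitcast
//   r = setcc 0, t, setgt    ; v16i1, selected as VPMOVB2M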
21452
21453SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21454 SDLoc DL(Op);
21455 MVT VT = Op.getSimpleValueType();
21456 SDValue In = Op.getOperand(0);
21457 MVT InVT = In.getSimpleValueType();
21458 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21459 "Invalid TRUNCATE operation");
21460
21461 // If we're called by the type legalizer, handle a few cases.
21462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21463 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21464 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21465 VT.is128BitVector() && Subtarget.hasAVX512()) {
21466 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21467 "Unexpected subtarget!");
21468 // The default behavior is to truncate one step, concatenate, and then
21469 // truncate the remainder. We'd rather produce two 64-bit results and
21470 // concatenate those.
21471 SDValue Lo, Hi;
21472 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21473
21474 EVT LoVT, HiVT;
21475 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21476
21477 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21478 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21479 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21480 }
21481
21482 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21483 if (!Subtarget.hasAVX512() ||
21484 (InVT.is512BitVector() && VT.is256BitVector()))
21485 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21486 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21487 return SignPack;
21488
21489 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21490 if (!Subtarget.hasAVX512())
21491 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21492
21493 // Otherwise let default legalization handle it.
21494 return SDValue();
21495 }
21496
21497 if (VT.getVectorElementType() == MVT::i1)
21498 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21499
21500 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21501 // concat from subvectors to use VPTRUNC etc.
21502 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21503 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21504 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21505 return SignPack;
21506
21507 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21508 if (Subtarget.hasAVX512()) {
21509 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21510 assert(VT == MVT::v32i8 && "Unexpected VT!");
21511 return splitVectorIntUnary(Op, DAG, DL);
21512 }
21513
21514 // Word to byte truncation is only legal under BWI. Otherwise we have to
21515 // promote to v16i32 and then truncate that. But we should only do that if
21516 // we haven't been asked to avoid 512-bit vectors. The actual promotion to
21517 // v16i32 will be handled by isel patterns.
21518 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21519 Subtarget.canExtendTo512DQ())
21520 return Op;
21521 }
21522
21523 // Handle truncation of V256 to V128 using shuffles.
21524 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21525
21526 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21527 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21528 if (Subtarget.hasInt256()) {
21529 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21530 In = DAG.getBitcast(MVT::v8i32, In);
21531 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21532 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21533 DAG.getVectorIdxConstant(0, DL));
21534 }
21535
21536 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21537 DAG.getVectorIdxConstant(0, DL));
21538 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21539 DAG.getVectorIdxConstant(2, DL));
21540 static const int ShufMask[] = {0, 2, 4, 6};
21541 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21542 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21543 }
21544
21545 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21546 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21547 if (Subtarget.hasInt256()) {
21548 // The PSHUFB mask:
21549 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21550 -1, -1, -1, -1, -1, -1, -1, -1,
21551 16, 17, 20, 21, 24, 25, 28, 29,
21552 -1, -1, -1, -1, -1, -1, -1, -1 };
21553 In = DAG.getBitcast(MVT::v32i8, In);
21554 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21555 In = DAG.getBitcast(MVT::v4i64, In);
21556
21557 static const int ShufMask2[] = {0, 2, -1, -1};
21558 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21559 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21560 DAG.getVectorIdxConstant(0, DL));
21561 return DAG.getBitcast(MVT::v8i16, In);
21562 }
21563
21564 return Subtarget.hasSSE41()
21565 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21566 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21567 }
21568
21569 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21570 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21571
21572 llvm_unreachable("All 256->128 cases should have been handled above!");
21573}
21574
21575// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21576// behaves on out of range inputs to generate optimized conversions.
21577static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21578 SelectionDAG &DAG,
21579 const X86Subtarget &Subtarget) {
21580 MVT SrcVT = Src.getSimpleValueType();
21581 unsigned DstBits = VT.getScalarSizeInBits();
21582 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21583
21584 // Calculate the converted result for values in the range 0 to
21585 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21586 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21587 SDValue Big =
21588 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21589 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21590 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21591
21592 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21593 // and only if the value was out of range. So we can use that
21594 // as our indicator that we should use "Big" instead of "Small".
21595 //
21596 // Use "Small" if "IsOverflown" has all bits cleared
21597 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21598
21599 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21600 // use the slightly slower blendv select instead.
21601 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21602 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21603 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21604 }
21605
21606 SDValue IsOverflown =
21607 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21608 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21609 return DAG.getNode(ISD::OR, dl, VT, Small,
21610 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21611}
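// Worked example for the expansion above (f32 -> u32; 3.0e9f and the offset
// value are exactly representable in f32):
//   Small = cvttps2dq(3.0e9f)        = 0x80000000   (out of range, sign set)
//   Big   = cvttps2dq(3.0e9f - 2^31) = 0x32D05E00   (= 852516352)
//   IsOverflown = Small >>s 31       = 0xFFFFFFFF
//   Small | (Big & IsOverflown)      = 0xB2D05E00   (= 3000000000)
// In-range inputs leave the sign bit of Small clear, so the mask is all zeros
// and the OR simply returns Small.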
21612
21613SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21614 bool IsStrict = Op->isStrictFPOpcode();
21615 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21616 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21617 bool HasVLX = Subtarget.hasVLX();
21618 MVT VT = Op->getSimpleValueType(0);
21619 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21620 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21621 MVT SrcVT = Src.getSimpleValueType();
21622 SDLoc dl(Op);
21623
21624 SDValue Res;
21625 if (isSoftF16(SrcVT, Subtarget)) {
21626 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21627 if (IsStrict)
21628 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21629 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21630 {NVT, MVT::Other}, {Chain, Src})});
21631 return DAG.getNode(Op.getOpcode(), dl, VT,
21632 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21633 } else if (isTypeLegal(SrcVT) &&
21634 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21635 return Op;
21636 }
21637
21638 if (VT.isVector()) {
21639 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21640 MVT ResVT = MVT::v4i32;
21641 MVT TruncVT = MVT::v4i1;
21642 unsigned Opc;
21643 if (IsStrict)
21644 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21645 else
21646 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21647
21648 if (!IsSigned && !HasVLX) {
21649 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21650 // Widen to 512-bits.
21651 ResVT = MVT::v8i32;
21652 TruncVT = MVT::v8i1;
21653 Opc = Op.getOpcode();
21654 // Need to concat with zero vector for strict fp to avoid spurious
21655 // exceptions.
21656 // TODO: Should we just do this for non-strict as well?
21657 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21658 : DAG.getUNDEF(MVT::v8f64);
21659 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21660 DAG.getVectorIdxConstant(0, dl));
21661 }
21662 if (IsStrict) {
21663 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21664 Chain = Res.getValue(1);
21665 } else {
21666 Res = DAG.getNode(Opc, dl, ResVT, Src);
21667 }
21668
21669 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21670 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21671 DAG.getVectorIdxConstant(0, dl));
21672 if (IsStrict)
21673 return DAG.getMergeValues({Res, Chain}, dl);
21674 return Res;
21675 }
21676
21677 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21678 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21679 VT == MVT::v32i16)
21680 return Op;
21681
21682 MVT ResVT = VT;
21683 MVT EleVT = VT.getVectorElementType();
21684 if (EleVT != MVT::i64)
21685 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21686
21687 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21688 SDValue Tmp =
21689 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21690 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21691 Ops[0] = Src;
21692 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21693 }
21694
21695 if (!HasVLX) {
21696 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21697 // Widen to 512-bits.
21698 unsigned IntSize = EleVT.getSizeInBits();
21699 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21700 ResVT = MVT::getVectorVT(EleVT, Num);
21701 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21702 Subtarget, DAG, dl);
21703 }
21704
21705 if (IsStrict) {
21706 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21707 : X86ISD::STRICT_CVTTP2UI,
21708 dl, {ResVT, MVT::Other}, {Chain, Src});
21709 Chain = Res.getValue(1);
21710 } else {
21711 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21712 ResVT, Src);
21713 }
21714
21715 // TODO: Need to add exception check code for strict FP.
21716 if (EleVT.getSizeInBits() < 16) {
21717 if (HasVLX)
21718 ResVT = MVT::getVectorVT(EleVT, 8);
21719 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21720 }
21721
21722 if (ResVT != VT)
21723 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21724 DAG.getVectorIdxConstant(0, dl));
21725
21726 if (IsStrict)
21727 return DAG.getMergeValues({Res, Chain}, dl);
21728 return Res;
21729 }
21730
21731 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21732 if (VT.getVectorElementType() == MVT::i16) {
21733 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21734 SrcVT.getVectorElementType() == MVT::f64) &&
21735 "Expected f32/f64 vector!");
21736 MVT NVT = VT.changeVectorElementType(MVT::i32);
21737 if (IsStrict) {
21738 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21739 : ISD::STRICT_FP_TO_UINT,
21740 dl, {NVT, MVT::Other}, {Chain, Src});
21741 Chain = Res.getValue(1);
21742 } else {
21743 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21744 NVT, Src);
21745 }
21746
21747 // TODO: Need to add exception check code for strict FP.
21748 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21749
21750 if (IsStrict)
21751 return DAG.getMergeValues({Res, Chain}, dl);
21752 return Res;
21753 }
21754
21755 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21756 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21757 assert(!IsSigned && "Expected unsigned conversion!");
21758 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21759 return Op;
21760 }
21761
21762 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21763 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21764 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21765 Subtarget.useAVX512Regs()) {
21766 assert(!IsSigned && "Expected unsigned conversion!");
21767 assert(!Subtarget.hasVLX() && "Unexpected features!");
21768 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21769 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21770 // Need to concat with zero vector for strict fp to avoid spurious
21771 // exceptions.
21772 // TODO: Should we just do this for non-strict as well?
21773 SDValue Tmp =
21774 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21775 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21776 DAG.getVectorIdxConstant(0, dl));
21777
21778 if (IsStrict) {
21779 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21780 {Chain, Src});
21781 Chain = Res.getValue(1);
21782 } else {
21783 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21784 }
21785
21786 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21787 DAG.getVectorIdxConstant(0, dl));
21788
21789 if (IsStrict)
21790 return DAG.getMergeValues({Res, Chain}, dl);
21791 return Res;
21792 }
21793
21794 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21795 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21796 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21797 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21798 assert(!Subtarget.hasVLX() && "Unexpected features!");
21799 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21800 // Need to concat with zero vector for strict fp to avoid spurious
21801 // exceptions.
21802 // TODO: Should we just do this for non-strict as well?
21803 SDValue Tmp =
21804 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21805 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21806 DAG.getVectorIdxConstant(0, dl));
21807
21808 if (IsStrict) {
21809 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21810 {Chain, Src});
21811 Chain = Res.getValue(1);
21812 } else {
21813 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21814 }
21815
21816 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21817 DAG.getVectorIdxConstant(0, dl));
21818
21819 if (IsStrict)
21820 return DAG.getMergeValues({Res, Chain}, dl);
21821 return Res;
21822 }
21823
21824 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21825 if (!Subtarget.hasVLX()) {
21826 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21827 // legalizer and then widened again by vector op legalization.
21828 if (!IsStrict)
21829 return SDValue();
21830
21831 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21832 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21833 {Src, Zero, Zero, Zero});
21834 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21835 {Chain, Tmp});
21836 SDValue Chain = Tmp.getValue(1);
21837 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21838 DAG.getVectorIdxConstant(0, dl));
21839 return DAG.getMergeValues({Tmp, Chain}, dl);
21840 }
21841
21842 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21843 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21844 DAG.getUNDEF(MVT::v2f32));
21845 if (IsStrict) {
21846 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21847 : X86ISD::STRICT_CVTTP2UI;
21848 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21849 }
21850 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21851 return DAG.getNode(Opc, dl, VT, Tmp);
21852 }
21853
21854 // Generate optimized instructions for pre AVX512 unsigned conversions from
21855 // vXf32 to vXi32.
21856 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21857 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21858 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21859 assert(!IsSigned && "Expected unsigned conversion!");
21860 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21861 }
21862
21863 return SDValue();
21864 }
21865
21866 assert(!VT.isVector());
21867
21868 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21869
21870 if (!IsSigned && UseSSEReg) {
21871 // Conversions from f32/f64 with AVX512 should be legal.
21872 if (Subtarget.hasAVX512())
21873 return Op;
21874
21875 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21876 // behaves on out of range inputs to generate optimized conversions.
21877 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21878 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21879 unsigned DstBits = VT.getScalarSizeInBits();
21880 APInt UIntLimit = APInt::getSignMask(DstBits);
21881 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21882 DAG.getConstant(UIntLimit, dl, VT));
21883 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21884
21885 // Calculate the converted result for values in the range:
21886 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21887 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21888 SDValue Small =
21889 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21890 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21891 SDValue Big = DAG.getNode(
21892 X86ISD::CVTTS2SI, dl, VT,
21893 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21894 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21895
21896 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21897 // and only if the value was out of range. So we can use that
21898 // as our indicator that we should use "Big" instead of "Small".
21899 //
21900 // Use "Small" if "IsOverflown" has all bits cleared
21901 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21902 SDValue IsOverflown = DAG.getNode(
21903 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21904 return DAG.getNode(ISD::OR, dl, VT, Small,
21905 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21906 }
21907
21908 // Use default expansion for i64.
21909 if (VT == MVT::i64)
21910 return SDValue();
21911
21912 assert(VT == MVT::i32 && "Unexpected VT!");
21913
21914 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21915 // FIXME: This does not generate an invalid exception if the input does not
21916 // fit in i32. PR44019
21917 if (Subtarget.is64Bit()) {
21918 if (IsStrict) {
21919 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21920 {Chain, Src});
21921 Chain = Res.getValue(1);
21922 } else
21923 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21924
21925 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21926 if (IsStrict)
21927 return DAG.getMergeValues({Res, Chain}, dl);
21928 return Res;
21929 }
21930
21931 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21932 // use fisttp which will be handled later.
21933 if (!Subtarget.hasSSE3())
21934 return SDValue();
21935 }
21936
21937 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21938 // FIXME: This does not generate an invalid exception if the input does not
21939 // fit in i16. PR44019
21940 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21941 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21942 if (IsStrict) {
21943 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21944 {Chain, Src});
21945 Chain = Res.getValue(1);
21946 } else
21947 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21948
21949 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21950 if (IsStrict)
21951 return DAG.getMergeValues({Res, Chain}, dl);
21952 return Res;
21953 }
21954
21955 // If this is a FP_TO_SINT using SSEReg we're done.
21956 if (UseSSEReg && IsSigned)
21957 return Op;
21958
21959 // fp128 needs to use a libcall.
21960 if (SrcVT == MVT::f128) {
21961 RTLIB::Libcall LC;
21962 if (IsSigned)
21963 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21964 else
21965 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21966
21967 MakeLibCallOptions CallOptions;
21968 std::pair<SDValue, SDValue> Tmp =
21969 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21970
21971 if (IsStrict)
21972 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21973
21974 return Tmp.first;
21975 }
21976
21977 // Fall back to X87.
21978 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21979 if (IsStrict)
21980 return DAG.getMergeValues({V, Chain}, dl);
21981 return V;
21982 }
21983
21984 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21985}
21986
21987SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21988 SelectionDAG &DAG) const {
21989 SDValue Src = Op.getOperand(0);
21990 EVT DstVT = Op.getSimpleValueType();
21991 MVT SrcVT = Src.getSimpleValueType();
21992
21993 if (SrcVT.isVector())
21994 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21995
21996 if (SrcVT == MVT::f16)
21997 return SDValue();
21998
21999 // If the source is in an SSE register, the node is Legal.
22000 if (isScalarFPTypeInSSEReg(SrcVT))
22001 return Op;
22002
22003 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22004}
22005
22006SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22007 SelectionDAG &DAG) const {
22008 EVT DstVT = N->getValueType(0);
22009 SDValue Src = N->getOperand(0);
22010 EVT SrcVT = Src.getValueType();
22011
22012 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22013 // f16 must be promoted before using the lowering in this routine.
22014 // fp128 does not use this lowering.
22015 return SDValue();
22016 }
22017
22018 SDLoc DL(N);
22019 SDValue Chain = DAG.getEntryNode();
22020
22021 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22022
22023 // If we're converting from SSE, the stack slot needs to hold both types.
22024 // Otherwise it only needs to hold the DstVT.
22025 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22026 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22027 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22028 MachinePointerInfo MPI =
22029 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22030
22031 if (UseSSE) {
22032 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22033 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22034 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22035 SDValue Ops[] = { Chain, StackPtr };
22036
22037 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22038 /*Align*/ std::nullopt,
22039 MachineMemOperand::MOLoad);
22040 Chain = Src.getValue(1);
22041 }
22042
22043 SDValue StoreOps[] = { Chain, Src, StackPtr };
22044 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22045 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22046 MachineMemOperand::MOStore);
22047
22048 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22049}
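// In short (an informal summary of the helper above): when the source value
// lives in an SSE register it is spilled to the stack slot and reloaded onto
// the x87 stack with FLD, since FIST/FISTP is an x87-only instruction; the
// rounded integer is then stored back into the same slot and loaded as DstVT.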
22050
22051SDValue
22052X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22053 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22054 // but making use of X86 specifics to produce better instruction sequences.
22055 SDNode *Node = Op.getNode();
22056 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22057 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22058 SDLoc dl(SDValue(Node, 0));
22059 SDValue Src = Node->getOperand(0);
22060
22061 // There are three types involved here: SrcVT is the source floating point
22062 // type, DstVT is the type of the result, and TmpVT is the result of the
22063 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22064 // DstVT).
22065 EVT SrcVT = Src.getValueType();
22066 EVT DstVT = Node->getValueType(0);
22067 EVT TmpVT = DstVT;
22068
22069 // This code is only for floats and doubles. Fall back to generic code for
22070 // anything else.
22071 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22072 return SDValue();
22073
22074 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22075 unsigned SatWidth = SatVT.getScalarSizeInBits();
22076 unsigned DstWidth = DstVT.getScalarSizeInBits();
22077 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22078 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22079 "Expected saturation width smaller than result width");
22080
22081 // Promote result of FP_TO_*INT to at least 32 bits.
22082 if (TmpWidth < 32) {
22083 TmpVT = MVT::i32;
22084 TmpWidth = 32;
22085 }
22086
22087 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22088 // us to use a native signed conversion instead.
22089 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22090 TmpVT = MVT::i64;
22091 TmpWidth = 64;
22092 }
22093
22094 // If the saturation width is smaller than the size of the temporary result,
22095 // we can always use signed conversion, which is native.
22096 if (SatWidth < TmpWidth)
22097 FpToIntOpcode = ISD::FP_TO_SINT;
22098
22099 // Determine minimum and maximum integer values and their corresponding
22100 // floating-point values.
22101 APInt MinInt, MaxInt;
22102 if (IsSigned) {
22103 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22104 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22105 } else {
22106 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22107 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22108 }
22109
22110 const fltSemantics &Sem = SrcVT.getFltSemantics();
22111 APFloat MinFloat(Sem);
22112 APFloat MaxFloat(Sem);
22113
22114 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22115 MinInt, IsSigned, APFloat::rmTowardZero);
22116 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22117 MaxInt, IsSigned, APFloat::rmTowardZero);
22118 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22119 && !(MaxStatus & APFloat::opStatus::opInexact);
22120
22121 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22122 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22123
22124 // If the integer bounds are exactly representable as floats, emit a
22125 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22126 if (AreExactFloatBounds) {
22127 if (DstVT != TmpVT) {
22128 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22129 SDValue MinClamped = DAG.getNode(
22130 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22131 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22132 SDValue BothClamped = DAG.getNode(
22133 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22134 // Convert clamped value to integer.
22135 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22136
22137 // NaN will become INDVAL, with the top bit set and the rest zero.
22138 // Truncation will discard the top bit, resulting in zero.
22139 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22140 }
22141
22142 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22143 SDValue MinClamped = DAG.getNode(
22144 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22145 // Clamp by MaxFloat from above. NaN cannot occur.
22146 SDValue BothClamped = DAG.getNode(
22147 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22148 // Convert clamped value to integer.
22149 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22150
22151 if (!IsSigned) {
22152 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22153 // which is zero.
22154 return FpToInt;
22155 }
22156
22157 // Otherwise, select zero if Src is NaN.
22158 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22159 return DAG.getSelectCC(
22160 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22161 }
22162
22163 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22164 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22165
22166 // Result of direct conversion, which may be selected away.
22167 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22168
22169 if (DstVT != TmpVT) {
22170 // NaN will become INDVAL, with the top bit set and the rest zero.
22171 // Truncation will discard the top bit, resulting in zero.
22172 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22173 }
22174
22175 SDValue Select = FpToInt;
22176 // For signed conversions where we saturate to the same size as the
22177 // result type of the fptoi instructions, INDVAL coincides with integer
22178 // minimum, so we don't need to explicitly check it.
22179 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22180 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22181 // MinInt if Src is NaN.
22182 Select = DAG.getSelectCC(
22183 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22184 }
22185
22186 // If Src OGT MaxFloat, select MaxInt.
22187 Select = DAG.getSelectCC(
22188 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22189
22190 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22191 // is already zero. The promoted case was already handled above.
22192 if (!IsSigned || DstVT != TmpVT) {
22193 return Select;
22194 }
22195
22196 // Otherwise, select 0 if Src is NaN.
22197 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22198 return DAG.getSelectCC(
22199 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22200}
22201
22202SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22203 bool IsStrict = Op->isStrictFPOpcode();
22204
22205 SDLoc DL(Op);
22206 MVT VT = Op.getSimpleValueType();
22207 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22208 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22209 MVT SVT = In.getSimpleValueType();
22210
22211 // Let f16->f80 get lowered to a libcall, except for Darwin, where we should
22212 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available).
22213 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22214 !Subtarget.getTargetTriple().isOSDarwin()))
22215 return SDValue();
22216
22217 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22218 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22219 return Op;
22220
22221 if (SVT == MVT::f16) {
22222 if (Subtarget.hasFP16())
22223 return Op;
22224
22225 if (VT != MVT::f32) {
22226 if (IsStrict)
22227 return DAG.getNode(
22228 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22229 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22230 {MVT::f32, MVT::Other}, {Chain, In})});
22231
22232 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22233 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22234 }
22235
22236 if (!Subtarget.hasF16C()) {
22237 if (!Subtarget.getTargetTriple().isOSDarwin())
22238 return SDValue();
22239
22240 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22241
22242 // Need a libcall, but the ABI for f16 is soft-float on macOS.
22243 TargetLowering::CallLoweringInfo CLI(DAG);
22244 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22245
22246 In = DAG.getBitcast(MVT::i16, In);
22247 TargetLowering::ArgListTy Args;
22248 TargetLowering::ArgListEntry Entry(
22249 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22250 Entry.IsSExt = false;
22251 Entry.IsZExt = true;
22252 Args.push_back(Entry);
22253
22254 SDValue Callee = DAG.getExternalSymbol(
22255 getLibcallName(RTLIB::FPEXT_F16_F32),
22256 getPointerTy(DAG.getDataLayout()));
22257 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22258 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22259 std::move(Args));
22260
22261 SDValue Res;
22262 std::tie(Res,Chain) = LowerCallTo(CLI);
22263 if (IsStrict)
22264 Res = DAG.getMergeValues({Res, Chain}, DL);
22265
22266 return Res;
22267 }
22268
22269 In = DAG.getBitcast(MVT::i16, In);
22270 SDValue Res;
22271 if (IsStrict) {
22272 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22273 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22274 DAG.getVectorIdxConstant(0, DL));
22275 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22276 {Chain, In});
22277 Chain = Res.getValue(1);
22278 } else {
22279 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22280 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22281 DAG.getUNDEF(MVT::v4i32), In,
22282 DAG.getVectorIdxConstant(0, DL));
22283 In = DAG.getBitcast(MVT::v8i16, In);
22284 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22285 DAG.getTargetConstant(4, DL, MVT::i32));
22286 }
22287 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22288 DAG.getVectorIdxConstant(0, DL));
22289 if (IsStrict)
22290 return DAG.getMergeValues({Res, Chain}, DL);
22291 return Res;
22292 }
22293
22294 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22295 return Op;
22296
22297 if (SVT.getVectorElementType() == MVT::f16) {
22298 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22299 return Op;
22300 assert(Subtarget.hasF16C() && "Unexpected features!");
22301 if (SVT == MVT::v2f16)
22302 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22303 DAG.getUNDEF(MVT::v2f16));
22304 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22305 DAG.getUNDEF(MVT::v4f16));
22306 if (IsStrict)
22307 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22308 {Op->getOperand(0), Res});
22309 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22310 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22311 return Op;
22312 }
22313
22314 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22315
22316 SDValue Res =
22317 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22318 if (IsStrict)
22319 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22320 {Op->getOperand(0), Res});
22321 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22322}
22323
22324SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22325 bool IsStrict = Op->isStrictFPOpcode();
22326
22327 SDLoc DL(Op);
22328 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22329 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22330 MVT VT = Op.getSimpleValueType();
22331 MVT SVT = In.getSimpleValueType();
22332
22333 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22334 return SDValue();
22335
22336 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22337 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22338 if (!Subtarget.getTargetTriple().isOSDarwin())
22339 return SDValue();
22340
22341 // We need a libcall, but the ABI for f16 libcalls on macOS is soft-float.
22342 TargetLowering::CallLoweringInfo CLI(DAG);
22343 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22344
22346 TargetLowering::ArgListEntry Entry(
22347 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22348 Entry.IsSExt = false;
22349 Entry.IsZExt = true;
22350 Args.push_back(Entry);
22351
22353 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22354 : RTLIB::FPROUND_F32_F16),
22355 getPointerTy(DAG.getDataLayout()));
22356 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22357 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22358 std::move(Args));
22359
22360 SDValue Res;
22361 std::tie(Res, Chain) = LowerCallTo(CLI);
22362
22363 Res = DAG.getBitcast(MVT::f16, Res);
22364
22365 if (IsStrict)
22366 Res = DAG.getMergeValues({Res, Chain}, DL);
22367
22368 return Res;
22369 }
22370
22371 if (VT.getScalarType() == MVT::bf16) {
22372 if (SVT.getScalarType() == MVT::f32 &&
22373 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22374 Subtarget.hasAVXNECONVERT()))
22375 return Op;
22376 return SDValue();
22377 }
22378
22379 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22380 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22381 return SDValue();
22382
22383 if (VT.isVector())
22384 return Op;
22385
22386 SDValue Res;
22387 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22388 MVT::i32);
22389 if (IsStrict) {
22390 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22391 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22392 DAG.getVectorIdxConstant(0, DL));
22393 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22394 {Chain, Res, Rnd});
22395 Chain = Res.getValue(1);
22396 } else {
22397 // FIXME: Should we use zeros for upper elements for non-strict?
22398 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22399 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22400 }
22401
22402 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22403 DAG.getVectorIdxConstant(0, DL));
22404 Res = DAG.getBitcast(MVT::f16, Res);
22405
22406 if (IsStrict)
22407 return DAG.getMergeValues({Res, Chain}, DL);
22408
22409 return Res;
22410 }
22411
22412 return Op;
22413}
22414
22415static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22416 bool IsStrict = Op->isStrictFPOpcode();
22417 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22418 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22419 "Unexpected VT!");
22420
22421 SDLoc dl(Op);
22422 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22423 DAG.getConstant(0, dl, MVT::v8i16), Src,
22424 DAG.getVectorIdxConstant(0, dl));
22425
22426 SDValue Chain;
22427 if (IsStrict) {
22428 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22429 {Op.getOperand(0), Res});
22430 Chain = Res.getValue(1);
22431 } else {
22432 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22433 }
22434
22435 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22436 DAG.getVectorIdxConstant(0, dl));
22437
22438 if (IsStrict)
22439 return DAG.getMergeValues({Res, Chain}, dl);
22440
22441 return Res;
22442}
22443
22444static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22445 bool IsStrict = Op->isStrictFPOpcode();
22446 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22447 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22448 "Unexpected VT!");
22449
22450 SDLoc dl(Op);
22451 SDValue Res, Chain;
22452 if (IsStrict) {
22453 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22454 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22455 DAG.getVectorIdxConstant(0, dl));
22456 Res = DAG.getNode(
22457 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22458 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22459 Chain = Res.getValue(1);
22460 } else {
22461 // FIXME: Should we use zeros for upper elements for non-strict?
22462 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22463 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22464 DAG.getTargetConstant(4, dl, MVT::i32));
22465 }
22466
22467 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22468 DAG.getVectorIdxConstant(0, dl));
22469
22470 if (IsStrict)
22471 return DAG.getMergeValues({Res, Chain}, dl);
22472
22473 return Res;
22474}
22475
22476SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22477 SelectionDAG &DAG) const {
22478 SDLoc DL(Op);
22479
22480 MVT SVT = Op.getOperand(0).getSimpleValueType();
22481 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22482 Subtarget.hasAVXNECONVERT())) {
22483 SDValue Res;
22484 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22485 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22486 Res = DAG.getBitcast(MVT::v8i16, Res);
22487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22488 DAG.getVectorIdxConstant(0, DL));
22489 }
22490
22491 MakeLibCallOptions CallOptions;
22492 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22493 SDValue Res =
22494 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22495 return DAG.getBitcast(MVT::i16, Res);
22496}
22497
22498/// Depending on uarch and/or optimizing for size, we might prefer to use a
22499/// vector operation in place of the typical scalar operation.
22500static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22501 SelectionDAG &DAG,
22502 const X86Subtarget &Subtarget) {
22503 // If both operands have other uses, this is probably not profitable.
22504 SDValue LHS = Op.getOperand(0);
22505 SDValue RHS = Op.getOperand(1);
22506 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22507 return Op;
22508
22509 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22510 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22511 if (IsFP && !Subtarget.hasSSE3())
22512 return Op;
22513 if (!IsFP && !Subtarget.hasSSSE3())
22514 return Op;
22515
22516 // Extract from a common vector.
22517 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22518 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22519 LHS.getOperand(0) != RHS.getOperand(0) ||
22520 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22521 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22522 !shouldUseHorizontalOp(true, DAG, Subtarget))
22523 return Op;
22524
22525 // Allow commuted 'hadd' ops.
22526 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22527 unsigned HOpcode;
22528 switch (Op.getOpcode()) {
22529 // clang-format off
22530 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22531 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22532 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22533 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22534 default:
22535 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22536 // clang-format on
22537 }
22538 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22539 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22540 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22541 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22542 std::swap(LExtIndex, RExtIndex);
22543
22544 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22545 return Op;
22546
22547 SDValue X = LHS.getOperand(0);
22548 EVT VecVT = X.getValueType();
22549 unsigned BitWidth = VecVT.getSizeInBits();
22550 unsigned NumLanes = BitWidth / 128;
22551 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22552 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22553 "Not expecting illegal vector widths here");
22554
22555 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22556 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22557 if (BitWidth == 256 || BitWidth == 512) {
22558 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22559 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22560 LExtIndex %= NumEltsPerLane;
22561 }
22562
22563 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22564 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22565 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22566 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22567 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22568 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22569 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22570}
22571
22572/// Depending on uarch and/or optimizing for size, we might prefer to use a
22573/// vector operation in place of the typical scalar operation.
22574SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22575 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22576 "Only expecting float/double");
22577 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22578}
22579
22580/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22581/// This mode isn't supported in hardware on X86. But as long as we aren't
22582/// compiling with trapping math, we can emulate this with
22583/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22584static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22585 SDValue N0 = Op.getOperand(0);
22586 SDLoc dl(Op);
22587 MVT VT = Op.getSimpleValueType();
22588
22589 // N0 += copysign(nextafter(0.5, 0.0), N0)
22590 const fltSemantics &Sem = VT.getFltSemantics();
22591 bool Ignored;
22592 APFloat Point5Pred = APFloat(0.5f);
22593 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22594 Point5Pred.next(/*nextDown*/true);
22595
22596 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22597 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22598 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22599
22600 // Truncate the result to remove fraction.
22601 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22602}
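// Worked example for the emulation above (f32, default round-to-nearest-even):
//   Adder = copysign(nextafter(0.5f, 0.0f), X) = +/-(0.5f - 2^-25)
//   X = 0.49999997f : X + Adder = 1.0f - 2^-24  (exact)    -> ftrunc = 0.0f
//   X = 0.5f        : X + Adder = 1.0f          (rounded)  -> ftrunc = 1.0f
// Ties are rounded away from zero while values just below 0.5 are unaffected,
// matching the ISD::FROUND semantics under the non-trapping-math assumption
// noted above.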
22603
22604/// The only differences between FABS and FNEG are the mask and the logic op.
22605/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22606static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22607 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22608 "Wrong opcode for lowering FABS or FNEG.");
22609
22610 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22611
22612 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22613 // into an FNABS. We'll lower the FABS after that if it is still in use.
22614 if (IsFABS)
22615 for (SDNode *User : Op->users())
22616 if (User->getOpcode() == ISD::FNEG)
22617 return Op;
22618
22619 SDLoc dl(Op);
22620 MVT VT = Op.getSimpleValueType();
22621
22622 bool IsF128 = (VT == MVT::f128);
22623 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22624 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22625 "Unexpected type in LowerFABSorFNEG");
22626
22627 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22628 // decide if we should generate a 16-byte constant mask when we only need 4 or
22629 // 8 bytes for the scalar case.
22630
22631 // There are no scalar bitwise logical SSE/AVX instructions, so we
22632 // generate a 16-byte vector constant and logic op even for the scalar case.
22633 // Using a 16-byte mask allows folding the load of the mask with
22634 // the logic op, so it can save (~4 bytes) on code size.
22635 bool IsFakeVector = !VT.isVector() && !IsF128;
22636 MVT LogicVT = VT;
22637 if (IsFakeVector)
22638 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22639 : (VT == MVT::f32) ? MVT::v4f32
22640 : MVT::v8f16;
22641
22642 unsigned EltBits = VT.getScalarSizeInBits();
22643 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22644 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22645 APInt::getSignMask(EltBits);
22646 const fltSemantics &Sem = VT.getFltSemantics();
22647 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22648
22649 SDValue Op0 = Op.getOperand(0);
22650 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22651 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22652 IsFNABS ? X86ISD::FOR :
22653 X86ISD::FXOR;
22654 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22655
22656 if (VT.isVector() || IsF128)
22657 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22658
22659 // For the scalar case extend to a 128-bit vector, perform the logic op,
22660 // and extract the scalar result back out.
22661 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22662 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22663 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22664 DAG.getVectorIdxConstant(0, dl));
22665}
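// Informal summary of the masks chosen above, shown here for f32 elements:
//   FABS(x)       -> FAND x, 0x7FFFFFFF   (clear the sign bit)
//   FNEG(x)       -> FXOR x, 0x80000000   (flip the sign bit)
//   FNEG(FABS(x)) -> FOR  x, 0x80000000   (force the sign bit, i.e. FNABS)
// The mask is materialized as a 16-byte vector constant so its load can fold
// into the logic instruction even in the scalar case.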
22666
22667static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22668 SDValue Mag = Op.getOperand(0);
22669 SDValue Sign = Op.getOperand(1);
22670 SDLoc dl(Op);
22671
22672 // If the sign operand is smaller, extend it first.
22673 MVT VT = Op.getSimpleValueType();
22674 if (Sign.getSimpleValueType().bitsLT(VT))
22675 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22676
22677 // And if it is bigger, shrink it first.
22678 if (Sign.getSimpleValueType().bitsGT(VT))
22679 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22680 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22681
22682 // At this point the operands and the result should have the same
22683 // type, and that won't be f80 since that is not custom lowered.
22684 bool IsF128 = (VT == MVT::f128);
22685 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22687 "Unexpected type in LowerFCOPYSIGN");
22688
22689 const fltSemantics &Sem = VT.getFltSemantics();
22690
22691 // Perform all scalar logic operations as 16-byte vectors because there are no
22692 // scalar FP logic instructions in SSE.
22693 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22694 // unnecessary splats, but we might miss load folding opportunities. Should
22695 // this decision be based on OptimizeForSize?
22696 bool IsFakeVector = !VT.isVector() && !IsF128;
22697 MVT LogicVT = VT;
22698 if (IsFakeVector)
22699 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22700 : (VT == MVT::f32) ? MVT::v4f32
22701 : MVT::v8f16;
22702
22703 // The mask constants are automatically splatted for vector types.
22704 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22705 SDValue SignMask = DAG.getConstantFP(
22706 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22707 SDValue MagMask = DAG.getConstantFP(
22708 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22709
22710 // First, clear all bits but the sign bit from the second operand (sign).
22711 if (IsFakeVector)
22712 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22713 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22714
22715 // Next, clear the sign bit from the first operand (magnitude).
22716 // TODO: If we had general constant folding for FP logic ops, this check
22717 // wouldn't be necessary.
22718 SDValue MagBits;
22719 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22720 APFloat APF = Op0CN->getValueAPF();
22721 APF.clearSign();
22722 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22723 } else {
22724 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22725 if (IsFakeVector)
22726 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22727 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22728 }
22729
22730 // OR the magnitude value with the sign bit.
22731 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22732 return !IsFakeVector ? Or
22733 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22734 DAG.getVectorIdxConstant(0, dl));
22735}
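// Illustrative sketch for 'copysign double %mag, double %sign' with a
// non-constant magnitude (editorial example, not from the source):
//   SignBit = X86ISD::FAND (v2f64 %sign), splat(0x8000000000000000)
//   MagBits = X86ISD::FAND (v2f64 %mag),  splat(0x7FFFFFFFFFFFFFFF)
//   %r      = extract_vector_elt (X86ISD::FOR MagBits, SignBit), 0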
22736
22737static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22738 SDValue N0 = Op.getOperand(0);
22739 SDLoc dl(Op);
22740 MVT VT = Op.getSimpleValueType();
22741
22742 MVT OpVT = N0.getSimpleValueType();
22743 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22744 "Unexpected type for FGETSIGN");
22745
22746 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22747 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22748 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22749 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22750 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22751 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22752 return Res;
22753}
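// Example: for an f32 input, MOVMSK of the v4f32 yields a 4-bit sign mask in
// i32; bit 0 holds the sign of the original scalar, so the trailing
// 'and ..., 1' produces FGETSIGN's 0/1 result.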
22754
22755/// Helper for attempting to create an X86ISD::BT node.
22756static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22757 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22758 // instruction. Since the shift amount is in-range-or-undefined, we know
22759 // that doing a bittest on the i32 value is ok. We extend to i32 because
22760 // the encoding for the i16 version is larger than the i32 version.
22761 // Also promote i16 to i32 for performance / code size reasons.
22762 if (Src.getValueType().getScalarSizeInBits() < 32)
22763 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22764
22765 // No legal type found, give up.
22766 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22767 return SDValue();
22768
22769 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22770 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22771 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22772 // known to be zero.
22773 if (Src.getValueType() == MVT::i64 &&
22774 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22775 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22776
22777 // If the operand types disagree, extend the shift amount to match. Since
22778 // BT ignores high bits (like shifts) we can use anyextend.
22779 if (Src.getValueType() != BitNo.getValueType()) {
22780 // Peek through a mask/modulo operation.
22781 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22782 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22783 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22784 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22785 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22786 BitNo.getOperand(0)),
22787 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22788 BitNo.getOperand(1)));
22789 else
22790 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22791 }
22792
22793 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22794}
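// Example: testing a bit of an i16 value is done on its any_extended i32
// form; BT only reads the bit selected by BitNo, so the undefined upper bits
// introduced by the extension are harmless and the encoding is shorter.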
22795
22796/// Helper for creating an X86ISD::SETCC node.
22797static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22798 SelectionDAG &DAG) {
22799 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22800 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22801}
22802
22803/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22804/// recognizable memcmp expansion.
22805static bool isOrXorXorTree(SDValue X, bool Root = true) {
22806 if (X.getOpcode() == ISD::OR)
22807 return isOrXorXorTree(X.getOperand(0), false) &&
22808 isOrXorXorTree(X.getOperand(1), false);
22809 if (Root)
22810 return false;
22811 return X.getOpcode() == ISD::XOR;
22812}
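// Example of a tree accepted above (e.g. from a 64-byte memcmp expansion):
//   (or (or (xor A0, B0), (xor A1, B1)), (or (xor A2, B2), (xor A3, B3)))
// The root must be an OR and every leaf must be an XOR.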
22813
22814/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22815/// expansion.
22816template <typename F>
22817static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22818 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22819 SDValue Op0 = X.getOperand(0);
22820 SDValue Op1 = X.getOperand(1);
22821 if (X.getOpcode() == ISD::OR) {
22822 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22823 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22824 if (VecVT != CmpVT)
22825 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22826 if (HasPT)
22827 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22828 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22829 }
22830 if (X.getOpcode() == ISD::XOR) {
22831 SDValue A = SToV(Op0);
22832 SDValue B = SToV(Op1);
22833 if (VecVT != CmpVT)
22834 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22835 if (HasPT)
22836 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22837 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22838 }
22839 llvm_unreachable("Impossible");
22840}
22841
22842/// Try to map a 128-bit or larger integer comparison to vector instructions
22843/// before type legalization splits it up into chunks.
22844static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22845 ISD::CondCode CC,
22846 const SDLoc &DL,
22847 SelectionDAG &DAG,
22848 const X86Subtarget &Subtarget) {
22849 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22850
22851 // We're looking for an oversized integer equality comparison.
22852 EVT OpVT = X.getValueType();
22853 unsigned OpSize = OpVT.getSizeInBits();
22854 if (!OpVT.isScalarInteger() || OpSize < 128)
22855 return SDValue();
22856
22857 // Ignore a comparison with zero because that gets special treatment in
22858 // EmitTest(). But make an exception for the special case of a pair of
22859 // logically-combined vector-sized operands compared to zero. This pattern may
22860 // be generated by the memcmp expansion pass with oversized integer compares
22861 // (see PR33325).
22862 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22863 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22864 return SDValue();
22865
22866 // Don't perform this combine if constructing the vector will be expensive.
22867 auto IsVectorBitCastCheap = [](SDValue X) {
22868 X = peekThroughBitcasts(X);
22869 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22870 X.getOpcode() == ISD::LOAD;
22871 };
22872 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22873 !IsOrXorXorTreeCCZero)
22874 return SDValue();
22875
22876 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22877 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22878 // Otherwise use PCMPEQ (plus AND) and mask testing.
22879 bool NoImplicitFloatOps =
22880 DAG.getMachineFunction().getFunction().hasFnAttribute(
22881 Attribute::NoImplicitFloat);
22882 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22883 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22884 (OpSize == 256 && Subtarget.hasAVX()) ||
22885 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22886 bool HasPT = Subtarget.hasSSE41();
22887
22888 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22889 // vector registers are essentially free. (Technically, widening registers
22890 // prevents load folding, but the tradeoff is worth it.)
22891 bool PreferKOT = Subtarget.preferMaskRegisters();
22892 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22893
22894 EVT VecVT = MVT::v16i8;
22895 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22896 if (OpSize == 256) {
22897 VecVT = MVT::v32i8;
22898 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22899 }
22900 EVT CastVT = VecVT;
22901 bool NeedsAVX512FCast = false;
22902 if (OpSize == 512 || NeedZExt) {
22903 if (Subtarget.hasBWI()) {
22904 VecVT = MVT::v64i8;
22905 CmpVT = MVT::v64i1;
22906 if (OpSize == 512)
22907 CastVT = VecVT;
22908 } else {
22909 VecVT = MVT::v16i32;
22910 CmpVT = MVT::v16i1;
22911 CastVT = OpSize == 512 ? VecVT
22912 : OpSize == 256 ? MVT::v8i32
22913 : MVT::v4i32;
22914 NeedsAVX512FCast = true;
22915 }
22916 }
22917
22918 auto ScalarToVector = [&](SDValue X) -> SDValue {
22919 bool TmpZext = false;
22920 EVT TmpCastVT = CastVT;
22921 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22922 SDValue OrigX = X.getOperand(0);
22923 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22924 if (OrigSize < OpSize) {
22925 if (OrigSize == 128) {
22926 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22927 X = OrigX;
22928 TmpZext = true;
22929 } else if (OrigSize == 256) {
22930 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22931 X = OrigX;
22932 TmpZext = true;
22933 }
22934 }
22935 }
22936 X = DAG.getBitcast(TmpCastVT, X);
22937 if (!NeedZExt && !TmpZext)
22938 return X;
22939 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22940 DAG.getConstant(0, DL, VecVT), X,
22941 DAG.getVectorIdxConstant(0, DL));
22942 };
22943
22944 SDValue Cmp;
22945 if (IsOrXorXorTreeCCZero) {
22946 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22947 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22948 // Use 2 vector equality compares and 'and' the results before doing a
22949 // MOVMSK.
22950 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22951 } else {
22952 SDValue VecX = ScalarToVector(X);
22953 SDValue VecY = ScalarToVector(Y);
22954 if (VecVT != CmpVT) {
22955 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22956 } else if (HasPT) {
22957 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22958 } else {
22959 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22960 }
22961 }
22962 // AVX512 should emit a setcc that will lower to kortest.
22963 if (VecVT != CmpVT) {
22964 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22965 : CmpVT == MVT::v32i1 ? MVT::i32
22966 : MVT::i16;
22967 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22968 DAG.getConstant(0, DL, KRegVT), CC);
22969 }
22970 if (HasPT) {
22971 SDValue BCCmp =
22972 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22973 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22974 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22975 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22976 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22977 }
22978 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22979 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22980 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22981 assert(Cmp.getValueType() == MVT::v16i8 &&
22982 "Non 128-bit vector on pre-SSE41 target");
22983 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22984 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22985 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22986 }
22987
22988 return SDValue();
22989}
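// Illustrative result on an SSE4.1 target (editorial sketch):
//   setcc i128 X, Y, eq
//     --> setcc e, (X86ISD::PTEST (xor v2i64 X', Y'), (xor v2i64 X', Y'))
// i.e. XOR the bitcast operands and let PTEST set ZF when all bits are zero.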
22990
22991/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22992/// style scalarized (associative) reduction patterns. Partial reductions
22993/// are supported when the pointer SrcMask is non-null.
22994/// TODO - move this to SelectionDAG?
22995static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22996 SmallVectorImpl<SDValue> &SrcOps,
22997 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22998 SmallVector<SDValue, 8> Opnds;
22999 DenseMap<SDValue, APInt> SrcOpMap;
23000 EVT VT = MVT::Other;
23001
23002 // Recognize a special case where a vector is cast into a wide integer to
23003 // test all 0s.
23004 assert(Op.getOpcode() == unsigned(BinOp) &&
23005 "Unexpected bit reduction opcode");
23006 Opnds.push_back(Op.getOperand(0));
23007 Opnds.push_back(Op.getOperand(1));
23008
23009 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23010 SDValue I = Opnds[Slot];
23011 // BFS traverse all BinOp operands.
23012 if (I->getOpcode() == unsigned(BinOp)) {
23013 Opnds.push_back(I->getOperand(0));
23014 Opnds.push_back(I->getOperand(1));
23015 // Re-evaluate the number of nodes to be traversed.
23016 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23017 continue;
23018 }
23019
23020 // Quit if this is not an EXTRACT_VECTOR_ELT.
23021 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23022 return false;
23023
23024 // Quit if the index is not a constant.
23025 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23026 if (!Idx)
23027 return false;
23028
23029 SDValue Src = I->getOperand(0);
23030 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23031 if (M == SrcOpMap.end()) {
23032 VT = Src.getValueType();
23033 // Quit if not the same type.
23034 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23035 return false;
23036 unsigned NumElts = VT.getVectorNumElements();
23037 APInt EltCount = APInt::getZero(NumElts);
23038 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23039 SrcOps.push_back(Src);
23040 }
23041
23042 // Quit if element already used.
23043 unsigned CIdx = Idx->getZExtValue();
23044 if (M->second[CIdx])
23045 return false;
23046 M->second.setBit(CIdx);
23047 }
23048
23049 if (SrcMask) {
23050 // Collect the source partial masks.
23051 for (SDValue &SrcOp : SrcOps)
23052 SrcMask->push_back(SrcOpMap[SrcOp]);
23053 } else {
23054 // Quit if not all elements are used.
23055 for (const auto &I : SrcOpMap)
23056 if (!I.second.isAllOnes())
23057 return false;
23058 }
23059
23060 return true;
23061}
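// Example: for v4i32 X,
//   Op = or(extract(X,0), or(extract(X,1), or(extract(X,2), extract(X,3))))
// matches with SrcOps = { X } and, if requested, SrcMask = { 0b1111 }.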
23062
23063// Helper function for comparing all bits of two vectors.
23064static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23065 ISD::CondCode CC, const APInt &OriginalMask,
23066 const X86Subtarget &Subtarget,
23067 SelectionDAG &DAG, X86::CondCode &X86CC) {
23068 EVT VT = LHS.getValueType();
23069 unsigned ScalarSize = VT.getScalarSizeInBits();
23070 if (OriginalMask.getBitWidth() != ScalarSize) {
23071 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23072 return SDValue();
23073 }
23074
23075 // Quit if not convertible to legal scalar or 128/256-bit vector.
23076 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23077 return SDValue();
23078
23079 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23080 if (VT.isFloatingPoint())
23081 return SDValue();
23082
23083 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23084 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23085
23086 APInt Mask = OriginalMask;
23087
23088 auto MaskBits = [&](SDValue Src) {
23089 if (Mask.isAllOnes())
23090 return Src;
23091 EVT SrcVT = Src.getValueType();
23092 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23093 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23094 };
23095
23096 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23097 if (VT.getSizeInBits() < 128) {
23098 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23099 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23100 if (IntVT != MVT::i64)
23101 return SDValue();
23102 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23103 MVT::i32, MVT::i32);
23104 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23105 MVT::i32, MVT::i32);
23106 SDValue Lo =
23107 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23108 SDValue Hi =
23109 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23110 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23111 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23112 DAG.getConstant(0, DL, MVT::i32));
23113 }
23114 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23115 DAG.getBitcast(IntVT, MaskBits(LHS)),
23116 DAG.getBitcast(IntVT, MaskBits(RHS)));
23117 }
23118
23119 // Without PTEST, a masked v2i64 or-reduction is not faster than
23120 // scalarization.
23121 bool UseKORTEST = Subtarget.useAVX512Regs();
23122 bool UsePTEST = Subtarget.hasSSE41();
23123 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23124 return SDValue();
23125
23126 // Split down to 128/256/512-bit vector.
23127 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23128
23129 // If the input vector has vector elements wider than the target test size,
23130 // then cast to <X x i64> so it will safely split.
23131 if (ScalarSize > TestSize) {
23132 if (!Mask.isAllOnes())
23133 return SDValue();
23134 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23135 LHS = DAG.getBitcast(VT, LHS);
23136 RHS = DAG.getBitcast(VT, RHS);
23137 Mask = APInt::getAllOnes(64);
23138 }
23139
23140 if (VT.getSizeInBits() > TestSize) {
23141 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23142 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23143 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23144 while (VT.getSizeInBits() > TestSize) {
23145 auto Split = DAG.SplitVector(LHS, DL);
23146 VT = Split.first.getValueType();
23147 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23148 }
23149 RHS = DAG.getAllOnesConstant(DL, VT);
23150 } else if (!UsePTEST && !KnownRHS.isZero()) {
23151 // MOVMSK Special Case:
23152 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23153 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23154 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23155 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23156 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23157 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23158 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23159 V = DAG.getSExtOrTrunc(V, DL, VT);
23160 while (VT.getSizeInBits() > TestSize) {
23161 auto Split = DAG.SplitVector(V, DL);
23162 VT = Split.first.getValueType();
23163 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23164 }
23165 V = DAG.getNOT(DL, V, VT);
23166 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23167 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23168 DAG.getConstant(0, DL, MVT::i32));
23169 } else {
23170 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23171 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23172 while (VT.getSizeInBits() > TestSize) {
23173 auto Split = DAG.SplitVector(V, DL);
23174 VT = Split.first.getValueType();
23175 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23176 }
23177 LHS = V;
23178 RHS = DAG.getConstant(0, DL, VT);
23179 }
23180 }
23181
23182 if (UseKORTEST && VT.is512BitVector()) {
23183 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23184 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23185 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23186 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23187 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23188 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23189 }
23190
23191 if (UsePTEST) {
23192 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23193 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23194 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23195 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23196 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23197 }
23198
23199 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23200 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23201 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23202 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23203 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23204 V = DAG.getNOT(DL, V, MaskVT);
23205 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23206 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23207 DAG.getConstant(0, DL, MVT::i32));
23208}
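// Example of the final pre-SSE4.1 fallback for v16i8 operands:
//   V = PCMPEQ LHS, RHS; V = NOT V; CMP (MOVMSK V), 0
// so ZF is set exactly when every byte compared equal.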
23209
23210// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23211// to CMP(MOVMSK(PCMPEQB(X,Y))).
23212static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23213 ISD::CondCode CC, const SDLoc &DL,
23214 const X86Subtarget &Subtarget,
23215 SelectionDAG &DAG,
23216 X86::CondCode &X86CC) {
23217 SDValue Op = OrigLHS;
23218
23219 bool CmpNull;
23220 APInt Mask;
23221 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23222 CmpNull = isNullConstant(OrigRHS);
23223 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23224 return SDValue();
23225
23226 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23227 return SDValue();
23228
23229 // Check whether we're masking/truncating an OR-reduction result, in which
23230 // case track the masked bits.
23231 // TODO: Add CmpAllOnes support.
23232 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23233 if (CmpNull) {
23234 switch (Op.getOpcode()) {
23235 case ISD::TRUNCATE: {
23236 SDValue Src = Op.getOperand(0);
23237 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23238 Op.getScalarValueSizeInBits());
23239 Op = Src;
23240 break;
23241 }
23242 case ISD::AND: {
23243 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23244 Mask = Cst->getAPIntValue();
23245 Op = Op.getOperand(0);
23246 }
23247 break;
23248 }
23249 }
23250 }
23251 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23252 CC = ISD::SETEQ;
23253 CmpNull = true;
23254 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23255 } else {
23256 return SDValue();
23257 }
23258
23259 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23260
23261 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23262 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23263 SmallVector<SDValue, 8> VecIns;
23264 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23265 EVT VT = VecIns[0].getValueType();
23266 assert(llvm::all_of(VecIns,
23267 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23268 "Reduction source vector mismatch");
23269
23270 // Quit if not splittable to scalar/128/256/512-bit vector.
23271 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23272 return SDValue();
23273
23274 // If more than one full vector is evaluated, AND/OR them first before
23275 // PTEST.
23276 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23277 Slot += 2, e += 1) {
23278 // Each iteration will AND/OR 2 nodes and append the result until there is
23279 // only 1 node left, i.e. the final value of all vectors.
23280 SDValue LHS = VecIns[Slot];
23281 SDValue RHS = VecIns[Slot + 1];
23282 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23283 }
23284
23285 return LowerVectorAllEqual(DL, VecIns.back(),
23286 CmpNull ? DAG.getConstant(0, DL, VT)
23287 : DAG.getAllOnesConstant(DL, VT),
23288 CC, Mask, Subtarget, DAG, X86CC);
23289 }
23290
23291 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23292 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23293 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23294 ISD::NodeType BinOp;
23295 if (SDValue Match =
23296 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23297 EVT MatchVT = Match.getValueType();
23298 return LowerVectorAllEqual(DL, Match,
23299 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23300 : DAG.getAllOnesConstant(DL, MatchVT),
23301 CC, Mask, Subtarget, DAG, X86CC);
23302 }
23303 }
23304
23305 if (Mask.isAllOnes()) {
23306 assert(!Op.getValueType().isVector() &&
23307 "Illegal vector type for reduction pattern");
23308 SDValue Src = peekThroughBitcasts(Op);
23309 if (Src.getValueType().isFixedLengthVector() &&
23310 Src.getValueType().getScalarType() == MVT::i1) {
23311 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23312 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23313 if (Src.getOpcode() == ISD::SETCC) {
23314 SDValue LHS = Src.getOperand(0);
23315 SDValue RHS = Src.getOperand(1);
23316 EVT LHSVT = LHS.getValueType();
23317 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23318 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23319 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23320 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23321 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23322 X86CC);
23323 }
23324 }
23325 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23326 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23327 // Peek through truncation, mask the LSB and compare against zero/LSB.
23328 if (Src.getOpcode() == ISD::TRUNCATE) {
23329 SDValue Inner = Src.getOperand(0);
23330 EVT InnerVT = Inner.getValueType();
23331 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23332 unsigned BW = InnerVT.getScalarSizeInBits();
23333 APInt SrcMask = APInt(BW, 1);
23334 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23335 return LowerVectorAllEqual(DL, Inner,
23336 DAG.getConstant(Cmp, DL, InnerVT), CC,
23337 SrcMask, Subtarget, DAG, X86CC);
23338 }
23339 }
23340 }
23341 }
23342
23343 return SDValue();
23344}
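// Note on the SETGT/-1 case above: 'icmp sgt X, -1' is true iff the sign bit
// of X is clear, so an all-of reduction over sign bits becomes an equality
// test of the sign-masked value against zero.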
23345
23346/// return true if \c Op has a use that doesn't just read flags.
23347static bool hasNonFlagsUse(SDValue Op) {
23348 for (SDUse &Use : Op->uses()) {
23349 SDNode *User = Use.getUser();
23350 unsigned UOpNo = Use.getOperandNo();
23351 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23352 // Look past truncate.
23353 UOpNo = User->use_begin()->getOperandNo();
23354 User = User->use_begin()->getUser();
23355 }
23356
23357 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23358 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23359 return true;
23360 }
23361 return false;
23362}
23363
23364// Transform to an x86-specific ALU node with flags if there is a chance of
23365// using an RMW op or only the flags are used. Otherwise, leave
23366// the node alone and emit a 'cmp' or 'test' instruction.
23367static bool isProfitableToUseFlagOp(SDValue Op) {
23368 for (SDNode *U : Op->users())
23369 if (U->getOpcode() != ISD::CopyToReg &&
23370 U->getOpcode() != ISD::SETCC &&
23371 U->getOpcode() != ISD::STORE)
23372 return false;
23373
23374 return true;
23375}
23376
23377/// Emit nodes that will be selected as "test Op0,Op0", or something
23378/// equivalent.
23379static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23380 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23381 // CF and OF aren't always set the way we want. Determine which
23382 // of these we need.
23383 bool NeedCF = false;
23384 bool NeedOF = false;
23385 switch (X86CC) {
23386 default: break;
23387 case X86::COND_A: case X86::COND_AE:
23388 case X86::COND_B: case X86::COND_BE:
23389 NeedCF = true;
23390 break;
23391 case X86::COND_G: case X86::COND_GE:
23392 case X86::COND_L: case X86::COND_LE:
23393 case X86::COND_O: case X86::COND_NO: {
23394 // Check if we really need to set the
23395 // Overflow flag. If NoSignedWrap is present
23396 // that is not actually needed.
23397 switch (Op->getOpcode()) {
23398 case ISD::ADD:
23399 case ISD::SUB:
23400 case ISD::MUL:
23401 case ISD::SHL:
23402 if (Op.getNode()->getFlags().hasNoSignedWrap())
23403 break;
23404 [[fallthrough]];
23405 default:
23406 NeedOF = true;
23407 break;
23408 }
23409 break;
23410 }
23411 }
23412 // See if we can use the EFLAGS value from the operand instead of
23413 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23414 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23415 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23416 // Emit a CMP with 0, which is the TEST pattern.
23417 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23418 DAG.getConstant(0, dl, Op.getValueType()));
23419 }
23420 unsigned Opcode = 0;
23421 unsigned NumOperands = 0;
23422
23423 SDValue ArithOp = Op;
23424
23425 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23426 // which may be the result of a CAST. We use the variable 'Op', which is the
23427 // non-casted variable when we check for possible users.
23428 switch (ArithOp.getOpcode()) {
23429 case ISD::AND:
23430 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23431 // because a TEST instruction will be better.
23432 if (!hasNonFlagsUse(Op))
23433 break;
23434
23435 [[fallthrough]];
23436 case ISD::ADD:
23437 case ISD::SUB:
23438 case ISD::OR:
23439 case ISD::XOR:
23440 if (!isProfitableToUseFlagOp(Op))
23441 break;
23442
23443 // Otherwise use a regular EFLAGS-setting instruction.
23444 switch (ArithOp.getOpcode()) {
23445 // clang-format off
23446 default: llvm_unreachable("unexpected operator!");
23447 case ISD::ADD: Opcode = X86ISD::ADD; break;
23448 case ISD::SUB: Opcode = X86ISD::SUB; break;
23449 case ISD::XOR: Opcode = X86ISD::XOR; break;
23450 case ISD::AND: Opcode = X86ISD::AND; break;
23451 case ISD::OR: Opcode = X86ISD::OR; break;
23452 // clang-format on
23453 }
23454
23455 NumOperands = 2;
23456 break;
23457 case X86ISD::ADD:
23458 case X86ISD::SUB:
23459 case X86ISD::OR:
23460 case X86ISD::XOR:
23461 case X86ISD::AND:
23462 return SDValue(Op.getNode(), 1);
23463 case ISD::SSUBO:
23464 case ISD::USUBO: {
23465 // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23466 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23467 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23468 Op->getOperand(1)).getValue(1);
23469 }
23470 default:
23471 break;
23472 }
23473
23474 if (Opcode == 0) {
23475 // Emit a CMP with 0, which is the TEST pattern.
23476 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23477 DAG.getConstant(0, dl, Op.getValueType()));
23478 }
23479 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23480 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23481
23482 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23483 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23484 return SDValue(New.getNode(), 1);
23485}
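// Example: for '(a & b) == 0' used only by a branch, the early break above
// leaves the generic CMP-with-0 path, which isel turns into TEST a, b; if the
// AND value has other (non-flag) users, the X86ISD::AND produced here gives
// both the value and EFLAGS from a single instruction.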
23486
23487/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23488/// equivalent.
23489static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23490 const SDLoc &dl, SelectionDAG &DAG,
23491 const X86Subtarget &Subtarget) {
23492 if (isNullConstant(Op1))
23493 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23494
23495 EVT CmpVT = Op0.getValueType();
23496
23497 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23498 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23499
23500 // Only promote the compare up to i32 if it is a 16-bit operation
23501 // with an immediate. 16-bit immediates are to be avoided unless the target
23502 // isn't slowed down by length-changing prefixes, we're optimizing for
23503 // code size, or the comparison is with a folded load.
23504 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23505 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23506 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23507 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23508 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23509 // Don't do this if the immediate can fit in 8-bits.
23510 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23511 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23512 unsigned ExtendOp =
23513 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23514 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23515 // For equality comparisons try to use SIGN_EXTEND if the input was
23516 // truncate from something with enough sign bits.
23517 if (Op0.getOpcode() == ISD::TRUNCATE) {
23518 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23519 ExtendOp = ISD::SIGN_EXTEND;
23520 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23521 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23522 ExtendOp = ISD::SIGN_EXTEND;
23523 }
23524 }
23525
23526 CmpVT = MVT::i32;
23527 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23528 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23529 }
23530 }
23531
23532 // Try to shrink i64 compares if the input has enough zero bits.
23533 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23534 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23535 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23536 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23537 CmpVT = MVT::i32;
23538 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23539 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23540 }
23541
23542 // Try to shrink all i64 compares if the inputs are representable as signed
23543 // i32.
23544 if (CmpVT == MVT::i64 &&
23545 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23546 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23547 CmpVT = MVT::i32;
23548 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23549 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23550 }
23551
23552 // 0-x == y --> x+y == 0
23553 // 0-x != y --> x+y != 0
23554 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23555 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23556 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23557 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23558 return Add.getValue(1);
23559 }
23560
23561 // x == 0-y --> x+y == 0
23562 // x != 0-y --> x+y != 0
23563 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23564 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23565 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23566 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23567 return Add.getValue(1);
23568 }
23569
23570 // If we already have an XOR of the ops, use that to check for equality.
23571 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23572 unsigned X86Opc = X86ISD::SUB;
23573 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23574 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23575 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23576 X86Opc = X86ISD::XOR;
23577
23578 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23579 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23580 return CmpOp.getValue(1);
23581}
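// Example of the final fold: if 'a ^ b' already exists in the DAG and we are
// testing 'a == b', emitting X86ISD::XOR lets one instruction produce both
// the xor result and ZF, instead of a separate SUB/CMP.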
23582
23587
23588bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23589 SDNode *N, SDValue, SDValue IntPow2) const {
23590 if (N->getOpcode() == ISD::FDIV)
23591 return true;
23592
23593 EVT FPVT = N->getValueType(0);
23594 EVT IntVT = IntPow2.getValueType();
23595
23596 // This indicates a non-free bitcast.
23597 // TODO: This is probably overly conservative as we will need to scale the
23598 // integer vector anyways for the int->fp cast.
23599 if (FPVT.isVector() &&
23600 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23601 return false;
23602
23603 return true;
23604}
23605
23606/// Check if replacement of SQRT with RSQRT should be disabled.
23607bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23608 EVT VT = Op.getValueType();
23609
23610 // We don't need to replace SQRT with RSQRT for half type.
23611 if (VT.getScalarType() == MVT::f16)
23612 return true;
23613
23614 // We never want to use both SQRT and RSQRT instructions for the same input.
23615 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23616 return false;
23617
23618 if (VT.isVector())
23619 return Subtarget.hasFastVectorFSQRT();
23620 return Subtarget.hasFastScalarFSQRT();
23621}
23622
23623/// The minimum architected relative accuracy is 2^-12. We need one
23624/// Newton-Raphson step to have a good float result (24 bits of precision).
23625SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23626 SelectionDAG &DAG, int Enabled,
23627 int &RefinementSteps,
23628 bool &UseOneConstNR,
23629 bool Reciprocal) const {
23630 SDLoc DL(Op);
23631 EVT VT = Op.getValueType();
23632
23633 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23634 // It is likely not profitable to do this for f64 because a double-precision
23635 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23636 // instructions: convert to single, rsqrtss, convert back to double, refine
23637 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23638 // along with FMA, this could be a throughput win.
23639 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23640 // after legalize types.
23641 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23642 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23643 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23644 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23645 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23646 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23647 RefinementSteps = 1;
23648
23649 UseOneConstNR = false;
23650 // There is no FSQRT for 512-bits, but there is RSQRT14.
23651 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23652 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23653 if (RefinementSteps == 0 && !Reciprocal)
23654 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23655 return Estimate;
23656 }
23657
23658 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23659 Subtarget.hasFP16()) {
23660 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23661 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23662 RefinementSteps = 0;
23663
23664 if (VT == MVT::f16) {
23665 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23666 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23667 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23668 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23669 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23670 }
23671
23672 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23673 }
23674 return SDValue();
23675}
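// For reference, the standard single Newton-Raphson refinement of an rsqrt
// estimate E for input X is E' = E * (1.5 - 0.5 * X * E * E), which roughly
// doubles the ~12-bit hardware accuracy to the ~24 bits needed for f32.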
23676
23677/// The minimum architected relative accuracy is 2^-12. We need one
23678/// Newton-Raphson step to have a good float result (24 bits of precision).
23679SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23680 int Enabled,
23681 int &RefinementSteps) const {
23682 SDLoc DL(Op);
23683 EVT VT = Op.getValueType();
23684
23685 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23686 // It is likely not profitable to do this for f64 because a double-precision
23687 // reciprocal estimate with refinement on x86 prior to FMA requires
23688 // 15 instructions: convert to single, rcpss, convert back to double, refine
23689 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23690 // along with FMA, this could be a throughput win.
23691
23692 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23693 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23694 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23695 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23696 // Enable estimate codegen with 1 refinement step for vector division.
23697 // Scalar division estimates are disabled because they break too much
23698 // real-world code. These defaults are intended to match GCC behavior.
23699 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23700 return SDValue();
23701
23702 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23703 RefinementSteps = 1;
23704
23705 // There is no FSQRT for 512-bits, but there is RCP14.
23706 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23707 return DAG.getNode(Opcode, DL, VT, Op);
23708 }
23709
23710 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23711 Subtarget.hasFP16()) {
23712 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23713 RefinementSteps = 0;
23714
23715 if (VT == MVT::f16) {
23716 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23717 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23718 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23719 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23720 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23721 }
23722
23723 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23724 }
23725 return SDValue();
23726}
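// For reference, one Newton-Raphson step for a reciprocal estimate E of X is
// E' = E * (2.0 - X * E) = E + E * (1.0 - X * E), again roughly doubling the
// ~12-bit accuracy of RCPPS/RCP14 per step.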
23727
23728/// If we have at least two divisions that use the same divisor, convert to
23729/// multiplication by a reciprocal. This may need to be adjusted for a given
23730/// CPU if a division's cost is not at least twice the cost of a multiplication.
23731/// This is because we still need one division to calculate the reciprocal and
23732/// then we need two multiplies by that reciprocal as replacements for the
23733/// original divisions.
23734unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23735 return 2;
23736}
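// Example: 'x/d + y/d' can become 't = 1.0/d; x*t + y*t', trading two
// divisions for one division plus two multiplies; with fewer than two
// divisions sharing 'd' the reciprocal would not pay for itself.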
23737
23738SDValue
23739X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23740 SelectionDAG &DAG,
23741 SmallVectorImpl<SDNode *> &Created) const {
23742 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23743 if (isIntDivCheap(N->getValueType(0), Attr))
23744 return SDValue(N,0); // Lower SDIV as SDIV
23745
23746 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23747 "Unexpected divisor!");
23748
23749 // Only perform this transform if CMOV is supported otherwise the select
23750 // below will become a branch.
23751 if (!Subtarget.canUseCMOV())
23752 return SDValue();
23753
23754 // fold (sdiv X, pow2)
23755 EVT VT = N->getValueType(0);
23756 // FIXME: Support i8.
23757 if (VT != MVT::i16 && VT != MVT::i32 &&
23758 !(Subtarget.is64Bit() && VT == MVT::i64))
23759 return SDValue();
23760
23761 // If the divisor is 2 or -2, the default expansion is better.
23762 if (Divisor == 2 ||
23763 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23764 return SDValue();
23765
23766 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23767}
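// Illustrative shape of the CMOV-based expansion requested above, for
// 'sdiv i32 %x, 8' (editorial sketch):
//   %t   = add %x, 7                 ; bias needed only for negative %x
//   %t   = select (%x >= 0), %x, %t  ; lowered to CMOV, not a branch
//   %res = ashr %t, 3
// A negated power-of-two divisor additionally negates the result.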
23768
23769/// Result of 'and' is compared against zero. Change to a BT node if possible.
23770/// Returns the BT node and the condition code needed to use it.
23771static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23772 SelectionDAG &DAG, X86::CondCode &X86CC) {
23773 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23774 SDValue Op0 = And.getOperand(0);
23775 SDValue Op1 = And.getOperand(1);
23776 if (Op0.getOpcode() == ISD::TRUNCATE)
23777 Op0 = Op0.getOperand(0);
23778 if (Op1.getOpcode() == ISD::TRUNCATE)
23779 Op1 = Op1.getOperand(0);
23780
23781 SDValue Src, BitNo;
23782 if (Op1.getOpcode() == ISD::SHL)
23783 std::swap(Op0, Op1);
23784 if (Op0.getOpcode() == ISD::SHL) {
23785 if (isOneConstant(Op0.getOperand(0))) {
23786 // If we looked past a truncate, check that it's only truncating away
23787 // known zeros.
23788 unsigned BitWidth = Op0.getValueSizeInBits();
23789 unsigned AndBitWidth = And.getValueSizeInBits();
23790 if (BitWidth > AndBitWidth) {
23791 KnownBits Known = DAG.computeKnownBits(Op0);
23792 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23793 return SDValue();
23794 }
23795 Src = Op1;
23796 BitNo = Op0.getOperand(1);
23797 }
23798 } else if (Op1.getOpcode() == ISD::Constant) {
23799 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23800 uint64_t AndRHSVal = AndRHS->getZExtValue();
23801 SDValue AndLHS = Op0;
23802
23803 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23804 Src = AndLHS.getOperand(0);
23805 BitNo = AndLHS.getOperand(1);
23806 } else {
23807 // Use BT if the immediate can't be encoded in a TEST instruction or we
23808 // are optimizing for size and the immediate won't fit in a byte.
23809 bool OptForSize = DAG.shouldOptForSize();
23810 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23811 isPowerOf2_64(AndRHSVal)) {
23812 Src = AndLHS;
23813 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23814 Src.getValueType());
23815 }
23816 }
23817 }
23818
23819 // No patterns found, give up.
23820 if (!Src.getNode())
23821 return SDValue();
23822
23823 // Remove any bit flip.
23824 if (isBitwiseNot(Src)) {
23825 Src = Src.getOperand(0);
23826 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23827 }
23828
23829 // Attempt to create the X86ISD::BT node.
23830 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23831 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23832 return BT;
23833 }
23834
23835 return SDValue();
23836}
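// Example: both '(X & (1 << N)) == 0' and '((X >> N) & 1) == 0' reach here as
// BT X, N; SETEQ maps to COND_AE (CF == 0) and SETNE to COND_B (CF == 1).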
23837
23838// Check if pre-AVX condcode can be performed by a single FCMP op.
23839static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23840 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23841}
23842
23843/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23844/// CMPs.
23845static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23846 SDValue &Op1, bool &IsAlwaysSignaling) {
23847 unsigned SSECC;
23848 bool Swap = false;
23849
23850 // SSE Condition code mapping:
23851 // 0 - EQ
23852 // 1 - LT
23853 // 2 - LE
23854 // 3 - UNORD
23855 // 4 - NEQ
23856 // 5 - NLT
23857 // 6 - NLE
23858 // 7 - ORD
23859 switch (SetCCOpcode) {
23860 // clang-format off
23861 default: llvm_unreachable("Unexpected SETCC condition");
23862 case ISD::SETOEQ:
23863 case ISD::SETEQ: SSECC = 0; break;
23864 case ISD::SETOGT:
23865 case ISD::SETGT: Swap = true; [[fallthrough]];
23866 case ISD::SETLT:
23867 case ISD::SETOLT: SSECC = 1; break;
23868 case ISD::SETOGE:
23869 case ISD::SETGE: Swap = true; [[fallthrough]];
23870 case ISD::SETLE:
23871 case ISD::SETOLE: SSECC = 2; break;
23872 case ISD::SETUO: SSECC = 3; break;
23873 case ISD::SETUNE:
23874 case ISD::SETNE: SSECC = 4; break;
23875 case ISD::SETULE: Swap = true; [[fallthrough]];
23876 case ISD::SETUGE: SSECC = 5; break;
23877 case ISD::SETULT: Swap = true; [[fallthrough]];
23878 case ISD::SETUGT: SSECC = 6; break;
23879 case ISD::SETO: SSECC = 7; break;
23880 case ISD::SETUEQ: SSECC = 8; break;
23881 case ISD::SETONE: SSECC = 12; break;
23882 // clang-format on
23883 }
23884 if (Swap)
23885 std::swap(Op0, Op1);
23886
23887 switch (SetCCOpcode) {
23888 default:
23889 IsAlwaysSignaling = true;
23890 break;
23891 case ISD::SETEQ:
23892 case ISD::SETOEQ:
23893 case ISD::SETUEQ:
23894 case ISD::SETNE:
23895 case ISD::SETONE:
23896 case ISD::SETUNE:
23897 case ISD::SETO:
23898 case ISD::SETUO:
23899 IsAlwaysSignaling = false;
23900 break;
23901 }
23902
23903 return SSECC;
23904}
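// Example: SETGT swaps the operands and returns immediate 1 (LT), so
// 'a > b' becomes CMPLTPS b, a. SETUEQ/SETONE return 8/12, which only exist
// as AVX immediates; the SSE path below splits them into two compares.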
23905
23906/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23907/// concatenate the result back.
23908static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23909 SelectionDAG &DAG, const SDLoc &dl) {
23910 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23911 "Unsupported VTs!");
23912 SDValue CC = DAG.getCondCode(Cond);
23913
23914 // Extract the LHS Lo/Hi vectors
23915 SDValue LHS1, LHS2;
23916 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23917
23918 // Extract the RHS Lo/Hi vectors
23919 SDValue RHS1, RHS2;
23920 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23921
23922 // Issue the operation on the smaller types and concatenate the result back
23923 EVT LoVT, HiVT;
23924 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23925 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23926 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23927 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23928}
23929
23930static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23931 SelectionDAG &DAG) {
23932 SDValue Op0 = Op.getOperand(0);
23933 SDValue Op1 = Op.getOperand(1);
23934 SDValue CC = Op.getOperand(2);
23935 MVT VT = Op.getSimpleValueType();
23936 assert(VT.getVectorElementType() == MVT::i1 &&
23937 "Cannot set masked compare for this operation");
23938
23939 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23940
23941 // Prefer SETGT over SETLT.
23942 if (SetCCOpcode == ISD::SETLT) {
23943 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23944 std::swap(Op0, Op1);
23945 }
23946
23947 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23948}
23949
23950/// Given a buildvector constant, return a new vector constant with each element
23951/// incremented or decremented. If incrementing or decrementing would result in
23952/// unsigned overflow or underflow or this is not a simple vector constant,
23953/// return an empty value.
23954static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23955 bool NSW) {
23956 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23957 if (!BV || !V.getValueType().isSimple())
23958 return SDValue();
23959
23960 MVT VT = V.getSimpleValueType();
23961 MVT EltVT = VT.getVectorElementType();
23962 unsigned NumElts = VT.getVectorNumElements();
23963 SmallVector<SDValue, 8> NewVecC;
23964 SDLoc DL(V);
23965 for (unsigned i = 0; i < NumElts; ++i) {
23966 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23967 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23968 return SDValue();
23969
23970 // Avoid overflow/underflow.
23971 const APInt &EltC = Elt->getAPIntValue();
23972 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23973 return SDValue();
23974 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23975 (!IsInc && EltC.isMinSignedValue())))
23976 return SDValue();
23977
23978 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23979 }
23980
23981 return DAG.getBuildVector(VT, DL, NewVecC);
23982}
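// Example: <4 x i8> <0, 1, 2, 3> incremented yields <1, 2, 3, 4>, while
// <0, 1, 2, 255> returns an empty SDValue because the 255 lane would wrap.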
23983
23984/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23985/// Op0 u<= Op1:
23986/// t = psubus Op0, Op1
23987/// pcmpeq t, <0..0>
23988static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23989 ISD::CondCode Cond, const SDLoc &dl,
23990 const X86Subtarget &Subtarget,
23991 SelectionDAG &DAG) {
23992 if (!Subtarget.hasSSE2())
23993 return SDValue();
23994
23995 MVT VET = VT.getVectorElementType();
23996 if (VET != MVT::i8 && VET != MVT::i16)
23997 return SDValue();
23998
23999 switch (Cond) {
24000 default:
24001 return SDValue();
24002 case ISD::SETULT: {
24003 // If the comparison is against a constant we can turn this into a
24004 // setule. With psubus, setule does not require a swap. This is
24005 // beneficial because the constant in the register is no longer
24006 // clobbered as the destination so it can be hoisted out of a loop.
24007 // Only do this pre-AVX since vpcmp* is no longer destructive.
24008 if (Subtarget.hasAVX())
24009 return SDValue();
24010 SDValue ULEOp1 =
24011 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24012 if (!ULEOp1)
24013 return SDValue();
24014 Op1 = ULEOp1;
24015 break;
24016 }
24017 case ISD::SETUGT: {
24018 // If the comparison is against a constant, we can turn this into a setuge.
24019 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24020 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24021 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24022 SDValue UGEOp1 =
24023 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24024 if (!UGEOp1)
24025 return SDValue();
24026 Op1 = Op0;
24027 Op0 = UGEOp1;
24028 break;
24029 }
24030 // Psubus is better than flip-sign because it requires no inversion.
24031 case ISD::SETUGE:
24032 std::swap(Op0, Op1);
24033 break;
24034 case ISD::SETULE:
24035 break;
24036 }
24037
24038 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24039 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24040 DAG.getConstant(0, dl, VT));
24041}
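// Example (SETULT, pre-AVX): 'X u< 10' is rewritten above as 'X u<= 9', i.e.
//   PCMPEQ (USUBSAT X, 9), 0
// which avoids an operand swap and keeps the constant loop-invariant.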
24042
24043static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24044 SelectionDAG &DAG) {
24045 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24046 Op.getOpcode() == ISD::STRICT_FSETCCS;
24047 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24048 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24049 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24050 MVT VT = Op->getSimpleValueType(0);
24051 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24052 MVT OpVT = Op0.getSimpleValueType();
24053 SDLoc dl(Op);
24054
24055 if (OpVT.isFloatingPoint()) {
24056 MVT EltVT = OpVT.getVectorElementType();
24057 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24058 EltVT == MVT::f64);
24059
24060 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24061 if (isSoftF16(EltVT, Subtarget)) {
24062 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24063 return SDValue();
24064
24065 // Break 256-bit FP vector compare into smaller ones.
24066 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24067 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24068
24069 // Break 512-bit FP vector compare into smaller ones.
24070 if (OpVT.is512BitVector())
24071 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24072
24073 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24074 if (IsStrict) {
24075 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24076 {Chain, Op0});
24077 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24078 {Chain, Op1});
24079 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24080 {Chain, Op0, Op1, CC});
24081 }
24082 MVT DVT = VT.getVectorElementType() == MVT::i16
24083 ? VT.changeVectorElementType(MVT::i32)
24084 : VT;
24085 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24086 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24087 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24088 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24089 }
24090
24091 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24092
24093 // If we have a strict compare with a vXi1 result and the input is 128/256
24094 // bits we can't use a masked compare unless we have VLX. If we use a wider
24095 // compare like we do for non-strict, we might trigger spurious exceptions
24096 // from the upper elements. Instead emit a AVX compare and convert to mask.
24097 unsigned Opc;
24098 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24099 (!IsStrict || Subtarget.hasVLX() ||
24100 Op0.getSimpleValueType().is512BitVector())) {
24101#ifndef NDEBUG
24102 unsigned Num = VT.getVectorNumElements();
24103 assert(Num <= 16 ||
24104 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24105#endif
24106 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24107 } else {
24108 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24109 // The SSE/AVX packed FP comparison nodes are defined with a
24110 // floating-point vector result that matches the operand type. This allows
24111 // them to work with an SSE1 target (integer vector types are not legal).
24112 VT = Op0.getSimpleValueType();
24113 }
24114
24115 SDValue Cmp;
24116 bool IsAlwaysSignaling;
24117 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24118 if (!Subtarget.hasAVX()) {
24119 // TODO: We could use following steps to handle a quiet compare with
24120 // signaling encodings.
24121 // 1. Get ordered masks from a quiet ISD::SETO
24122 // 2. Use the masks to mask potential unordered elements in operand A, B
24123 // 3. Get the compare results of masked A, B
24124 // 4. Calculating final result using the mask and result from 3
24125 // But currently, we just fall back to scalar operations.
24126 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24127 return SDValue();
24128
24129 // Insert an extra signaling instruction to raise exception.
24130 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24131 SDValue SignalCmp = DAG.getNode(
24132 Opc, dl, {VT, MVT::Other},
24133 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24134 // FIXME: It seems we need to update the flags of all new strict nodes.
24135 // Otherwise, mayRaiseFPException in MI will return false due to
24136 // NoFPExcept = false by default. However, I didn't find it in other
24137 // patches.
24138 SignalCmp->setFlags(Op->getFlags());
24139 Chain = SignalCmp.getValue(1);
24140 }
24141
24142 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24143 // emit two comparisons and a logic op to tie them together.
24144 if (!cheapX86FSETCC_SSE(Cond)) {
24145 // LLVM predicate is SETUEQ or SETONE.
24146 unsigned CC0, CC1;
24147 unsigned CombineOpc;
24148 if (Cond == ISD::SETUEQ) {
24149 CC0 = 3; // UNORD
24150 CC1 = 0; // EQ
24151 CombineOpc = X86ISD::FOR;
24152 } else {
24153 assert(Cond == ISD::SETONE);
24154 CC0 = 7; // ORD
24155 CC1 = 4; // NEQ
24156 CombineOpc = X86ISD::FAND;
24157 }
24158
24159 SDValue Cmp0, Cmp1;
24160 if (IsStrict) {
24161 Cmp0 = DAG.getNode(
24162 Opc, dl, {VT, MVT::Other},
24163 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24164 Cmp1 = DAG.getNode(
24165 Opc, dl, {VT, MVT::Other},
24166 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24167 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24168 Cmp1.getValue(1));
24169 } else {
24170 Cmp0 = DAG.getNode(
24171 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24172 Cmp1 = DAG.getNode(
24173 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24174 }
24175 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24176 } else {
24177 if (IsStrict) {
24178 Cmp = DAG.getNode(
24179 Opc, dl, {VT, MVT::Other},
24180 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24181 Chain = Cmp.getValue(1);
24182 } else
24183 Cmp = DAG.getNode(
24184 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24185 }
24186 } else {
24187 // Handle all other FP comparisons here.
24188 if (IsStrict) {
24189 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24190 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24191 Cmp = DAG.getNode(
24192 Opc, dl, {VT, MVT::Other},
24193 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24194 Chain = Cmp.getValue(1);
24195 } else
24196 Cmp = DAG.getNode(
24197 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24198 }
24199
24200 if (VT.getFixedSizeInBits() >
24201 Op.getSimpleValueType().getFixedSizeInBits()) {
24202 // We emitted a compare with an XMM/YMM result. Finish converting to a
24203 // mask register using a vptestm.
24204 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24205 Cmp = DAG.getBitcast(CastVT, Cmp);
24206 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24207 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24208 } else {
24209 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24210 // the result type of SETCC. The bitcast is expected to be optimized
24211 // away during combining/isel.
24212 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24213 }
24214
24215 if (IsStrict)
24216 return DAG.getMergeValues({Cmp, Chain}, dl);
24217
24218 return Cmp;
24219 }
24220
24221 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24222
24223 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24224 assert(VTOp0 == Op1.getSimpleValueType() &&
24225 "Expected operands with same type!");
24227 "Invalid number of packed elements for source and destination!");
24228
24229 // The non-AVX512 code below works under the assumption that source and
24230 // destination types are the same.
24231 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24232 "Value types for source and destination must be the same!");
24233
24234 // The result is boolean, but operands are int/float
24235 if (VT.getVectorElementType() == MVT::i1) {
24236 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24237 // but there is no compare instruction for i8 and i16 elements in KNL.
24238 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24239 "Unexpected operand type");
24240 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24241 }
24242
24243 // Lower using XOP integer comparisons.
24244 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24245 // Translate compare code to XOP PCOM compare mode.
24246 unsigned CmpMode = 0;
24247 switch (Cond) {
24248 // clang-format off
24249 default: llvm_unreachable("Unexpected SETCC condition");
24250 case ISD::SETULT:
24251 case ISD::SETLT: CmpMode = 0x00; break;
24252 case ISD::SETULE:
24253 case ISD::SETLE: CmpMode = 0x01; break;
24254 case ISD::SETUGT:
24255 case ISD::SETGT: CmpMode = 0x02; break;
24256 case ISD::SETUGE:
24257 case ISD::SETGE: CmpMode = 0x03; break;
24258 case ISD::SETEQ: CmpMode = 0x04; break;
24259 case ISD::SETNE: CmpMode = 0x05; break;
24260 // clang-format on
24261 }
24262
24263 // Are we comparing unsigned or signed integers?
24264 unsigned Opc =
24265 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24266
24267 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24268 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24269 }
24270
24271 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24272 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24273 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24274 SDValue BC0 = peekThroughBitcasts(Op0);
24275 if (BC0.getOpcode() == ISD::AND &&
24276 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24277 /*AllowUndefs=*/false)) {
24278 Cond = ISD::SETEQ;
24279 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24280 }
24281 }
24282
24283 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
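// e.g. with i32 elements and C == 16: SHL by 27 moves bit 4 into the sign bit
// and SRA by 31 splats it, giving all-ones exactly when (X & 16) == 16.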
24284 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24285 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24286 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24287 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24288 unsigned BitWidth = VT.getScalarSizeInBits();
24289 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24290
24291 SDValue Result = Op0.getOperand(0);
24292 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24293 DAG.getConstant(ShiftAmt, dl, VT));
24294 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24295 DAG.getConstant(BitWidth - 1, dl, VT));
24296 return Result;
24297 }
24298 }
24299
24300 // Break 256-bit integer vector compare into smaller ones.
24301 if (VT.is256BitVector() && !Subtarget.hasInt256())
24302 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24303
24304 // Break 512-bit integer vector compare into smaller ones.
24305 // TODO: Try harder to use VPCMPx + VPMOV2x?
24306 if (VT.is512BitVector())
24307 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24308
24309 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24310 // not-of-PCMPEQ:
24311 // X != INT_MIN --> X >s INT_MIN
24312 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24313 // +X != 0 --> +X >s 0
24314 APInt ConstValue;
24315 if (Cond == ISD::SETNE &&
24316 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24317 if (ConstValue.isMinSignedValue())
24318 Cond = ISD::SETGT;
24319 else if (ConstValue.isMaxSignedValue())
24320 Cond = ISD::SETLT;
24321 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24322 Cond = ISD::SETGT;
24323 }
24324
24325 // If both operands are known non-negative, then an unsigned compare is the
24326 // same as a signed compare and there's no need to flip signbits.
24327 // TODO: We could check for more general simplifications here since we're
24328 // computing known bits.
24329 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24330 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24331
24332 // Special case: Use min/max operations for unsigned compares.
24333 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24334 if (ISD::isUnsignedIntSetCC(Cond) &&
24335 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24336 TLI.isOperationLegal(ISD::UMIN, VT)) {
24337 // If we have a constant operand, increment/decrement it and change the
24338 // condition to avoid an invert.
24339 if (Cond == ISD::SETUGT) {
24340 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24341 if (SDValue UGTOp1 =
24342 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24343 Op1 = UGTOp1;
24344 Cond = ISD::SETUGE;
24345 }
24346 }
24347 if (Cond == ISD::SETULT) {
24348 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24349 if (SDValue ULTOp1 =
24350 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24351 Op1 = ULTOp1;
24352 Cond = ISD::SETULE;
24353 }
24354 }
24355 bool Invert = false;
24356 unsigned Opc;
24357 switch (Cond) {
24358 // clang-format off
24359 default: llvm_unreachable("Unexpected condition code");
24360 case ISD::SETUGT: Invert = true; [[fallthrough]];
24361 case ISD::SETULE: Opc = ISD::UMIN; break;
24362 case ISD::SETULT: Invert = true; [[fallthrough]];
24363 case ISD::SETUGE: Opc = ISD::UMAX; break;
24364 // clang-format on
24365 }
24366
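// x ule y <=> umin(x, y) == x and x uge y <=> umax(x, y) == x; the PCMPEQ
// below materializes that equality as the vector compare mask.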
24367 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24368 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24369
24370 // If the logical-not of the result is required, perform that now.
24371 if (Invert)
24372 Result = DAG.getNOT(dl, Result, VT);
24373
24374 return Result;
24375 }
24376
24377 // Try to use SUBUS and PCMPEQ.
24378 if (FlipSigns)
24379 if (SDValue V =
24380 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24381 return V;
24382
24383 // We are handling one of the integer comparisons here. Since SSE only has
24384 // GT and EQ comparisons for integer, swapping operands and multiple
24385 // operations may be required for some comparisons.
24386 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24387 : X86ISD::PCMPGT;
24388 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24389 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24390 bool Invert = Cond == ISD::SETNE ||
24391 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24392
24393 if (Swap)
24394 std::swap(Op0, Op1);
24395
24396 // Check that the operation in question is available (most are plain SSE2,
24397 // but PCMPGTQ and PCMPEQQ have different requirements).
24398 if (VT == MVT::v2i64) {
24399 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24400 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24401
24402 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24403 // the odd elements over the even elements.
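// The sign of each i64 lane lives entirely in its upper dword, so a v4i32
// PCMPGT against zero followed by broadcasting the odd (high) dword results
// across each 64-bit lane is equivalent.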
24404 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24405 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24406 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24407
24408 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24409 static const int MaskHi[] = { 1, 1, 3, 3 };
24410 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24411
24412 return DAG.getBitcast(VT, Result);
24413 }
24414
24415 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24416 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24417 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24418
24419 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24420 static const int MaskHi[] = { 1, 1, 3, 3 };
24421 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24422
24423 return DAG.getBitcast(VT, Result);
24424 }
24425
24426 // If the i64 elements are sign-extended enough to be representable as i32
24427 // then we can compare the lower i32 bits and splat.
24428 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24429 DAG.ComputeNumSignBits(Op1) > 32) {
24430 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24431 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24432
24433 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24434 static const int MaskLo[] = {0, 0, 2, 2};
24435 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24436
24437 return DAG.getBitcast(VT, Result);
24438 }
24439
24440 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24441 // bits of the inputs before performing those operations. The lower
24442 // compare is always unsigned.
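// The low dwords are always biased by 0x80000000 so the signed PCMPGT can do
// their unsigned compare; with FlipSigns the high dwords are biased as well,
// turning the unsigned 64-bit compare into the signed form PCMPGT handles.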
24443 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24444 : 0x0000000080000000ULL,
24445 dl, MVT::v2i64);
24446
24447 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24448 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24449
24450 // Cast everything to the right type.
24451 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24452 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24453
24454 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24455 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24456 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24457
24458 // Create masks for only the low parts/high parts of the 64 bit integers.
24459 static const int MaskHi[] = { 1, 1, 3, 3 };
24460 static const int MaskLo[] = { 0, 0, 2, 2 };
24461 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24462 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24463 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24464
24465 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24466 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24467
24468 if (Invert)
24469 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24470
24471 return DAG.getBitcast(VT, Result);
24472 }
24473
24474 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24475 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24476 // pcmpeqd + pshufd + pand.
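// Each 64-bit lane is equal iff both of its 32-bit halves compared equal,
// hence the AND of the dword compare with its half-swapped shuffle below.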
24477 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24478
24479 // First cast everything to the right type.
24480 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24481 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24482
24483 // Do the compare.
24484 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24485
24486 // Make sure the lower and upper halves are both all-ones.
24487 static const int Mask[] = { 1, 0, 3, 2 };
24488 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24489 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24490
24491 if (Invert)
24492 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24493
24494 return DAG.getBitcast(VT, Result);
24495 }
24496 }
24497
24498 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24499 // bits of the inputs before performing those operations.
24500 if (FlipSigns) {
24501 MVT EltVT = VT.getVectorElementType();
24502 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24503 VT);
24504 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24505 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24506 }
24507
24508 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24509
24510 // If the logical-not of the result is required, perform that now.
24511 if (Invert)
24512 Result = DAG.getNOT(dl, Result, VT);
24513
24514 return Result;
24515}
24516
24517 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24518 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24519 const SDLoc &dl, SelectionDAG &DAG,
24520 const X86Subtarget &Subtarget,
24521 SDValue &X86CC) {
24522 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24523
24524 // Must be a bitcast from vXi1.
24525 if (Op0.getOpcode() != ISD::BITCAST)
24526 return SDValue();
24527
24528 Op0 = Op0.getOperand(0);
24529 MVT VT = Op0.getSimpleValueType();
24530 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24531 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24532 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24533 return SDValue();
24534
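// KORTEST sets ZF when the OR of the two masks is zero and CF when it is all
// ones, so a compare against 0 maps to E/NE and one against -1 to B/AE.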
24535 X86::CondCode X86Cond;
24536 if (isNullConstant(Op1)) {
24537 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24538 } else if (isAllOnesConstant(Op1)) {
24539 // C flag is set for all ones.
24540 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24541 } else
24542 return SDValue();
24543
24544 // If the input is an AND, we can combine its operands into the KTEST.
24545 bool KTestable = false;
24546 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24547 KTestable = true;
24548 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24549 KTestable = true;
24550 if (!isNullConstant(Op1))
24551 KTestable = false;
24552 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24553 SDValue LHS = Op0.getOperand(0);
24554 SDValue RHS = Op0.getOperand(1);
24555 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24556 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24557 }
24558
24559 // If the input is an OR, we can combine its operands into the KORTEST.
24560 SDValue LHS = Op0;
24561 SDValue RHS = Op0;
24562 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24563 LHS = Op0.getOperand(0);
24564 RHS = Op0.getOperand(1);
24565 }
24566
24567 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24568 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24569}
24570
24571/// Emit flags for the given setcc condition and operands. Also returns the
24572/// corresponding X86 condition code constant in X86CC.
24573SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24574 ISD::CondCode CC, const SDLoc &dl,
24575 SelectionDAG &DAG,
24576 SDValue &X86CC) const {
24577 // Equality Combines.
24578 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24579 X86::CondCode X86CondCode;
24580
24581 // Optimize to BT if possible.
24582 // Lower (X & (1 << N)) == 0 to BT(X, N).
24583 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24584 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24585 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24586 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24587 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24588 return BT;
24589 }
24590 }
24591
24592 // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
24593 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24594 X86CondCode)) {
24595 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24596 return CmpZ;
24597 }
24598
24599 // Try to lower using KORTEST or KTEST.
24600 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24601 return Test;
24602
24603 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24604 // of these.
24605 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24606 // If the input is a setcc, then reuse the input setcc or use a new one
24607 // with the inverted condition.
24608 if (Op0.getOpcode() == X86ISD::SETCC) {
24609 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24610
24611 X86CC = Op0.getOperand(0);
24612 if (Invert) {
24613 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24614 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24615 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24616 }
24617
24618 return Op0.getOperand(1);
24619 }
24620 }
24621
24622 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24623 // overflow.
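// (0 - X) overflows precisely when X == INT_MIN, so the overflow flag of the
// NEG encodes the equality/inequality directly.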
24624 if (isMinSignedConstant(Op1)) {
24625 EVT VT = Op0.getValueType();
24626 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24627 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24628 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24629 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24630 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24631 DAG.getConstant(0, dl, VT), Op0);
24632 return SDValue(Neg.getNode(), 1);
24633 }
24634 }
24635
24636 // Try to use the carry flag from the add in place of a separate CMP for:
24637 // (seteq (add X, -1), -1). Similar for setne.
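// ADD X, -1 produces a carry exactly when X != 0, and (X + -1) == -1 only when
// X == 0, so SETEQ maps to COND_AE (no carry) and SETNE maps to COND_B.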
24638 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24639 Op0.getOperand(1) == Op1) {
24640 if (isProfitableToUseFlagOp(Op0)) {
24641 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24642
24643 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24644 Op0.getOperand(1));
24645 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24646 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24647 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24648 return SDValue(New.getNode(), 1);
24649 }
24650 }
24651 }
24652
24654 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24655 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24656
24657 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24658 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24659 return EFLAGS;
24660}
24661
24662SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24663
24664 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24665 Op.getOpcode() == ISD::STRICT_FSETCCS;
24666 MVT VT = Op->getSimpleValueType(0);
24667
24668 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24669
24670 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24671 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24672 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24673 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24674 SDLoc dl(Op);
24675 ISD::CondCode CC =
24676 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24677
24678 if (isSoftF16(Op0.getValueType(), Subtarget))
24679 return SDValue();
24680
24681 // Handle f128 first, since one possible outcome is a normal integer
24682 // comparison which gets handled by emitFlagsForSetcc.
24683 if (Op0.getValueType() == MVT::f128) {
24684 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24685 Op.getOpcode() == ISD::STRICT_FSETCCS);
24686
24687 // If softenSetCCOperands returned a scalar, use it.
24688 if (!Op1.getNode()) {
24689 assert(Op0.getValueType() == Op.getValueType() &&
24690 "Unexpected setcc expansion!");
24691 if (IsStrict)
24692 return DAG.getMergeValues({Op0, Chain}, dl);
24693 return Op0;
24694 }
24695 }
24696
24697 if (Op0.getSimpleValueType().isInteger()) {
24698 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24699 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24700 // this may translate to less uops depending on uarch implementation. The
24701 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24702 // canonicalize to that CondCode.
24703 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24704 // encoding size - so it must either already be an i8 or i32 immediate, or it
24705 // shrinks down to that. We don't do this for any i64's to avoid additional
24706 // constant materializations.
24707 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24708 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24709 const APInt &Op1Val = Op1C->getAPIntValue();
24710 if (!Op1Val.isZero()) {
24711 // Ensure the constant+1 doesn't overflow.
24712 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24713 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24714 APInt Op1ValPlusOne = Op1Val + 1;
24715 if (Op1ValPlusOne.isSignedIntN(32) &&
24716 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24717 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24718 CC = (CC == ISD::CondCode::SETGT) ? ISD::CondCode::SETGE
24719 : ISD::CondCode::SETUGE;
24720 }
24721 }
24722 }
24723 }
24724
24725 SDValue X86CC;
24726 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24727 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24728 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24729 }
24730
24731 if (Subtarget.hasAVX10_2()) {
24732 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24733 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24734 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24735 if (Op0.getSimpleValueType() != MVT::f80) {
24736 SDValue Res = getSETCC(
24737 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24738 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24739 }
24740 }
24741 }
24742 // Handle floating point.
24743 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24744 if (CondCode == X86::COND_INVALID)
24745 return SDValue();
24746
24747 SDValue EFLAGS;
24748 if (IsStrict) {
24749 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24750 EFLAGS =
24751 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24752 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24753 Chain = EFLAGS.getValue(1);
24754 } else {
24755 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24756 }
24757
24758 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24759 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24760 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24761}
24762
24763SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24764 SDValue LHS = Op.getOperand(0);
24765 SDValue RHS = Op.getOperand(1);
24766 SDValue Carry = Op.getOperand(2);
24767 SDValue Cond = Op.getOperand(3);
24768 SDLoc DL(Op);
24769
24770 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24771 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24772
24773 // Recreate the carry if needed.
24774 EVT CarryVT = Carry.getValueType();
24775 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24776 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24777
24778 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24779 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24780 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24781}
24782
24783// This function returns three things: the arithmetic computation itself
24784// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24785// flag and the condition code define the case in which the arithmetic
24786// computation overflows.
24787 static std::pair<SDValue, SDValue>
24788 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24789 assert(Op.getResNo() == 0 && "Unexpected result number!");
24790 SDValue Value, Overflow;
24791 SDValue LHS = Op.getOperand(0);
24792 SDValue RHS = Op.getOperand(1);
24793 unsigned BaseOp = 0;
24794 SDLoc DL(Op);
24795 switch (Op.getOpcode()) {
24796 default: llvm_unreachable("Unknown ovf instruction!");
24797 case ISD::SADDO:
24798 BaseOp = X86ISD::ADD;
24799 Cond = X86::COND_O;
24800 break;
24801 case ISD::UADDO:
24802 BaseOp = X86ISD::ADD;
24803 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24804 break;
24805 case ISD::SSUBO:
24806 BaseOp = X86ISD::SUB;
24807 Cond = X86::COND_O;
24808 break;
24809 case ISD::USUBO:
24810 BaseOp = X86ISD::SUB;
24811 Cond = X86::COND_B;
24812 break;
24813 case ISD::SMULO:
24814 BaseOp = X86ISD::SMUL;
24815 Cond = X86::COND_O;
24816 break;
24817 case ISD::UMULO:
24818 BaseOp = X86ISD::UMUL;
24819 Cond = X86::COND_O;
24820 break;
24821 }
24822
24823 if (BaseOp) {
24824 // Also sets EFLAGS.
24825 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24826 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24827 Overflow = Value.getValue(1);
24828 }
24829
24830 return std::make_pair(Value, Overflow);
24831}
24832
24833 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24834 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24835 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24836 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24837 // has only one use.
24838 SDLoc DL(Op);
24839 X86::CondCode Cond;
24840 SDValue Value, Overflow;
24841 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24842
24843 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24844 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24845 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24846}
24847
24848 /// Return true if the opcode is an X86 logical comparison.
24849 static bool isX86LogicalCmp(SDValue Op) {
24850 unsigned Opc = Op.getOpcode();
24851 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24852 Opc == X86ISD::FCMP)
24853 return true;
24854 if (Op.getResNo() == 1 &&
24855 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24856 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24857 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24858 return true;
24859
24860 return false;
24861}
24862
24863 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24864 if (V.getOpcode() != ISD::TRUNCATE)
24865 return false;
24866
24867 SDValue VOp0 = V.getOperand(0);
24868 unsigned InBits = VOp0.getValueSizeInBits();
24869 unsigned Bits = V.getValueSizeInBits();
24870 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24871}
24872
24873 // Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24874 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24875 unsigned X86CC, const SDLoc &DL,
24876 SelectionDAG &DAG,
24877 const X86Subtarget &Subtarget) {
24878 EVT CmpVT = CmpVal.getValueType();
24879 EVT VT = LHS.getValueType();
24880 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24881 return SDValue();
24882
24883 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24884 isOneConstant(CmpVal.getOperand(1))) {
24885 auto SplatLSB = [&](EVT SplatVT) {
24886 // We need a mask of all zeros or all ones with the same size as the other
24887 // operands.
24888 SDValue Neg = CmpVal;
24889 if (CmpVT.bitsGT(SplatVT))
24890 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24891 else if (CmpVT.bitsLT(SplatVT))
24892 Neg = DAG.getNode(
24893 ISD::AND, DL, SplatVT,
24894 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24895 DAG.getConstant(1, DL, SplatVT));
24896 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24897 };
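// SplatLSB produces 0 when the tested bit of CmpVal is clear and all-ones when
// it is set; the folds below use it as an all-zeros/all-ones select mask.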
24898
24899 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24900 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24901 return SplatLSB(VT);
24902
24903 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24904 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24905 isa<ConstantSDNode>(RHS)) {
24906 SDValue Mask = SplatLSB(VT);
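// With Mask being that all-zeros/all-ones value, LHS ^ (Mask & (LHS ^ RHS))
// yields LHS (C1) when the bit is clear and C1 ^ (C1 ^ C2) == C2 when it is set.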
24907 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24908 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24909 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24910 }
24911
24912 SDValue Src1, Src2;
24913 auto isIdentityPatternZero = [&]() {
24914 switch (RHS.getOpcode()) {
24915 default:
24916 break;
24917 case ISD::OR:
24918 case ISD::XOR:
24919 case ISD::ADD:
24920 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24921 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24922 Src2 = LHS;
24923 return true;
24924 }
24925 break;
24926 case ISD::SHL:
24927 case ISD::SRA:
24928 case ISD::SRL:
24929 case ISD::SUB:
24930 if (RHS.getOperand(0) == LHS) {
24931 Src1 = RHS.getOperand(1);
24932 Src2 = LHS;
24933 return true;
24934 }
24935 break;
24936 }
24937 return false;
24938 };
24939
24940 auto isIdentityPatternOnes = [&]() {
24941 switch (LHS.getOpcode()) {
24942 default:
24943 break;
24944 case ISD::AND:
24945 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24946 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24947 Src2 = RHS;
24948 return true;
24949 }
24950 break;
24951 }
24952 return false;
24953 };
24954
24955 // Convert 'identity' patterns (iff X is 0 or 1):
24956 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24960 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24961 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24962 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24963 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24964 SDValue Mask = SplatLSB(Src1.getValueType());
24965 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24966 Src1); // Mask & z
24967 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24968 }
24969 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24970 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24971 SDValue Mask = SplatLSB(VT);
24972 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24973 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24974 }
24975 }
24976
24977 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24978 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24979 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24980 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24981
24982 // 'X - 1' sets the carry flag if X == 0.
24983 // '0 - X' sets the carry flag if X != 0.
24984 // Convert the carry flag to a -1/0 mask with sbb:
24985 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24986 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24987 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24988 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24989 SDValue Sub;
24990 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24991 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24992 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24993 } else {
24994 SDValue One = DAG.getConstant(1, DL, CmpVT);
24995 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24996 }
24997 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24998 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24999 Sub.getValue(1));
25000 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
25001 }
25002
25003 return SDValue();
25004}
25005
25006SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25007 bool AddTest = true;
25008 SDValue Cond = Op.getOperand(0);
25009 SDValue Op1 = Op.getOperand(1);
25010 SDValue Op2 = Op.getOperand(2);
25011 SDLoc DL(Op);
25012 MVT VT = Op1.getSimpleValueType();
25013 SDValue CC;
25014
25015 if (isSoftF16(VT, Subtarget)) {
25016 MVT NVT = VT.changeTypeToInteger();
25017 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25018 DAG.getBitcast(NVT, Op1),
25019 DAG.getBitcast(NVT, Op2)));
25020 }
25021
25022 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25023 // are available or VBLENDV if AVX is available.
25024 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25025 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25026 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25027 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25028 bool IsAlwaysSignaling;
25029 unsigned SSECC =
25030 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25031 CondOp0, CondOp1, IsAlwaysSignaling);
25032
25033 if (Subtarget.hasAVX512()) {
25034 SDValue Cmp =
25035 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25036 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25037 assert(!VT.isVector() && "Not a scalar type?");
25038 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25039 }
25040
25041 if (SSECC < 8 || Subtarget.hasAVX()) {
25042 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25043 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25044
25045 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25046 // instead of 3 logic instructions for size savings and potentially speed.
25047 // Unfortunately, there is no scalar form of VBLENDV.
25048 //
25049 // If either operand is a +0.0 constant, don't try this. We can expect to
25050 // optimize away at least one of the logic instructions later in that
25051 // case, so that sequence would be faster than a variable blend.
25052 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25053 !isNullFPConstant(Op2)) {
25054 // Convert to vectors, do a VSELECT, and convert back to scalar.
25055 // All of the conversions should be optimized away.
25056 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25057 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25058 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25059 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25060
25061 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25062 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25063
25064 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25065
25066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25067 DAG.getVectorIdxConstant(0, DL));
25068 }
25069 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25070 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25071 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25072 }
25073 }
25074
25075 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25076 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25077 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25078 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25079 }
25080
25081 if (Cond.getOpcode() == ISD::SETCC &&
25082 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25083 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25084 Cond = NewCond;
25085 // If the condition was updated, it's possible that the operands of the
25086 // select were also updated (for example, EmitTest has a RAUW). Refresh
25087 // the local references to the select operands in case they got stale.
25088 Op1 = Op.getOperand(1);
25089 Op2 = Op.getOperand(2);
25090 }
25091 }
25092
25093 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25094 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25095 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25096 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25097 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25098 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25099 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25100 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25101 if (Cond.getOpcode() == X86ISD::SETCC &&
25102 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25103 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25104 SDValue Cmp = Cond.getOperand(1);
25105 SDValue CmpOp0 = Cmp.getOperand(0);
25106 unsigned CondCode = Cond.getConstantOperandVal(0);
25107
25108 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25109 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25110 // handling to keep the CMP with 0. This should be removed by
25111 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25112 // cttz_zero_undef.
25113 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25114 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25115 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25116 };
25117 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25118 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25119 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25120 // Keep Cmp.
25121 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25122 DL, DAG, Subtarget)) {
25123 return R;
25124 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25125 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25126 ((CondCode == X86::COND_S) || // smin(x, 0)
25127 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25128 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25129 //
25130 // If the comparison is testing for a positive value, we have to invert
25131 // the sign bit mask, so only do that transform if the target has a
25132 // bitwise 'and not' instruction (the invert is free).
25133 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25134 unsigned ShCt = VT.getSizeInBits() - 1;
25135 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25136 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25137 if (CondCode == X86::COND_G)
25138 Shift = DAG.getNOT(DL, Shift, VT);
25139 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25140 }
25141 }
25142
25143 // Look past (and (setcc_carry (cmp ...)), 1).
25144 if (Cond.getOpcode() == ISD::AND &&
25145 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25146 isOneConstant(Cond.getOperand(1)))
25147 Cond = Cond.getOperand(0);
25148
25149 // Attempt to fold "raw cond" cases by treating them as:
25150 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25151 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25152 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25153 Subtarget))
25154 return R;
25155
25156 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25157 // setting operand in place of the X86ISD::SETCC.
25158 unsigned CondOpcode = Cond.getOpcode();
25159 if (CondOpcode == X86ISD::SETCC ||
25160 CondOpcode == X86ISD::SETCC_CARRY) {
25161 CC = Cond.getOperand(0);
25162
25163 SDValue Cmp = Cond.getOperand(1);
25164 bool IllegalFPCMov = false;
25165 if (VT.isFloatingPoint() && !VT.isVector() &&
25166 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25167 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25168
25169 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25170 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25171 Cond = Cmp;
25172 AddTest = false;
25173 }
25174 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25175 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25176 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25177 SDValue Value;
25178 X86::CondCode X86Cond;
25179 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25180
25181 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25182 AddTest = false;
25183 }
25184
25185 if (AddTest) {
25186 // Look past the truncate if the high bits are known zero.
25187 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25188 Cond = Cond.getOperand(0);
25189
25190 // We know the result of AND is compared against zero. Try to match
25191 // it to BT.
25192 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25193 X86::CondCode X86CondCode;
25194 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25195 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25196 Cond = BT;
25197 AddTest = false;
25198 }
25199 }
25200 }
25201
25202 if (AddTest) {
25203 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25204 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25205 }
25206
25207 // a < b ? -1 : 0 -> RES = ~setcc_carry
25208 // a < b ? 0 : -1 -> RES = setcc_carry
25209 // a >= b ? -1 : 0 -> RES = setcc_carry
25210 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25211 if (Cond.getOpcode() == X86ISD::SUB) {
25212 unsigned CondCode = CC->getAsZExtVal();
25213
25214 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25215 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25216 (isNullConstant(Op1) || isNullConstant(Op2))) {
25217 SDValue Res =
25218 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25219 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25220 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25221 return DAG.getNOT(DL, Res, Res.getValueType());
25222 return Res;
25223 }
25224 }
25225
25226 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25227 // widen the cmov and push the truncate through. This avoids introducing a new
25228 // branch during isel and doesn't add any extensions.
25229 if (Op.getValueType() == MVT::i8 &&
25230 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25231 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25232 if (T1.getValueType() == T2.getValueType() &&
25233 // Exclude CopyFromReg to avoid partial register stalls.
25234 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25235 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25236 CC, Cond);
25237 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25238 }
25239 }
25240
25241 // Or finally, promote i8 cmovs if we have CMOV,
25242 // or i16 cmovs if it won't prevent folding a load.
25243 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25244 // legal, but EmitLoweredSelect() can not deal with these extensions
25245 // being inserted between two CMOV's. (in i16 case too TBN)
25246 // https://bugs.llvm.org/show_bug.cgi?id=40974
25247 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25248 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25249 !X86::mayFoldLoad(Op2, Subtarget))) {
25250 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25251 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25252 SDValue Ops[] = { Op2, Op1, CC, Cond };
25253 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25254 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25255 }
25256
25257 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25258 // condition is true.
25259 SDValue Ops[] = { Op2, Op1, CC, Cond };
25260 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25261}
25262
25263 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25264 const X86Subtarget &Subtarget,
25265 SelectionDAG &DAG) {
25266 MVT VT = Op->getSimpleValueType(0);
25267 SDValue In = Op->getOperand(0);
25268 MVT InVT = In.getSimpleValueType();
25269 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25270 MVT VTElt = VT.getVectorElementType();
25271 unsigned NumElts = VT.getVectorNumElements();
25272
25273 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25274 MVT ExtVT = VT;
25275 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25276 // If v16i32 is to be avoided, we'll need to split and concatenate.
25277 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25278 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25279
25280 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25281 }
25282
25283 // Widen to 512-bits if VLX is not supported.
25284 MVT WideVT = ExtVT;
25285 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25286 NumElts *= 512 / ExtVT.getSizeInBits();
25287 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25288 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25289 DAG.getVectorIdxConstant(0, dl));
25290 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25291 }
25292
25293 SDValue V;
25294 MVT WideEltVT = WideVT.getVectorElementType();
25295 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25296 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25297 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25298 } else {
25299 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25300 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25301 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25302 }
25303
25304 // Truncate if we had to extend i16/i8 above.
25305 if (VT != ExtVT) {
25306 WideVT = MVT::getVectorVT(VTElt, NumElts);
25307 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25308 }
25309
25310 // Extract back to 128/256-bit if we widened.
25311 if (WideVT != VT)
25312 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25313 DAG.getVectorIdxConstant(0, dl));
25314
25315 return V;
25316}
25317
25318 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25319 SelectionDAG &DAG) {
25320 SDValue In = Op->getOperand(0);
25321 MVT InVT = In.getSimpleValueType();
25322 SDLoc DL(Op);
25323
25324 if (InVT.getVectorElementType() == MVT::i1)
25325 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25326
25327 assert(Subtarget.hasAVX() && "Expected AVX support");
25328 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25329}
25330
25331// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25332// For sign extend this needs to handle all vector sizes and SSE4.1 and
25333// non-SSE4.1 targets. For zero extend this should only handle inputs of
25334 // MVT::v64i8 when BWI is not supported, but AVX512 is.
25335 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25336 const X86Subtarget &Subtarget,
25337 SelectionDAG &DAG) {
25338 SDValue In = Op->getOperand(0);
25339 MVT VT = Op->getSimpleValueType(0);
25340 MVT InVT = In.getSimpleValueType();
25341
25342 MVT SVT = VT.getVectorElementType();
25343 MVT InSVT = InVT.getVectorElementType();
25345
25346 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25347 return SDValue();
25348 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25349 return SDValue();
25350 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25351 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25352 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25353 return SDValue();
25354
25355 SDLoc dl(Op);
25356 unsigned Opc = Op.getOpcode();
25357 unsigned NumElts = VT.getVectorNumElements();
25358
25359 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25360 // For 512-bit vectors, we need 128-bits or 256-bits.
25361 if (InVT.getSizeInBits() > 128) {
25362 // Input needs to be at least the same number of elements as output, and
25363 // at least 128-bits.
25364 int InSize = InSVT.getSizeInBits() * NumElts;
25365 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25366 InVT = In.getSimpleValueType();
25367 }
25368
25369 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25370 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25371 // need to be handled here for 256/512-bit results.
25372 if (Subtarget.hasInt256()) {
25373 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25374
25375 if (InVT.getVectorNumElements() != NumElts)
25376 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25377
25378 // FIXME: Apparently we create inreg operations that could be regular
25379 // extends.
25380 unsigned ExtOpc =
25381 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25382 : ISD::ZERO_EXTEND;
25383 return DAG.getNode(ExtOpc, dl, VT, In);
25384 }
25385
25386 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25387 if (Subtarget.hasAVX()) {
25388 assert(VT.is256BitVector() && "256-bit vector expected");
25389 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25390 int HalfNumElts = HalfVT.getVectorNumElements();
25391
25392 unsigned NumSrcElts = InVT.getVectorNumElements();
25393 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25394 for (int i = 0; i != HalfNumElts; ++i)
25395 HiMask[i] = HalfNumElts + i;
25396
25397 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25398 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25399 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25400 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25401 }
25402
25403 // We should only get here for sign extend.
25404 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25405 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25406 unsigned InNumElts = InVT.getVectorNumElements();
25407
25408 // If the source elements are already all-signbits, we don't need to extend,
25409 // just splat the elements.
25410 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25411 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25412 unsigned Scale = InNumElts / NumElts;
25413 SmallVector<int, 16> ShuffleMask;
25414 for (unsigned I = 0; I != NumElts; ++I)
25415 ShuffleMask.append(Scale, I);
25416 return DAG.getBitcast(VT,
25417 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25418 }
25419
25420 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25421 SDValue Curr = In;
25422 SDValue SignExt = Curr;
25423
25424 // As SRAI is only available on i16/i32 types, we expand only up to i32
25425 // and handle i64 separately.
25426 if (InVT != MVT::v4i32) {
25427 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25428
25429 unsigned DestWidth = DestVT.getScalarSizeInBits();
25430 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25431 unsigned DestElts = DestVT.getVectorNumElements();
25432
25433 // Build a shuffle mask that takes each input element and places it in the
25434 // MSBs of the new element size.
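// e.g. v16i8 -> v4i32: bytes 0..3 are placed at byte offsets 3, 7, 11 and 15,
// and the VSRAI by 24 below then sign-fills the low 24 bits of each dword.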
25435 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25436 for (unsigned i = 0; i != DestElts; ++i)
25437 Mask[i * Scale + (Scale - 1)] = i;
25438
25439 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25440 Curr = DAG.getBitcast(DestVT, Curr);
25441
25442 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25443 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25444 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25445 }
25446
25447 if (VT == MVT::v2i64) {
25448 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
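// (0 > Curr) yields each dword's sign mask; the {0, 4, 1, 5} interleave pairs
// every low dword with its sign mask to form the sign-extended i64 lanes.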
25449 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25450 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25451 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25452 SignExt = DAG.getBitcast(VT, SignExt);
25453 }
25454
25455 return SignExt;
25456}
25457
25458 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25459 SelectionDAG &DAG) {
25460 MVT VT = Op->getSimpleValueType(0);
25461 SDValue In = Op->getOperand(0);
25462 MVT InVT = In.getSimpleValueType();
25463 SDLoc dl(Op);
25464
25465 if (InVT.getVectorElementType() == MVT::i1)
25466 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25467
25468 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25469 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25470 "Expected same number of elements");
25471 assert((VT.getVectorElementType() == MVT::i16 ||
25472 VT.getVectorElementType() == MVT::i32 ||
25473 VT.getVectorElementType() == MVT::i64) &&
25474 "Unexpected element type");
25475 assert((InVT.getVectorElementType() == MVT::i8 ||
25476 InVT.getVectorElementType() == MVT::i16 ||
25477 InVT.getVectorElementType() == MVT::i32) &&
25478 "Unexpected element type");
25479
25480 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25481 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25482 return splitVectorIntUnary(Op, DAG, dl);
25483 }
25484
25485 if (Subtarget.hasInt256())
25486 return Op;
25487
25488 // Optimize vectors in AVX mode
25489 // Sign extend v8i16 to v8i32 and
25490 // v4i32 to v4i64
25491 //
25492 // Divide input vector into two parts
25493 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25494 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25495 // concat the vectors to original VT
25496 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25497 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25498
25499 unsigned NumElems = InVT.getVectorNumElements();
25500 SmallVector<int,8> ShufMask(NumElems, -1);
25501 for (unsigned i = 0; i != NumElems/2; ++i)
25502 ShufMask[i] = i + NumElems/2;
25503
25504 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25505 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25506
25507 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25508}
25509
25510 /// Change a vector store into a pair of half-size vector stores.
25511 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25512 SDValue StoredVal = Store->getValue();
25513 assert((StoredVal.getValueType().is256BitVector() ||
25514 StoredVal.getValueType().is512BitVector()) &&
25515 "Expecting 256/512-bit op");
25516
25517 // Splitting volatile memory ops is not allowed unless the operation was not
25518 // legal to begin with. Assume the input store is legal (this transform is
25519 // only used for targets with AVX). Note: It is possible that we have an
25520 // illegal type like v2i128, and so we could allow splitting a volatile store
25521 // in that case if that is important.
25522 if (!Store->isSimple())
25523 return SDValue();
25524
25525 SDLoc DL(Store);
25526 SDValue Value0, Value1;
25527 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25528 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25529 SDValue Ptr0 = Store->getBasePtr();
25530 SDValue Ptr1 =
25531 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25532 SDValue Ch0 =
25533 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25534 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25535 SDValue Ch1 =
25536 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25537 Store->getPointerInfo().getWithOffset(HalfOffset),
25538 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25539 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25540}
25541
25542/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25543 /// type.
25544 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25545 SelectionDAG &DAG) {
25546 SDValue StoredVal = Store->getValue();
25547 assert(StoreVT.is128BitVector() &&
25548 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25549 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25550
25551 // Splitting volatile memory ops is not allowed unless the operation was not
25552 // legal to begin with. We are assuming the input op is legal (this transform
25553 // is only used for targets with AVX).
25554 if (!Store->isSimple())
25555 return SDValue();
25556
25557 MVT StoreSVT = StoreVT.getScalarType();
25558 unsigned NumElems = StoreVT.getVectorNumElements();
25559 unsigned ScalarSize = StoreSVT.getStoreSize();
25560
25561 SDLoc DL(Store);
25562 SmallVector<SDValue, 4> Stores;
25563 for (unsigned i = 0; i != NumElems; ++i) {
25564 unsigned Offset = i * ScalarSize;
25565 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25566 TypeSize::getFixed(Offset), DL);
25567 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25568 DAG.getVectorIdxConstant(i, DL));
25569 SDValue Ch =
25570 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25571 Store->getPointerInfo().getWithOffset(Offset),
25572 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25573 Stores.push_back(Ch);
25574 }
25575 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25576}
25577
25578static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25579 SelectionDAG &DAG) {
25580 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25581 SDLoc dl(St);
25582 SDValue StoredVal = St->getValue();
25583
25584 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25585 if (StoredVal.getValueType().isVector() &&
25586 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25587 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25588 assert(NumElts <= 8 && "Unexpected VT");
25589 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25590 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25591 "Expected AVX512F without AVX512DQI");
25592
25593 // We must pad with zeros to ensure we store zeroes to any unused bits.
25594 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25595 DAG.getUNDEF(MVT::v16i1), StoredVal,
25596 DAG.getVectorIdxConstant(0, dl));
25597 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25598 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25599 // Make sure we store zeros in the extra bits.
25600 if (NumElts < 8)
25601 StoredVal = DAG.getZeroExtendInReg(
25602 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25603
25604 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25605 St->getPointerInfo(), St->getBaseAlign(),
25606 St->getMemOperand()->getFlags());
25607 }
25608
25609 if (St->isTruncatingStore())
25610 return SDValue();
25611
25612 // If this is a 256/512-bit store of concatenated ops, we are better off
25613 // splitting that store into two half-size stores. This avoids spurious use of
25614 // concatenated ops and each half can execute independently. Some cores would
25615 // split the op into halves anyway, so the concat is purely an extra op.
25616 MVT StoreVT = StoredVal.getSimpleValueType();
25617 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25618 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25619 return splitVectorStore(St, DAG);
25620 return SDValue();
25621 }
25622
25623 if (StoreVT.is32BitVector())
25624 return SDValue();
25625
25626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25627 assert(StoreVT.is64BitVector() && "Unexpected VT");
25628 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25629 TargetLowering::TypeWidenVector &&
25630 "Unexpected type action!");
25631
25632 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25633 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25634 DAG.getUNDEF(StoreVT));
25635
25636 if (Subtarget.hasSSE2()) {
25637 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25638 // and store it.
25639 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25640 MVT CastVT = MVT::getVectorVT(StVT, 2);
25641 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25642 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25643 DAG.getVectorIdxConstant(0, dl));
25644
25645 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25646 St->getPointerInfo(), St->getBaseAlign(),
25647 St->getMemOperand()->getFlags());
25648 }
25649 assert(Subtarget.hasSSE1() && "Expected SSE");
25650 SDVTList Tys = DAG.getVTList(MVT::Other);
25651 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25652 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25653 St->getMemOperand());
25654}
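// Illustrative sketch (hypothetical helper, not part of this lowering): the
// net effect of the v2i1/v4i1/v8i1 path above is to widen the mask to 16
// lanes, bitcast it to an integer, truncate it to a byte and clear the bits
// beyond NumElts so the unused bits are stored as zero.
static unsigned char packBoolVectorForStore(unsigned short WideBits,
                                            unsigned NumElts /* <= 8 */) {
  unsigned char Byte = (unsigned char)WideBits;   // ISD::TRUNCATE to i8
  if (NumElts < 8)
    Byte &= (unsigned char)((1u << NumElts) - 1); // zero-extend-in-reg: clear unused bits
  return Byte;                                    // value written by the byte store
}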
25655
25656// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25657// may emit an illegal shuffle but the expansion is still better than scalar
25658// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25659 // we'll emit a shuffle and an arithmetic shift.
25660// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25661// TODO: It is possible to support ZExt by zeroing the undef values during
25662// the shuffle phase or after the shuffle.
25663static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25664 SelectionDAG &DAG) {
25665 MVT RegVT = Op.getSimpleValueType();
25666 assert(RegVT.isVector() && "We only custom lower vector loads.");
25667 assert(RegVT.isInteger() &&
25668 "We only custom lower integer vector loads.");
25669
25670 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25671 SDLoc dl(Ld);
25672
25673 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25674 if (RegVT.getVectorElementType() == MVT::i1) {
25675 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25676 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25677 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25678 "Expected AVX512F without AVX512DQI");
25679
25680 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25681 Ld->getPointerInfo(), Ld->getBaseAlign(),
25682 Ld->getMemOperand()->getFlags());
25683
25684 // Replace chain users with the new chain.
25685 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25686
25687 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25688 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25689 DAG.getBitcast(MVT::v16i1, Val),
25690 DAG.getVectorIdxConstant(0, dl));
25691 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25692 }
25693
25694 return SDValue();
25695}
25696
25697/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25698/// each of which has no other use apart from the AND / OR.
25699static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25700 Opc = Op.getOpcode();
25701 if (Opc != ISD::OR && Opc != ISD::AND)
25702 return false;
25703 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25704 Op.getOperand(0).hasOneUse() &&
25705 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25706 Op.getOperand(1).hasOneUse());
25707}
25708
25709SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25710 SDValue Chain = Op.getOperand(0);
25711 SDValue Cond = Op.getOperand(1);
25712 SDValue Dest = Op.getOperand(2);
25713 SDLoc dl(Op);
25714
25715 // Bail out when we don't have native compare instructions.
25716 if (Cond.getOpcode() == ISD::SETCC &&
25717 Cond.getOperand(0).getValueType() != MVT::f128 &&
25718 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25719 SDValue LHS = Cond.getOperand(0);
25720 SDValue RHS = Cond.getOperand(1);
25721 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25722
25723 // Special case for
25724 // setcc([su]{add,sub,mul}o == 0)
25725 // setcc([su]{add,sub,mul}o != 1)
25726 if (ISD::isOverflowIntrOpRes(LHS) &&
25727 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25728 (isNullConstant(RHS) || isOneConstant(RHS))) {
25729 SDValue Value, Overflow;
25730 X86::CondCode X86Cond;
25731 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25732
25733 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25734 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25735
25736 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25737 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25738 Overflow, Op->getFlags());
25739 }
25740
25741 if (LHS.getSimpleValueType().isInteger()) {
25742 SDValue CCVal;
25743 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25744 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25745 EFLAGS, Op->getFlags());
25746 }
25747
25748 if (CC == ISD::SETOEQ) {
25749 // For FCMP_OEQ, we can emit
25750 // two branches instead of an explicit AND instruction with a
25751 // separate test. However, we only do this if this block doesn't
25752 // have a fall-through edge, because this requires an explicit
25753 // jmp when the condition is false.
25754 if (Op.getNode()->hasOneUse()) {
25755 SDNode *User = *Op.getNode()->user_begin();
25756 // Look for an unconditional branch following this conditional branch.
25757 // We need this because we need to reverse the successors in order
25758 // to implement FCMP_OEQ.
25759 if (User->getOpcode() == ISD::BR) {
25760 SDValue FalseBB = User->getOperand(1);
25761 SDNode *NewBR =
25762 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25763 assert(NewBR == User);
25764 (void)NewBR;
25765 Dest = FalseBB;
25766
25767 SDValue Cmp =
25768 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25769 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25770 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25771 CCVal, Cmp, Op->getFlags());
25772 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25773 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25774 Cmp, Op->getFlags());
25775 }
25776 }
25777 } else if (CC == ISD::SETUNE) {
25778 // For FCMP_UNE, we can emit
25779 // two branches instead of an explicit OR instruction with a
25780 // separate test.
25781 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25782 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25783 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25784 Cmp, Op->getFlags());
25785 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25786 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25787 Cmp, Op->getFlags());
25788 } else {
25789 X86::CondCode X86Cond =
25790 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25791 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25792 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25793 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25794 Cmp, Op->getFlags());
25795 }
25796 }
25797
25798 if (ISD::isOverflowIntrOpRes(Cond)) {
25799 SDValue Value, Overflow;
25800 X86::CondCode X86Cond;
25801 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25802
25803 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25804 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25805 Overflow, Op->getFlags());
25806 }
25807
25808 // Look past the truncate if the high bits are known zero.
25809 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25810 Cond = Cond.getOperand(0);
25811
25812 EVT CondVT = Cond.getValueType();
25813
25814 // Add an AND with 1 if we don't already have one.
25815 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25816 Cond =
25817 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25818
25819 SDValue LHS = Cond;
25820 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25821
25822 SDValue CCVal;
25823 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25824 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25825 Op->getFlags());
25826}
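// Worked example of the FCMP_OEQ case above (illustrative, assuming the block
// has no fall-through edge): ordered-equal is "ZF == 1 and PF == 0" after a
// UCOMIS[S|D]-style compare, so instead of materializing the AND of two flag
// tests the lowering reverses the successors and emits two branches to the
// false destination:
//   if (a == b) goto TrueBB; else goto FalseBB;
// becomes
//   JNE FalseBB      ; ZF == 0 -> not equal
//   JP  FalseBB      ; PF == 1 -> unordered (a NaN operand)
//   JMP TrueBB
// The FCMP_UNE case is the dual: both conditional branches target the true
// destination.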
25827
25828// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25829// Calls to _alloca are needed to probe the stack when allocating more than 4k
25830// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25831// that the guard pages used by the OS virtual memory manager are allocated in
25832// correct sequence.
25833SDValue
25834X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25835 SelectionDAG &DAG) const {
25836 MachineFunction &MF = DAG.getMachineFunction();
25837 bool SplitStack = MF.shouldSplitStack();
25838 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25839 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25840 SplitStack || EmitStackProbeCall;
25841 SDLoc dl(Op);
25842
25843 // Get the inputs.
25844 SDNode *Node = Op.getNode();
25845 SDValue Chain = Op.getOperand(0);
25846 SDValue Size = Op.getOperand(1);
25847 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25848 EVT VT = Node->getValueType(0);
25849
25850 // Chain the dynamic stack allocation so that it doesn't modify the stack
25851 // pointer when other instructions are using the stack.
25852 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25853
25854 bool Is64Bit = Subtarget.is64Bit();
25855 MVT SPTy = Op.getValueType().getSimpleVT();
25856
25857 SDValue Result;
25858 if (!Lower) {
25859 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25860 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25861 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25862 " not tell us which reg is the stack pointer!");
25863
25864 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25865 const Align StackAlign = TFI.getStackAlign();
25866 if (hasInlineStackProbe(MF)) {
25867 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25868 {Chain, Size});
25869 Chain = Result.getValue(1);
25870 } else {
25871 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25872 Chain = SP.getValue(1);
25873 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25874 }
25875 if (Alignment && *Alignment > StackAlign)
25876 Result = DAG.getNode(
25877 ISD::AND, dl, VT, Result,
25878 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25879 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25880 } else if (SplitStack) {
25881 if (Is64Bit) {
25882 // The 64-bit implementation of segmented stacks needs to clobber both r10
25883 // and r11. This makes it impossible to use it along with nested parameters.
25884 const Function &F = MF.getFunction();
25885 for (const auto &A : F.args()) {
25886 if (A.hasNestAttr())
25887 report_fatal_error("Cannot use segmented stacks with functions that "
25888 "have nested arguments.");
25889 }
25890 }
25891
25892 Result =
25893 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25894 Chain = Result.getValue(1);
25895 } else {
25896 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25897 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25898 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25899
25900 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25901 Register SPReg = RegInfo->getStackRegister();
25902 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25903 Chain = SP.getValue(1);
25904
25905 if (Alignment) {
25906 SP = DAG.getNode(
25907 ISD::AND, dl, VT, SP.getValue(0),
25908 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25909 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25910 }
25911
25912 Result = SP;
25913 }
25914
25915 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25916
25917 SDValue Ops[2] = {Result, Chain};
25918 return DAG.getMergeValues(Ops, dl);
25919}
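// Illustrative sketch (hypothetical helper): when an alignment is requested,
// the lowering above subtracts the size from the stack pointer and then rounds
// the result down to that alignment, which is what the emitted ISD::SUB /
// ISD::AND pair computes.
static unsigned long long allocaResult(unsigned long long SP,
                                       unsigned long long Size,
                                       unsigned long long Alignment) {
  unsigned long long Result = SP - Size;   // ISD::SUB of SP and the size
  if (Alignment > 1)
    Result &= ~(Alignment - 1ULL);         // round down to a multiple of Alignment
  return Result;                           // becomes the new stack pointer
}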
25920
25921SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25922 MachineFunction &MF = DAG.getMachineFunction();
25923 SDValue Ptr = Op.getOperand(1);
25924 EVT PtrVT = Ptr.getValueType();
25925
25926 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25927
25928 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25929 SDLoc DL(Op);
25930
25931 if (!Subtarget.is64Bit() ||
25932 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25933 // vastart just stores the address of the VarArgsFrameIndex slot into the
25934 // memory location argument.
25935 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25936 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25937 }
25938
25939 // __va_list_tag:
25940 // gp_offset (0 - 6 * 8)
25941 // fp_offset (48 - 48 + 8 * 16)
25942 // overflow_arg_area (point to parameters coming in memory).
25943 // reg_save_area
25944 SmallVector<SDValue, 8> MemOps;
25945 SDValue FIN = Op.getOperand(1);
25946 // Store gp_offset
25947 SDValue Store = DAG.getStore(
25948 Op.getOperand(0), DL,
25949 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25950 MachinePointerInfo(SV));
25951 MemOps.push_back(Store);
25952
25953 // Store fp_offset
25954 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25955 Store = DAG.getStore(
25956 Op.getOperand(0), DL,
25957 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25958 MachinePointerInfo(SV, 4));
25959 MemOps.push_back(Store);
25960
25961 // Store ptr to overflow_arg_area
25962 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25963 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25964 Store =
25965 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25966 MemOps.push_back(Store);
25967
25968 // Store ptr to reg_save_area.
25969 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25970 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25971 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25972 Store = DAG.getStore(
25973 Op.getOperand(0), DL, RSFIN, FIN,
25974 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25975 MemOps.push_back(Store);
25976 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25977}
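// For reference, the SysV x86-64 __va_list_tag initialized above looks like
// the following (field offsets 0/4/8/16 for LP64 match the stores emitted in
// LowerVASTART; on x32 the two pointers are 4 bytes, giving 0/4/8/12):
//   struct __va_list_tag {
//     unsigned gp_offset;       // offset of the next unused GPR slot in reg_save_area
//     unsigned fp_offset;       // offset of the next unused XMM slot in reg_save_area
//     void *overflow_arg_area;  // next stack-passed (memory) argument
//     void *reg_save_area;      // base of the register save area
//   };
// The whole struct is 24 bytes on LP64 and 16 bytes on x32, which is the
// amount LowerVACOPY below copies with a single memcpy.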
25978
25979SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25980 assert(Subtarget.is64Bit() &&
25981 "LowerVAARG only handles 64-bit va_arg!");
25982 assert(Op.getNumOperands() == 4);
25983
25984 MachineFunction &MF = DAG.getMachineFunction();
25985 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25986 // The Win64 ABI uses char* instead of a structure.
25987 return DAG.expandVAArg(Op.getNode());
25988
25989 SDValue Chain = Op.getOperand(0);
25990 SDValue SrcPtr = Op.getOperand(1);
25991 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25992 unsigned Align = Op.getConstantOperandVal(3);
25993 SDLoc dl(Op);
25994
25995 EVT ArgVT = Op.getNode()->getValueType(0);
25996 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25997 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25998 uint8_t ArgMode;
25999
26000 // Decide which area this value should be read from.
26001 // TODO: Implement the AMD64 ABI in its entirety. This simple
26002 // selection mechanism works only for the basic types.
26003 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26004 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26005 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26006 } else {
26007 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26008 "Unhandled argument type in LowerVAARG");
26009 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26010 }
26011
26012 if (ArgMode == 2) {
26013 // Make sure using fp_offset makes sense.
26014 assert(!Subtarget.useSoftFloat() &&
26015 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26016 Subtarget.hasSSE1());
26017 }
26018
26019 // Insert VAARG node into the DAG
26020 // VAARG returns two values: Variable Argument Address, Chain
26021 SDValue InstOps[] = {Chain, SrcPtr,
26022 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26023 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26024 DAG.getTargetConstant(Align, dl, MVT::i32)};
26025 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26026 SDValue VAARG = DAG.getMemIntrinsicNode(
26027 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26028 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26029 /*Alignment=*/std::nullopt,
26030 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26031 Chain = VAARG.getValue(1);
26032
26033 // Load the next argument and return it
26034 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26035}
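// Illustrative sketch (hypothetical helper) of the ArgMode selection above:
// floating-point values of at most 16 bytes are fetched through fp_offset
// (the XMM save area); everything else handled here is an integer of at most
// 32 bytes fetched through gp_offset.
static unsigned char classifyVAArgMode(bool IsFloatingPoint, unsigned ArgSize) {
  if (IsFloatingPoint && ArgSize <= 16)
    return 2; // Argument passed in XMM register. Use fp_offset.
  return 1;   // Argument passed in GPR64 register(s). Use gp_offset.
}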
26036
26037static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26038 SelectionDAG &DAG) {
26039 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26040 // where a va_list is still an i8*.
26041 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26042 if (Subtarget.isCallingConvWin64(
26043 DAG.getMachineFunction().getFunction().getCallingConv()))
26044 // Probably a Win64 va_copy.
26045 return DAG.expandVACopy(Op.getNode());
26046
26047 SDValue Chain = Op.getOperand(0);
26048 SDValue DstPtr = Op.getOperand(1);
26049 SDValue SrcPtr = Op.getOperand(2);
26050 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26051 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26052 SDLoc DL(Op);
26053
26054 return DAG.getMemcpy(
26055 Chain, DL, DstPtr, SrcPtr,
26056 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26057 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26058 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26059 MachinePointerInfo(SrcSV));
26060}
26061
26062// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26063static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26064 switch (Opc) {
26065 case ISD::SHL:
26066 case X86ISD::VSHL:
26067 case X86ISD::VSHLI:
26068 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26069 case ISD::SRL:
26070 case X86ISD::VSRL:
26071 case X86ISD::VSRLI:
26072 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26073 case ISD::SRA:
26074 case X86ISD::VSRA:
26075 case X86ISD::VSRAI:
26076 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26077 }
26078 llvm_unreachable("Unknown target vector shift node");
26079}
26080
26081/// Handle vector element shifts where the shift amount is a constant.
26082/// Takes immediate version of shift as input.
26083static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26084 SDValue SrcOp, uint64_t ShiftAmt,
26085 SelectionDAG &DAG) {
26086 MVT ElementType = VT.getVectorElementType();
26087
26088 // Bitcast the source vector to the output type, this is mainly necessary for
26089 // vXi8/vXi64 shifts.
26090 if (VT != SrcOp.getSimpleValueType())
26091 SrcOp = DAG.getBitcast(VT, SrcOp);
26092
26093 // Fold this packed shift into its first operand if ShiftAmt is 0.
26094 if (ShiftAmt == 0)
26095 return SrcOp;
26096
26097 // Check for ShiftAmt >= element width
26098 if (ShiftAmt >= ElementType.getSizeInBits()) {
26099 if (Opc == X86ISD::VSRAI)
26100 ShiftAmt = ElementType.getSizeInBits() - 1;
26101 else
26102 return DAG.getConstant(0, dl, VT);
26103 }
26104
26106 && "Unknown target vector shift-by-constant node");
26107
26108 // Fold this packed vector shift into a build vector if SrcOp is a
26109 // vector of Constants or UNDEFs.
26110 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26111 unsigned ShiftOpc;
26112 switch (Opc) {
26113 default: llvm_unreachable("Unknown opcode!");
26114 case X86ISD::VSHLI:
26115 ShiftOpc = ISD::SHL;
26116 break;
26117 case X86ISD::VSRLI:
26118 ShiftOpc = ISD::SRL;
26119 break;
26120 case X86ISD::VSRAI:
26121 ShiftOpc = ISD::SRA;
26122 break;
26123 }
26124
26125 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26126 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26127 return C;
26128 }
26129
26130 return DAG.getNode(Opc, dl, VT, SrcOp,
26131 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26132}
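// Illustrative sketch (hypothetical helper) of the out-of-range rule above:
// a logical shift by at least the element width yields zero (an out-of-range
// left shift likewise produces zero), while an arithmetic right shift clamps
// the amount to width - 1 so every result bit is a copy of the sign bit,
// matching the per-element behaviour of PSRL/PSLL/PSRA.
static int rightShiftElement32(int Val, unsigned Amt, bool Arithmetic) {
  const unsigned Width = 32;                 // element width in bits
  if (Amt >= Width) {
    if (!Arithmetic)
      return 0;                              // logical shift: every bit shifted out
    Amt = Width - 1;                         // arithmetic shift: clamp, replicating the sign bit
  }
  return Arithmetic ? (Val >> Amt) : (int)((unsigned)Val >> Amt);
}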
26133
26134/// Handle vector element shifts by a splat shift amount
26135static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26136 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26137 const X86Subtarget &Subtarget,
26138 SelectionDAG &DAG) {
26139 MVT AmtVT = ShAmt.getSimpleValueType();
26140 assert(AmtVT.isVector() && "Vector shift type mismatch");
26141 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26142 "Illegal vector splat index");
26143
26144 // Move the splat element to the bottom element.
26145 if (ShAmtIdx != 0) {
26146 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26147 Mask[0] = ShAmtIdx;
26148 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26149 }
26150
26151 // Peek through any zext node if we can get back to a 128-bit source.
26152 if (AmtVT.getScalarSizeInBits() == 64 &&
26153 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26154 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26155 ShAmt.getOperand(0).getValueType().isSimple() &&
26156 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26157 ShAmt = ShAmt.getOperand(0);
26158 AmtVT = ShAmt.getSimpleValueType();
26159 }
26160
26161 // See if we can mask off the upper elements using the existing source node.
26162 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26163 // do this for vXi64 types.
26164 bool IsMasked = false;
26165 if (AmtVT.getScalarSizeInBits() < 64) {
26166 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26167 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26168 // If the shift amount has come from a scalar, then zero-extend the scalar
26169 // before moving to the vector.
26170 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26171 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26172 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26173 AmtVT = MVT::v4i32;
26174 IsMasked = true;
26175 } else if (ShAmt.getOpcode() == ISD::AND) {
26176 // See if the shift amount is already masked (e.g. for rotation modulo),
26177 // then we can zero-extend it by setting all the other mask elements to
26178 // zero.
26179 SmallVector<SDValue> MaskElts(
26180 AmtVT.getVectorNumElements(),
26181 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26182 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26183 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26184 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26185 {ShAmt.getOperand(1), Mask}))) {
26186 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26187 IsMasked = true;
26188 }
26189 }
26190 }
26191
26192 // Extract if the shift amount vector is larger than 128-bits.
26193 if (AmtVT.getSizeInBits() > 128) {
26194 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26195 AmtVT = ShAmt.getSimpleValueType();
26196 }
26197
26198 // Zero-extend bottom element to v2i64 vector type, either by extension or
26199 // shuffle masking.
26200 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26201 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26202 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26203 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26204 } else if (Subtarget.hasSSE41()) {
26205 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26206 MVT::v2i64, ShAmt);
26207 } else {
26208 SDValue ByteShift = DAG.getTargetConstant(
26209 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26210 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26211 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26212 ByteShift);
26213 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26214 ByteShift);
26215 }
26216 }
26217
26218 // Change opcode to non-immediate version.
26219 Opc = getTargetVShiftUniformOpcode(Opc, true);
26220
26221 // The return type has to be a 128-bit type with the same element
26222 // type as the input type.
26223 MVT EltVT = VT.getVectorElementType();
26224 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26225
26226 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26227 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26228}
26229
26230/// Return Mask with the necessary casting or extending
26231/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26232static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26233 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26234 const SDLoc &dl) {
26235
26236 if (isAllOnesConstant(Mask))
26237 return DAG.getConstant(1, dl, MaskVT);
26238 if (X86::isZeroNode(Mask))
26239 return DAG.getConstant(0, dl, MaskVT);
26240
26241 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26242
26243 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26244 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26245 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26246 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
26247 SDValue Lo, Hi;
26248 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26249 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26250 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26251 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26252 } else {
26253 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26254 Mask.getSimpleValueType().getSizeInBits());
26255 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
26256 // are extracted by EXTRACT_SUBVECTOR.
26257 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26258 DAG.getBitcast(BitcastVT, Mask),
26259 DAG.getVectorIdxConstant(0, dl));
26260 }
26261}
26262
26263/// Return (and \p Op, \p Mask) for compare instructions or
26264/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26265/// necessary casting or extending for \p Mask when lowering masking intrinsics
26266 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26267 SDValue PreservedSrc,
26268 const X86Subtarget &Subtarget,
26269 SelectionDAG &DAG) {
26270 MVT VT = Op.getSimpleValueType();
26271 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26272 unsigned OpcodeSelect = ISD::VSELECT;
26273 SDLoc dl(Op);
26274
26275 if (isAllOnesConstant(Mask))
26276 return Op;
26277
26278 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26279
26280 if (PreservedSrc.isUndef())
26281 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26282 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26283}
26284
26285/// Creates an SDNode for a predicated scalar operation.
26286/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26287/// The mask is coming as MVT::i8 and it should be transformed
26288/// to MVT::v1i1 while lowering masking intrinsics.
26289/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26290/// "X86select" instead of "vselect". We just can't create the "vselect" node
26291/// for a scalar instruction.
26292 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26293 SDValue PreservedSrc,
26294 const X86Subtarget &Subtarget,
26295 SelectionDAG &DAG) {
26296 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26297 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26298 return Op;
26299
26300 MVT VT = Op.getSimpleValueType();
26301 SDLoc dl(Op);
26302
26303 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26304 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26305 DAG.getBitcast(MVT::v8i1, Mask),
26306 DAG.getVectorIdxConstant(0, dl));
26307 if (Op.getOpcode() == X86ISD::FSETCCM ||
26308 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26309 Op.getOpcode() == X86ISD::VFPCLASSS)
26310 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26311
26312 if (PreservedSrc.isUndef())
26313 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26314
26315 if (MaskConst) {
26316 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26317 // Discard op and blend passthrough with scalar op src/dst.
26318 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26319 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26320 ShuffleMask[0] = VT.getVectorNumElements();
26321 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26322 ShuffleMask);
26323 }
26324
26325 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26326}
26327
26328 static int getSEHRegistrationNodeSize(const Function *Fn) {
26329 if (!Fn->hasPersonalityFn())
26330 report_fatal_error(
26331 "querying registration node size for function without personality");
26332 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26333 // WinEHStatePass for the full struct definition.
26334 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26335 case EHPersonality::MSVC_X86SEH: return 24;
26336 case EHPersonality::MSVC_CXX: return 16;
26337 default: break;
26338 }
26340 "can only recover FP for 32-bit MSVC EH personality functions");
26341}
26342
26343/// When the MSVC runtime transfers control to us, either to an outlined
26344/// function or when returning to a parent frame after catching an exception, we
26345/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26346/// Here's the math:
26347/// RegNodeBase = EntryEBP - RegNodeSize
26348/// ParentFP = RegNodeBase - ParentFrameOffset
26349/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26350/// subtracting the offset (negative on x86) takes us back to the parent FP.
26351 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26352 SDValue EntryEBP) {
26353 MachineFunction &MF = DAG.getMachineFunction();
26354 SDLoc dl;
26355
26356 // It's possible that the parent function no longer has a personality function
26357 // if the exceptional code was optimized away, in which case we just return
26358 // the incoming EBP.
26359 if (!Fn->hasPersonalityFn())
26360 return EntryEBP;
26361
26362 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26363 // registration, or the .set_setframe offset.
26364 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26365 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26366 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26367 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26368 SDValue ParentFrameOffset =
26369 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26370
26371 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26372 // prologue to RBP in the parent function.
26373 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26374 if (Subtarget.is64Bit())
26375 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26376
26377 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26378 // RegNodeBase = EntryEBP - RegNodeSize
26379 // ParentFP = RegNodeBase - ParentFrameOffset
26380 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26381 DAG.getConstant(RegNodeSize, dl, PtrVT));
26382 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26383}
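// Worked example of the 32-bit arithmetic above (hypothetical numbers): with
// an MSVC C++ EH personality RegNodeSize is 16, so for EntryEBP = 0x1000 and
// a recorded ParentFrameOffset of -0x40:
//   RegNodeBase = 0x1000 - 16      = 0x0FF0
//   ParentFP    = 0x0FF0 - (-0x40) = 0x1030
// i.e. subtracting the (negative) offset moves back up to the parent frame.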
26384
26385SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26386 SelectionDAG &DAG) const {
26387 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26388 auto isRoundModeCurDirection = [](SDValue Rnd) {
26389 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26390 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26391
26392 return false;
26393 };
26394 auto isRoundModeSAE = [](SDValue Rnd) {
26395 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26396 unsigned RC = C->getZExtValue();
26397 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26398 // Clear the NO_EXC bit and check remaining bits.
26399 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26400 // As a convenience we allow no other bits or explicitly
26401 // current direction.
26402 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26403 }
26404 }
26405
26406 return false;
26407 };
26408 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26409 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26410 RC = C->getZExtValue();
26411 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26412 // Clear the NO_EXC bit and check remaining bits.
26413 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26414 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26415 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26416 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26417 RC == X86::STATIC_ROUNDING::TO_ZERO;
26418 }
26419 }
26420
26421 return false;
26422 };
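  // Worked example for the three helpers above, using the X86::STATIC_ROUNDING
  // encoding that mirrors the _MM_FROUND_* immediates (TO_NEAREST_INT = 0,
  // TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3, CUR_DIRECTION = 4, NO_EXC = 8):
  //   TO_ZERO | NO_EXC (11) -> isRoundModeSAEToX succeeds with RC = TO_ZERO
  //   NO_EXC alone      (8) -> isRoundModeSAE succeeds (no rounding override)
  //   CUR_DIRECTION     (4) -> isRoundModeCurDirection succeeds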
26423
26424 SDLoc dl(Op);
26425 unsigned IntNo = Op.getConstantOperandVal(0);
26426 MVT VT = Op.getSimpleValueType();
26427 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26428
26429 // Propagate flags from original node to transformed node(s).
26430 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26431
26432 if (IntrData) {
26433 switch(IntrData->Type) {
26434 case INTR_TYPE_1OP: {
26435 // We specify 2 possible opcodes for intrinsics with rounding modes.
26436 // First, we check if the intrinsic may have non-default rounding mode,
26437 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26438 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26439 if (IntrWithRoundingModeOpcode != 0) {
26440 SDValue Rnd = Op.getOperand(2);
26441 unsigned RC = 0;
26442 if (isRoundModeSAEToX(Rnd, RC))
26443 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26444 Op.getOperand(1),
26445 DAG.getTargetConstant(RC, dl, MVT::i32));
26446 if (!isRoundModeCurDirection(Rnd))
26447 return SDValue();
26448 }
26449 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26450 Op.getOperand(1));
26451 }
26452 case INTR_TYPE_1OP_SAE: {
26453 SDValue Sae = Op.getOperand(2);
26454
26455 unsigned Opc;
26456 if (isRoundModeCurDirection(Sae))
26457 Opc = IntrData->Opc0;
26458 else if (isRoundModeSAE(Sae))
26459 Opc = IntrData->Opc1;
26460 else
26461 return SDValue();
26462
26463 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26464 }
26465 case INTR_TYPE_2OP: {
26466 SDValue Src2 = Op.getOperand(2);
26467
26468 // We specify 2 possible opcodes for intrinsics with rounding modes.
26469 // First, we check if the intrinsic may have non-default rounding mode,
26470 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26471 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26472 if (IntrWithRoundingModeOpcode != 0) {
26473 SDValue Rnd = Op.getOperand(3);
26474 unsigned RC = 0;
26475 if (isRoundModeSAEToX(Rnd, RC))
26476 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26477 Op.getOperand(1), Src2,
26478 DAG.getTargetConstant(RC, dl, MVT::i32));
26479 if (!isRoundModeCurDirection(Rnd))
26480 return SDValue();
26481 }
26482
26483 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26484 Op.getOperand(1), Src2);
26485 }
26486 case INTR_TYPE_2OP_SAE: {
26487 SDValue Sae = Op.getOperand(3);
26488
26489 unsigned Opc;
26490 if (isRoundModeCurDirection(Sae))
26491 Opc = IntrData->Opc0;
26492 else if (isRoundModeSAE(Sae))
26493 Opc = IntrData->Opc1;
26494 else
26495 return SDValue();
26496
26497 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26498 Op.getOperand(2));
26499 }
26500 case INTR_TYPE_3OP:
26501 case INTR_TYPE_3OP_IMM8: {
26502 SDValue Src1 = Op.getOperand(1);
26503 SDValue Src2 = Op.getOperand(2);
26504 SDValue Src3 = Op.getOperand(3);
26505
26506 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26507 Src3.getValueType() != MVT::i8) {
26508 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26509 }
26510
26511 // We specify 2 possible opcodes for intrinsics with rounding modes.
26512 // First, we check if the intrinsic may have non-default rounding mode,
26513 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26514 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26515 if (IntrWithRoundingModeOpcode != 0) {
26516 SDValue Rnd = Op.getOperand(4);
26517 unsigned RC = 0;
26518 if (isRoundModeSAEToX(Rnd, RC))
26519 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26520 Src1, Src2, Src3,
26521 DAG.getTargetConstant(RC, dl, MVT::i32));
26522 if (!isRoundModeCurDirection(Rnd))
26523 return SDValue();
26524 }
26525
26526 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26527 {Src1, Src2, Src3});
26528 }
26529 case INTR_TYPE_4OP_IMM8: {
26530 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26531 SDValue Src4 = Op.getOperand(4);
26532 if (Src4.getValueType() != MVT::i8) {
26533 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26534 }
26535
26536 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26537 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26538 Src4);
26539 }
26540 case INTR_TYPE_1OP_MASK: {
26541 SDValue Src = Op.getOperand(1);
26542 SDValue PassThru = Op.getOperand(2);
26543 SDValue Mask = Op.getOperand(3);
26544 // We add rounding mode to the Node when
26545 // - RC Opcode is specified and
26546 // - RC is not "current direction".
26547 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26548 if (IntrWithRoundingModeOpcode != 0) {
26549 SDValue Rnd = Op.getOperand(4);
26550 unsigned RC = 0;
26551 if (isRoundModeSAEToX(Rnd, RC))
26552 return getVectorMaskingNode(
26553 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26554 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26555 Mask, PassThru, Subtarget, DAG);
26556 if (!isRoundModeCurDirection(Rnd))
26557 return SDValue();
26558 }
26559 return getVectorMaskingNode(
26560 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26561 Subtarget, DAG);
26562 }
26563 case INTR_TYPE_1OP_MASK_SAE: {
26564 SDValue Src = Op.getOperand(1);
26565 SDValue PassThru = Op.getOperand(2);
26566 SDValue Mask = Op.getOperand(3);
26567 SDValue Rnd = Op.getOperand(4);
26568
26569 unsigned Opc;
26570 if (isRoundModeCurDirection(Rnd))
26571 Opc = IntrData->Opc0;
26572 else if (isRoundModeSAE(Rnd))
26573 Opc = IntrData->Opc1;
26574 else
26575 return SDValue();
26576
26577 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26578 Subtarget, DAG);
26579 }
26580 case INTR_TYPE_SCALAR_MASK: {
26581 SDValue Src1 = Op.getOperand(1);
26582 SDValue Src2 = Op.getOperand(2);
26583 SDValue passThru = Op.getOperand(3);
26584 SDValue Mask = Op.getOperand(4);
26585 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26586 // There are 2 kinds of intrinsics in this group:
26587 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26588 // (2) With rounding mode and sae - 7 operands.
26589 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26590 if (Op.getNumOperands() == (5U + HasRounding)) {
26591 if (HasRounding) {
26592 SDValue Rnd = Op.getOperand(5);
26593 unsigned RC = 0;
26594 if (isRoundModeSAEToX(Rnd, RC))
26595 return getScalarMaskingNode(
26596 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26597 DAG.getTargetConstant(RC, dl, MVT::i32)),
26598 Mask, passThru, Subtarget, DAG);
26599 if (!isRoundModeCurDirection(Rnd))
26600 return SDValue();
26601 }
26602 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26603 Src2),
26604 Mask, passThru, Subtarget, DAG);
26605 }
26606
26607 assert(Op.getNumOperands() == (6U + HasRounding) &&
26608 "Unexpected intrinsic form");
26609 SDValue RoundingMode = Op.getOperand(5);
26610 unsigned Opc = IntrData->Opc0;
26611 if (HasRounding) {
26612 SDValue Sae = Op.getOperand(6);
26613 if (isRoundModeSAE(Sae))
26614 Opc = IntrWithRoundingModeOpcode;
26615 else if (!isRoundModeCurDirection(Sae))
26616 return SDValue();
26617 }
26618 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26619 Src2, RoundingMode),
26620 Mask, passThru, Subtarget, DAG);
26621 }
26622 case INTR_TYPE_SCALAR_MASK_RND: {
26623 SDValue Src1 = Op.getOperand(1);
26624 SDValue Src2 = Op.getOperand(2);
26625 SDValue passThru = Op.getOperand(3);
26626 SDValue Mask = Op.getOperand(4);
26627 SDValue Rnd = Op.getOperand(5);
26628
26629 SDValue NewOp;
26630 unsigned RC = 0;
26631 if (isRoundModeCurDirection(Rnd))
26632 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26633 else if (isRoundModeSAEToX(Rnd, RC))
26634 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26635 DAG.getTargetConstant(RC, dl, MVT::i32));
26636 else
26637 return SDValue();
26638
26639 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26640 }
26641 case INTR_TYPE_SCALAR_MASK_SAE: {
26642 SDValue Src1 = Op.getOperand(1);
26643 SDValue Src2 = Op.getOperand(2);
26644 SDValue passThru = Op.getOperand(3);
26645 SDValue Mask = Op.getOperand(4);
26646 SDValue Sae = Op.getOperand(5);
26647 unsigned Opc;
26648 if (isRoundModeCurDirection(Sae))
26649 Opc = IntrData->Opc0;
26650 else if (isRoundModeSAE(Sae))
26651 Opc = IntrData->Opc1;
26652 else
26653 return SDValue();
26654
26655 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26656 Mask, passThru, Subtarget, DAG);
26657 }
26658 case INTR_TYPE_2OP_MASK: {
26659 SDValue Src1 = Op.getOperand(1);
26660 SDValue Src2 = Op.getOperand(2);
26661 SDValue PassThru = Op.getOperand(3);
26662 SDValue Mask = Op.getOperand(4);
26663 SDValue NewOp;
26664 if (IntrData->Opc1 != 0) {
26665 SDValue Rnd = Op.getOperand(5);
26666 unsigned RC = 0;
26667 if (isRoundModeSAEToX(Rnd, RC))
26668 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26669 DAG.getTargetConstant(RC, dl, MVT::i32));
26670 else if (!isRoundModeCurDirection(Rnd))
26671 return SDValue();
26672 }
26673 if (!NewOp)
26674 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26675 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26676 }
26677 case INTR_TYPE_2OP_MASK_SAE: {
26678 SDValue Src1 = Op.getOperand(1);
26679 SDValue Src2 = Op.getOperand(2);
26680 SDValue PassThru = Op.getOperand(3);
26681 SDValue Mask = Op.getOperand(4);
26682
26683 unsigned Opc = IntrData->Opc0;
26684 if (IntrData->Opc1 != 0) {
26685 SDValue Sae = Op.getOperand(5);
26686 if (isRoundModeSAE(Sae))
26687 Opc = IntrData->Opc1;
26688 else if (!isRoundModeCurDirection(Sae))
26689 return SDValue();
26690 }
26691
26692 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26693 Mask, PassThru, Subtarget, DAG);
26694 }
26695 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26696 SDValue Src1 = Op.getOperand(1);
26697 SDValue Src2 = Op.getOperand(2);
26698 SDValue Src3 = Op.getOperand(3);
26699 SDValue PassThru = Op.getOperand(4);
26700 SDValue Mask = Op.getOperand(5);
26701 SDValue Sae = Op.getOperand(6);
26702 unsigned Opc;
26703 if (isRoundModeCurDirection(Sae))
26704 Opc = IntrData->Opc0;
26705 else if (isRoundModeSAE(Sae))
26706 Opc = IntrData->Opc1;
26707 else
26708 return SDValue();
26709
26710 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26711 Mask, PassThru, Subtarget, DAG);
26712 }
26713 case INTR_TYPE_3OP_MASK_SAE: {
26714 SDValue Src1 = Op.getOperand(1);
26715 SDValue Src2 = Op.getOperand(2);
26716 SDValue Src3 = Op.getOperand(3);
26717 SDValue PassThru = Op.getOperand(4);
26718 SDValue Mask = Op.getOperand(5);
26719
26720 unsigned Opc = IntrData->Opc0;
26721 if (IntrData->Opc1 != 0) {
26722 SDValue Sae = Op.getOperand(6);
26723 if (isRoundModeSAE(Sae))
26724 Opc = IntrData->Opc1;
26725 else if (!isRoundModeCurDirection(Sae))
26726 return SDValue();
26727 }
26728 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26729 Mask, PassThru, Subtarget, DAG);
26730 }
26731 case BLENDV: {
26732 SDValue Src1 = Op.getOperand(1);
26733 SDValue Src2 = Op.getOperand(2);
26734 SDValue Src3 = Op.getOperand(3);
26735
26736 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26737 Src3 = DAG.getBitcast(MaskVT, Src3);
26738
26739 // Reverse the operands to match VSELECT order.
26740 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26741 }
26742 case VPERM_2OP : {
26743 SDValue Src1 = Op.getOperand(1);
26744 SDValue Src2 = Op.getOperand(2);
26745
26746 // Swap Src1 and Src2 in the node creation
26747 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26748 }
26749 case CFMA_OP_MASKZ:
26750 case CFMA_OP_MASK: {
26751 SDValue Src1 = Op.getOperand(1);
26752 SDValue Src2 = Op.getOperand(2);
26753 SDValue Src3 = Op.getOperand(3);
26754 SDValue Mask = Op.getOperand(4);
26755 MVT VT = Op.getSimpleValueType();
26756
26757 SDValue PassThru = Src3;
26758 if (IntrData->Type == CFMA_OP_MASKZ)
26759 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26760
26761 // We add rounding mode to the Node when
26762 // - RC Opcode is specified and
26763 // - RC is not "current direction".
26764 SDValue NewOp;
26765 if (IntrData->Opc1 != 0) {
26766 SDValue Rnd = Op.getOperand(5);
26767 unsigned RC = 0;
26768 if (isRoundModeSAEToX(Rnd, RC))
26769 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26770 DAG.getTargetConstant(RC, dl, MVT::i32));
26771 else if (!isRoundModeCurDirection(Rnd))
26772 return SDValue();
26773 }
26774 if (!NewOp)
26775 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26776 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26777 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26778 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26779 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26780 }
26781 case IFMA_OP:
26782 // NOTE: We need to swizzle the operands to pass the multiply operands
26783 // first.
26784 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26785 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26786 case FPCLASSS: {
26787 SDValue Src1 = Op.getOperand(1);
26788 SDValue Imm = Op.getOperand(2);
26789 SDValue Mask = Op.getOperand(3);
26790 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26791 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26792 Subtarget, DAG);
26793 // Need to fill with zeros to ensure the bitcast will produce zeroes
26794 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26795 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26796 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26797 DAG.getVectorIdxConstant(0, dl));
26798 return DAG.getBitcast(MVT::i8, Ins);
26799 }
26800
26801 case CMP_MASK_CC: {
26802 MVT MaskVT = Op.getSimpleValueType();
26803 SDValue CC = Op.getOperand(3);
26804 SDValue Mask = Op.getOperand(4);
26805 // We specify 2 possible opcodes for intrinsics with rounding modes.
26806 // First, we check if the intrinsic may have non-default rounding mode,
26807 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26808 if (IntrData->Opc1 != 0) {
26809 SDValue Sae = Op.getOperand(5);
26810 if (isRoundModeSAE(Sae))
26811 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26812 Op.getOperand(2), CC, Mask, Sae);
26813 if (!isRoundModeCurDirection(Sae))
26814 return SDValue();
26815 }
26816 //default rounding mode
26817 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26818 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26819 }
26820 case CMP_MASK_SCALAR_CC: {
26821 SDValue Src1 = Op.getOperand(1);
26822 SDValue Src2 = Op.getOperand(2);
26823 SDValue CC = Op.getOperand(3);
26824 SDValue Mask = Op.getOperand(4);
26825
26826 SDValue Cmp;
26827 if (IntrData->Opc1 != 0) {
26828 SDValue Sae = Op.getOperand(5);
26829 if (isRoundModeSAE(Sae))
26830 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26831 else if (!isRoundModeCurDirection(Sae))
26832 return SDValue();
26833 }
26834 //default rounding mode
26835 if (!Cmp.getNode())
26836 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26837
26838 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26839 Subtarget, DAG);
26840 // Need to fill with zeros to ensure the bitcast will produce zeroes
26841 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26842 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26843 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26844 DAG.getVectorIdxConstant(0, dl));
26845 return DAG.getBitcast(MVT::i8, Ins);
26846 }
26847 case COMI: { // Comparison intrinsics
26848 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26849 SDValue LHS = Op.getOperand(1);
26850 SDValue RHS = Op.getOperand(2);
26851 // Some conditions require the operands to be swapped.
26852 if (CC == ISD::SETLT || CC == ISD::SETLE)
26853 std::swap(LHS, RHS);
26854
26855 // For AVX10.2, support EQ and NE.
26856 bool HasAVX10_2_COMX =
26857 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26858
26859 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26860 // For BF type we need to fall back.
26861 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26862
26863 auto ComiOpCode = IntrData->Opc0;
26864 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26865
26866 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26867 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26868
26869 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26870
26871 SDValue SetCC;
26872 switch (CC) {
26873 case ISD::SETEQ: {
26874 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26875 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26876 break;
26877 // (ZF = 1 and PF = 0)
26878 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26879 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26880 break;
26881 }
26882 case ISD::SETNE: {
26883 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26884 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26885 break;
26886 // (ZF = 0 or PF = 1)
26887 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26888 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26889 break;
26890 }
26891 case ISD::SETGT: // (CF = 0 and ZF = 0)
26892 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26893 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26894 break;
26895 }
26896 case ISD::SETGE: // CF = 0
26897 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26898 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26899 break;
26900 default:
26901 llvm_unreachable("Unexpected illegal condition!");
26902 }
26903 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26904 }
26905 case COMI_RM: { // Comparison intrinsics with Sae
26906 SDValue LHS = Op.getOperand(1);
26907 SDValue RHS = Op.getOperand(2);
26908 unsigned CondVal = Op.getConstantOperandVal(3);
26909 SDValue Sae = Op.getOperand(4);
26910
26911 SDValue FCmp;
26912 if (isRoundModeCurDirection(Sae))
26913 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26914 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26915 else if (isRoundModeSAE(Sae))
26916 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26917 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26918 else
26919 return SDValue();
26920 // Need to fill with zeros to ensure the bitcast will produce zeroes
26921 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26922 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26923 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26924 DAG.getVectorIdxConstant(0, dl));
26925 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26926 DAG.getBitcast(MVT::i16, Ins));
26927 }
26928 case VSHIFT: {
26929 SDValue SrcOp = Op.getOperand(1);
26930 SDValue ShAmt = Op.getOperand(2);
26931 assert(ShAmt.getValueType() == MVT::i32 &&
26932 "Unexpected VSHIFT amount type");
26933
26934 // Catch shift-by-constant.
26935 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26936 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26937 Op.getSimpleValueType(), SrcOp,
26938 CShAmt->getZExtValue(), DAG);
26939
26940 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26941 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26942 SrcOp, ShAmt, 0, Subtarget, DAG);
26943 }
26944 case COMPRESS_EXPAND_IN_REG: {
26945 SDValue Mask = Op.getOperand(3);
26946 SDValue DataToCompress = Op.getOperand(1);
26947 SDValue PassThru = Op.getOperand(2);
26948 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26949 return Op.getOperand(1);
26950
26951 // Avoid false dependency.
26952 if (PassThru.isUndef())
26953 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26954
26955 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26956 Mask);
26957 }
26958 case FIXUPIMM:
26959 case FIXUPIMM_MASKZ: {
26960 SDValue Src1 = Op.getOperand(1);
26961 SDValue Src2 = Op.getOperand(2);
26962 SDValue Src3 = Op.getOperand(3);
26963 SDValue Imm = Op.getOperand(4);
26964 SDValue Mask = Op.getOperand(5);
26965 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26966 ? Src1
26967 : getZeroVector(VT, Subtarget, DAG, dl);
26968
26969 unsigned Opc = IntrData->Opc0;
26970 if (IntrData->Opc1 != 0) {
26971 SDValue Sae = Op.getOperand(6);
26972 if (isRoundModeSAE(Sae))
26973 Opc = IntrData->Opc1;
26974 else if (!isRoundModeCurDirection(Sae))
26975 return SDValue();
26976 }
26977
26978 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26979
26981 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26982
26983 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26984 }
26985 case ROUNDP: {
26986 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26987 // Clear the upper bits of the rounding immediate so that the legacy
26988 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26989 uint64_t Round = Op.getConstantOperandVal(2);
26990 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26991 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26992 Op.getOperand(1), RoundingMode);
26993 }
26994 case ROUNDS: {
26995 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26996 // Clear the upper bits of the rounding immediate so that the legacy
26997 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26998 uint64_t Round = Op.getConstantOperandVal(3);
26999 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
27000 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27001 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27002 }
27003 case BEXTRI: {
27004 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27005
27006 uint64_t Imm = Op.getConstantOperandVal(2);
27007 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27008 Op.getValueType());
27009 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27010 Op.getOperand(1), Control);
27011 }
27012 // ADC/SBB
27013 case ADX: {
27014 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27015 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27016
27017 SDValue Res;
27018 // If the carry in is zero, then we should just use ADD/SUB instead of
27019 // ADC/SBB.
27020 if (isNullConstant(Op.getOperand(1))) {
27021 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27022 Op.getOperand(3));
27023 } else {
27024 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27025 DAG.getAllOnesConstant(dl, MVT::i8));
27026 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27027 Op.getOperand(3), GenCF.getValue(1));
27028 }
27029 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27030 SDValue Results[] = { SetCC, Res };
27031 return DAG.getMergeValues(Results, dl);
27032 }
27033 case CVTPD2PS_MASK:
27034 case CVTPD2DQ_MASK:
27035 case CVTQQ2PS_MASK:
27036 case TRUNCATE_TO_REG: {
27037 SDValue Src = Op.getOperand(1);
27038 SDValue PassThru = Op.getOperand(2);
27039 SDValue Mask = Op.getOperand(3);
27040
27041 if (isAllOnesConstant(Mask))
27042 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27043
27044 MVT SrcVT = Src.getSimpleValueType();
27045 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27046 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27047 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27048 {Src, PassThru, Mask});
27049 }
27050 case TRUNCATE2_TO_REG: {
27051 SDValue Src = Op.getOperand(1);
27052 SDValue Src2 = Op.getOperand(2);
27053 SDValue PassThru = Op.getOperand(3);
27054 SDValue Mask = Op.getOperand(4);
27055
27056 if (isAllOnesConstant(Mask))
27057 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27058
27059 MVT Src2VT = Src2.getSimpleValueType();
27060 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27061 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27062 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27063 {Src, Src2, PassThru, Mask});
27064 }
27065 case CVTPS2PH_MASK: {
27066 SDValue Src = Op.getOperand(1);
27067 SDValue Rnd = Op.getOperand(2);
27068 SDValue PassThru = Op.getOperand(3);
27069 SDValue Mask = Op.getOperand(4);
27070
27071 unsigned RC = 0;
27072 unsigned Opc = IntrData->Opc0;
27073 bool SAE = Src.getValueType().is512BitVector() &&
27074 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27075 if (SAE) {
27076 Opc = X86ISD::CVTPS2PH_SAE;
27077 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27078 }
27079
27080 if (isAllOnesConstant(Mask))
27081 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27082
27083 if (SAE)
27084 Opc = X86ISD::MCVTPS2PH_SAE;
27085 else
27086 Opc = IntrData->Opc1;
27087 MVT SrcVT = Src.getSimpleValueType();
27088 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27089 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27090 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27091 }
27092 case CVTNEPS2BF16_MASK: {
27093 SDValue Src = Op.getOperand(1);
27094 SDValue PassThru = Op.getOperand(2);
27095 SDValue Mask = Op.getOperand(3);
27096
27097 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27098 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27099
27100 // Break false dependency.
27101 if (PassThru.isUndef())
27102 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27103
27104 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27105 Mask);
27106 }
27107 default:
27108 break;
27109 }
27110 }
27111
27112 switch (IntNo) {
27113 default: return SDValue(); // Don't custom lower most intrinsics.
27114
27115 // ptest and testp intrinsics. The intrinsic these come from are designed to
27116 // return an integer value, not just an instruction so lower it to the ptest
27117 // or testp pattern and a setcc for the result.
27118 case Intrinsic::x86_avx512_ktestc_b:
27119 case Intrinsic::x86_avx512_ktestc_w:
27120 case Intrinsic::x86_avx512_ktestc_d:
27121 case Intrinsic::x86_avx512_ktestc_q:
27122 case Intrinsic::x86_avx512_ktestz_b:
27123 case Intrinsic::x86_avx512_ktestz_w:
27124 case Intrinsic::x86_avx512_ktestz_d:
27125 case Intrinsic::x86_avx512_ktestz_q:
27126 case Intrinsic::x86_sse41_ptestz:
27127 case Intrinsic::x86_sse41_ptestc:
27128 case Intrinsic::x86_sse41_ptestnzc:
27129 case Intrinsic::x86_avx_ptestz_256:
27130 case Intrinsic::x86_avx_ptestc_256:
27131 case Intrinsic::x86_avx_ptestnzc_256:
27132 case Intrinsic::x86_avx_vtestz_ps:
27133 case Intrinsic::x86_avx_vtestc_ps:
27134 case Intrinsic::x86_avx_vtestnzc_ps:
27135 case Intrinsic::x86_avx_vtestz_pd:
27136 case Intrinsic::x86_avx_vtestc_pd:
27137 case Intrinsic::x86_avx_vtestnzc_pd:
27138 case Intrinsic::x86_avx_vtestz_ps_256:
27139 case Intrinsic::x86_avx_vtestc_ps_256:
27140 case Intrinsic::x86_avx_vtestnzc_ps_256:
27141 case Intrinsic::x86_avx_vtestz_pd_256:
27142 case Intrinsic::x86_avx_vtestc_pd_256:
27143 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27144 unsigned TestOpc = X86ISD::PTEST;
27145 X86::CondCode X86CC;
27146 switch (IntNo) {
27147 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27148 case Intrinsic::x86_avx512_ktestc_b:
27149 case Intrinsic::x86_avx512_ktestc_w:
27150 case Intrinsic::x86_avx512_ktestc_d:
27151 case Intrinsic::x86_avx512_ktestc_q:
27152 // CF = 1
27153 TestOpc = X86ISD::KTEST;
27154 X86CC = X86::COND_B;
27155 break;
27156 case Intrinsic::x86_avx512_ktestz_b:
27157 case Intrinsic::x86_avx512_ktestz_w:
27158 case Intrinsic::x86_avx512_ktestz_d:
27159 case Intrinsic::x86_avx512_ktestz_q:
27160 TestOpc = X86ISD::KTEST;
27161 X86CC = X86::COND_E;
27162 break;
27163 case Intrinsic::x86_avx_vtestz_ps:
27164 case Intrinsic::x86_avx_vtestz_pd:
27165 case Intrinsic::x86_avx_vtestz_ps_256:
27166 case Intrinsic::x86_avx_vtestz_pd_256:
27167 TestOpc = X86ISD::TESTP;
27168 [[fallthrough]];
27169 case Intrinsic::x86_sse41_ptestz:
27170 case Intrinsic::x86_avx_ptestz_256:
27171 // ZF = 1
27172 X86CC = X86::COND_E;
27173 break;
27174 case Intrinsic::x86_avx_vtestc_ps:
27175 case Intrinsic::x86_avx_vtestc_pd:
27176 case Intrinsic::x86_avx_vtestc_ps_256:
27177 case Intrinsic::x86_avx_vtestc_pd_256:
27178 TestOpc = X86ISD::TESTP;
27179 [[fallthrough]];
27180 case Intrinsic::x86_sse41_ptestc:
27181 case Intrinsic::x86_avx_ptestc_256:
27182 // CF = 1
27183 X86CC = X86::COND_B;
27184 break;
27185 case Intrinsic::x86_avx_vtestnzc_ps:
27186 case Intrinsic::x86_avx_vtestnzc_pd:
27187 case Intrinsic::x86_avx_vtestnzc_ps_256:
27188 case Intrinsic::x86_avx_vtestnzc_pd_256:
27189 TestOpc = X86ISD::TESTP;
27190 [[fallthrough]];
27191 case Intrinsic::x86_sse41_ptestnzc:
27192 case Intrinsic::x86_avx_ptestnzc_256:
27193 // ZF and CF = 0
27194 X86CC = X86::COND_A;
27195 break;
27196 }
27197
27198 SDValue LHS = Op.getOperand(1);
27199 SDValue RHS = Op.getOperand(2);
27200 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27201 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27202 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27203 }
27204
27205 case Intrinsic::x86_sse42_pcmpistria128:
27206 case Intrinsic::x86_sse42_pcmpestria128:
27207 case Intrinsic::x86_sse42_pcmpistric128:
27208 case Intrinsic::x86_sse42_pcmpestric128:
27209 case Intrinsic::x86_sse42_pcmpistrio128:
27210 case Intrinsic::x86_sse42_pcmpestrio128:
27211 case Intrinsic::x86_sse42_pcmpistris128:
27212 case Intrinsic::x86_sse42_pcmpestris128:
27213 case Intrinsic::x86_sse42_pcmpistriz128:
27214 case Intrinsic::x86_sse42_pcmpestriz128: {
27215 unsigned Opcode;
27216 X86::CondCode X86CC;
27217 switch (IntNo) {
27218 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27219 case Intrinsic::x86_sse42_pcmpistria128:
27220 Opcode = X86ISD::PCMPISTR;
27221 X86CC = X86::COND_A;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpestria128:
27224 Opcode = X86ISD::PCMPESTR;
27225 X86CC = X86::COND_A;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpistric128:
27228 Opcode = X86ISD::PCMPISTR;
27229 X86CC = X86::COND_B;
27230 break;
27231 case Intrinsic::x86_sse42_pcmpestric128:
27232 Opcode = X86ISD::PCMPESTR;
27233 X86CC = X86::COND_B;
27234 break;
27235 case Intrinsic::x86_sse42_pcmpistrio128:
27236 Opcode = X86ISD::PCMPISTR;
27237 X86CC = X86::COND_O;
27238 break;
27239 case Intrinsic::x86_sse42_pcmpestrio128:
27240 Opcode = X86ISD::PCMPESTR;
27241 X86CC = X86::COND_O;
27242 break;
27243 case Intrinsic::x86_sse42_pcmpistris128:
27244 Opcode = X86ISD::PCMPISTR;
27245 X86CC = X86::COND_S;
27246 break;
27247 case Intrinsic::x86_sse42_pcmpestris128:
27248 Opcode = X86ISD::PCMPESTR;
27249 X86CC = X86::COND_S;
27250 break;
27251 case Intrinsic::x86_sse42_pcmpistriz128:
27252 Opcode = X86ISD::PCMPISTR;
27253 X86CC = X86::COND_E;
27254 break;
27255 case Intrinsic::x86_sse42_pcmpestriz128:
27256 Opcode = X86ISD::PCMPESTR;
27257 X86CC = X86::COND_E;
27258 break;
27259 }
27260     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27261     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27262 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27263 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27264 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27265 }
27266
27267 case Intrinsic::x86_sse42_pcmpistri128:
27268 case Intrinsic::x86_sse42_pcmpestri128: {
27269 unsigned Opcode;
27270 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27271 Opcode = X86ISD::PCMPISTR;
27272 else
27273 Opcode = X86ISD::PCMPESTR;
27274
27275     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27276     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27277 return DAG.getNode(Opcode, dl, VTs, NewOps);
27278 }
27279
27280 case Intrinsic::x86_sse42_pcmpistrm128:
27281 case Intrinsic::x86_sse42_pcmpestrm128: {
27282 unsigned Opcode;
27283 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27284 Opcode = X86ISD::PCMPISTR;
27285 else
27286 Opcode = X86ISD::PCMPESTR;
27287
27288     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27289     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27290 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27291 }
27292
27293 case Intrinsic::eh_sjlj_lsda: {
27294 MachineFunction &MF = DAG.getMachineFunction();
27295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27296 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27297 auto &Context = MF.getContext();
27298 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27299 Twine(MF.getFunctionNumber()));
27300 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27301 DAG.getMCSymbol(S, PtrVT));
27302 }
27303
27304 case Intrinsic::x86_seh_lsda: {
27305 // Compute the symbol for the LSDA. We know it'll get emitted later.
27306 MachineFunction &MF = DAG.getMachineFunction();
27307 SDValue Op1 = Op.getOperand(1);
27308 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27311
27312 // Generate a simple absolute symbol reference. This intrinsic is only
27313 // supported on 32-bit Windows, which isn't PIC.
27314 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27315 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27316 }
27317
27318 case Intrinsic::eh_recoverfp: {
27319 SDValue FnOp = Op.getOperand(1);
27320 SDValue IncomingFPOp = Op.getOperand(2);
27321 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27322 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27323 if (!Fn)
27324       report_fatal_error(
27325           "llvm.eh.recoverfp must take a function as the first argument");
27326 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27327 }
27328
27329 case Intrinsic::localaddress: {
27330 // Returns one of the stack, base, or frame pointer registers, depending on
27331 // which is used to reference local variables.
27332 MachineFunction &MF = DAG.getMachineFunction();
27333 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27334 Register Reg;
27335 if (RegInfo->hasBasePointer(MF))
27336 Reg = RegInfo->getBaseRegister();
27337 else { // Handles the SP or FP case.
27338 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27339 if (CantUseFP)
27340 Reg = RegInfo->getPtrSizedStackRegister(MF);
27341 else
27342 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27343 }
27344 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27345 }
27346 case Intrinsic::x86_avx512_vp2intersect_q_512:
27347 case Intrinsic::x86_avx512_vp2intersect_q_256:
27348 case Intrinsic::x86_avx512_vp2intersect_q_128:
27349 case Intrinsic::x86_avx512_vp2intersect_d_512:
27350 case Intrinsic::x86_avx512_vp2intersect_d_256:
27351 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27352 SDLoc DL(Op);
27353 MVT MaskVT = Op.getSimpleValueType();
27354 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27355     SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27356                                     Op.getOperand(1), Op.getOperand(2));
27357 SDValue Result0 =
27358 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27359 SDValue Result1 =
27360 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27361 return DAG.getMergeValues({Result0, Result1}, DL);
27362 }
27363 case Intrinsic::x86_mmx_pslli_w:
27364 case Intrinsic::x86_mmx_pslli_d:
27365 case Intrinsic::x86_mmx_pslli_q:
27366 case Intrinsic::x86_mmx_psrli_w:
27367 case Intrinsic::x86_mmx_psrli_d:
27368 case Intrinsic::x86_mmx_psrli_q:
27369 case Intrinsic::x86_mmx_psrai_w:
27370 case Intrinsic::x86_mmx_psrai_d: {
27371 SDLoc DL(Op);
27372 SDValue ShAmt = Op.getOperand(2);
27373 // If the argument is a constant, convert it to a target constant.
27374 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27375       // Clamp out-of-bounds shift amounts, since they would otherwise be masked
27376       // to 8 bits, which could bring them back into bounds.
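      // For example, a requested shift of 260 would be masked to 260 & 255 == 4
      // (back in bounds); clamping to 255 keeps the amount out of bounds for
      // every MMX element width, so the shift still folds to the expected
      // all-zero (or all-sign-bit) result.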
27377 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27378 if (ShiftAmount == 0)
27379 return Op.getOperand(1);
27380
27381 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27382 Op.getOperand(0), Op.getOperand(1),
27383 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27384 }
27385
27386 unsigned NewIntrinsic;
27387 switch (IntNo) {
27388 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27389 case Intrinsic::x86_mmx_pslli_w:
27390 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27391 break;
27392 case Intrinsic::x86_mmx_pslli_d:
27393 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27394 break;
27395 case Intrinsic::x86_mmx_pslli_q:
27396 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27397 break;
27398 case Intrinsic::x86_mmx_psrli_w:
27399 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27400 break;
27401 case Intrinsic::x86_mmx_psrli_d:
27402 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27403 break;
27404 case Intrinsic::x86_mmx_psrli_q:
27405 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27406 break;
27407 case Intrinsic::x86_mmx_psrai_w:
27408 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27409 break;
27410 case Intrinsic::x86_mmx_psrai_d:
27411 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27412 break;
27413 }
27414
27415     // The vector shift intrinsics with scalar shift amounts use a 32-bit value,
27416     // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27417     // MMX register.
27418 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27419 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27420 DAG.getTargetConstant(NewIntrinsic, DL,
27421                                              getPointerTy(DAG.getDataLayout())),
27422                        Op.getOperand(1), ShAmt);
27423 }
27424 case Intrinsic::thread_pointer: {
27425 if (Subtarget.isTargetELF()) {
27426 SDLoc dl(Op);
27427 EVT PtrVT = Op.getValueType();
27428 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
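      // E.g. on x86-64 ELF targets this typically lowers to a single
      // 'movq %fs:0, %rax'; on 32-bit it becomes 'movl %gs:0, %eax'.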
27429       Value *Ptr = Constant::getNullValue(PointerType::get(
27430           *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27431 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27432 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27433 }
27434     report_fatal_error(
27435         "Target OS doesn't support __builtin_thread_pointer() yet.");
27436 }
27437 }
27438}
27439
27440 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27441                                  SDValue Src, SDValue Mask, SDValue Base,
27442 SDValue Index, SDValue ScaleOp, SDValue Chain,
27443 const X86Subtarget &Subtarget) {
27444 SDLoc dl(Op);
27445 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27446 // Scale must be constant.
27447 if (!C)
27448 return SDValue();
27449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27450 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27451 TLI.getPointerTy(DAG.getDataLayout()));
27452 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27453 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27454 // If source is undef or we know it won't be used, use a zero vector
27455 // to break register dependency.
27456 // TODO: use undef instead and let BreakFalseDeps deal with it?
27457 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27458 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27459
27460 // Cast mask to an integer type.
27461 Mask = DAG.getBitcast(MaskVT, Mask);
27462
27463   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27464
27465 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27466 SDValue Res =
27467       DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27468                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27469 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27470}
27471
27472 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27473                              SDValue Src, SDValue Mask, SDValue Base,
27474 SDValue Index, SDValue ScaleOp, SDValue Chain,
27475 const X86Subtarget &Subtarget) {
27476 MVT VT = Op.getSimpleValueType();
27477 SDLoc dl(Op);
27478 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27479 // Scale must be constant.
27480 if (!C)
27481 return SDValue();
27482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27483 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27484 TLI.getPointerTy(DAG.getDataLayout()));
27485 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27486                               Src.getSimpleValueType().getVectorNumElements());
27487   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27488
27489 // We support two versions of the gather intrinsics. One with scalar mask and
27490 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27491 if (Mask.getValueType() != MaskVT)
27492 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27493
27494 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27495 // If source is undef or we know it won't be used, use a zero vector
27496 // to break register dependency.
27497 // TODO: use undef instead and let BreakFalseDeps deal with it?
27498 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27499 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27500
27501   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27502
27503 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27504 SDValue Res =
27505       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27506                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27507 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27508}
27509
27510 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27511                               SDValue Src, SDValue Mask, SDValue Base,
27512 SDValue Index, SDValue ScaleOp, SDValue Chain,
27513 const X86Subtarget &Subtarget) {
27514 SDLoc dl(Op);
27515 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27516 // Scale must be constant.
27517 if (!C)
27518 return SDValue();
27519 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27520 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27521 TLI.getPointerTy(DAG.getDataLayout()));
27522 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27523 Src.getSimpleValueType().getVectorNumElements());
27524 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27525
27526 // We support two versions of the scatter intrinsics. One with scalar mask and
27527 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27528 if (Mask.getValueType() != MaskVT)
27529 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27530
27531   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27532
27533 SDVTList VTs = DAG.getVTList(MVT::Other);
27534 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27535 SDValue Res =
27536       DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27537                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27538 return Res;
27539}
27540
27541 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27542                                SDValue Mask, SDValue Base, SDValue Index,
27543 SDValue ScaleOp, SDValue Chain,
27544 const X86Subtarget &Subtarget) {
27545 SDLoc dl(Op);
27546 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27547 // Scale must be constant.
27548 if (!C)
27549 return SDValue();
27550 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27551 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27552 TLI.getPointerTy(DAG.getDataLayout()));
27553 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27554 SDValue Segment = DAG.getRegister(0, MVT::i32);
27555 MVT MaskVT =
27556 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27557 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27558 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27559 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27560 return SDValue(Res, 0);
27561}
27562
27563/// Handles the lowering of builtin intrinsics with chain that return their
27564/// value into registers EDX:EAX.
27565 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27566/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27567/// TargetOpcode.
27568/// Returns a Glue value which can be used to add extra copy-from-reg if the
27569/// expanded intrinsics implicitly defines extra registers (i.e. not just
27570/// EDX:EAX).
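/// For example, the RDTSC/RDTSCP lowering below passes SrcReg = 0, while the
/// RDPMC/RDPRU/XGETBV lowering passes X86::ECX so that the counter/register
/// index ends up in ECX before the instruction executes.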
27571 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27572                                            SelectionDAG &DAG,
27573 unsigned TargetOpcode,
27574 unsigned SrcReg,
27575 const X86Subtarget &Subtarget,
27576                                            SmallVectorImpl<SDValue> &Results) {
27577   SDValue Chain = N->getOperand(0);
27578 SDValue Glue;
27579
27580 if (SrcReg) {
27581 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27582 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27583 Glue = Chain.getValue(1);
27584 }
27585
27586 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27587 SDValue N1Ops[] = {Chain, Glue};
27588 SDNode *N1 = DAG.getMachineNode(
27589 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27590 Chain = SDValue(N1, 0);
27591
27592   // Read the two halves of the result from EDX:EAX (RDX:RAX in 64-bit mode).
27593 SDValue LO, HI;
27594 if (Subtarget.is64Bit()) {
27595 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27596 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27597 LO.getValue(2));
27598 } else {
27599 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27600 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27601 LO.getValue(2));
27602 }
27603 Chain = HI.getValue(1);
27604 Glue = HI.getValue(2);
27605
27606 if (Subtarget.is64Bit()) {
27607 // Merge the two 32-bit values into a 64-bit one.
27608 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27609 DAG.getConstant(32, DL, MVT::i8));
27610 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27611 Results.push_back(Chain);
27612 return Glue;
27613 }
27614
27615 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27616 SDValue Ops[] = { LO, HI };
27617 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27618 Results.push_back(Pair);
27619 Results.push_back(Chain);
27620 return Glue;
27621}
27622
27623/// Handles the lowering of builtin intrinsics that read the time stamp counter
27624/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27625/// READCYCLECOUNTER nodes.
27626static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27627 SelectionDAG &DAG,
27628 const X86Subtarget &Subtarget,
27629                                     SmallVectorImpl<SDValue> &Results) {
27630   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27631 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27632 // and the EAX register is loaded with the low-order 32 bits.
27633 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27634 /* NoRegister */0, Subtarget,
27635 Results);
27636 if (Opcode != X86::RDTSCP)
27637 return;
27638
27639 SDValue Chain = Results[1];
27640 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27641 // the ECX register. Add 'ecx' explicitly to the chain.
27642 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27643 Results[1] = ecx;
27644 Results.push_back(ecx.getValue(1));
27645}
27646
27647 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27648                                      SelectionDAG &DAG) {
27649   SmallVector<SDValue, 3> Results;
27650   SDLoc DL(Op);
27651 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27652 Results);
27653 return DAG.getMergeValues(Results, DL);
27654}
27655
27656 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27657   MachineFunction &MF = DAG.getMachineFunction();
27658   SDValue Chain = Op.getOperand(0);
27659 SDValue RegNode = Op.getOperand(2);
27660 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27661 if (!EHInfo)
27662 report_fatal_error("EH registrations only live in functions using WinEH");
27663
27664 // Cast the operand to an alloca, and remember the frame index.
27665 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27666 if (!FINode)
27667 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27668 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27669
27670 // Return the chain operand without making any DAG nodes.
27671 return Chain;
27672}
27673
27674 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27675   MachineFunction &MF = DAG.getMachineFunction();
27676   SDValue Chain = Op.getOperand(0);
27677 SDValue EHGuard = Op.getOperand(2);
27678 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27679 if (!EHInfo)
27680 report_fatal_error("EHGuard only live in functions using WinEH");
27681
27682 // Cast the operand to an alloca, and remember the frame index.
27683 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27684 if (!FINode)
27685 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27686 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27687
27688 // Return the chain operand without making any DAG nodes.
27689 return Chain;
27690}
27691
27692/// Emit Truncating Store with signed or unsigned saturation.
27693static SDValue
27694EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27695 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27696 SelectionDAG &DAG) {
27697 SDVTList VTs = DAG.getVTList(MVT::Other);
27698 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27699 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27700 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27701 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27702}
27703
27704/// Emit Masked Truncating Store with signed or unsigned saturation.
27705static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27706 const SDLoc &DL,
27707 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27708 MachineMemOperand *MMO, SelectionDAG &DAG) {
27709 SDVTList VTs = DAG.getVTList(MVT::Other);
27710 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27711 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27712 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27713}
27714
27715 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27716                                              const MachineFunction &MF) {
27717 if (!Subtarget.is64Bit())
27718 return false;
27719 // 64-bit targets support extended Swift async frame setup,
27720 // except for targets that use the windows 64 prologue.
27721 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27722}
27723
27724 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27725                                       SelectionDAG &DAG) {
27726 unsigned IntNo = Op.getConstantOperandVal(1);
27727 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27728 if (!IntrData) {
27729 switch (IntNo) {
27730
27731 case Intrinsic::swift_async_context_addr: {
27732 SDLoc dl(Op);
27733 auto &MF = DAG.getMachineFunction();
27734 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27735 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27736         MF.getFrameInfo().setFrameAddressIsTaken(true);
27737         X86FI->setHasSwiftAsyncContext(true);
27738 SDValue Chain = Op->getOperand(0);
27739 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27740 SDValue Result =
27741 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27742 DAG.getTargetConstant(8, dl, MVT::i32)),
27743 0);
27744 // Return { result, chain }.
27745 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27746 CopyRBP.getValue(1));
27747 } else {
27748 // No special extended frame, create or reuse an existing stack slot.
27749 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27750 if (!X86FI->getSwiftAsyncContextFrameIdx())
27751 X86FI->setSwiftAsyncContextFrameIdx(
27752 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27753 false));
27754 SDValue Result =
27755 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27756 PtrSize == 8 ? MVT::i64 : MVT::i32);
27757 // Return { result, chain }.
27758 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27759 Op->getOperand(0));
27760 }
27761 }
27762
27763 case llvm::Intrinsic::x86_seh_ehregnode:
27764 return MarkEHRegistrationNode(Op, DAG);
27765 case llvm::Intrinsic::x86_seh_ehguard:
27766 return MarkEHGuard(Op, DAG);
27767 case llvm::Intrinsic::x86_rdpkru: {
27768 SDLoc dl(Op);
27769 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27770 // Create a RDPKRU node and pass 0 to the ECX parameter.
27771 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27772 DAG.getConstant(0, dl, MVT::i32));
27773 }
27774 case llvm::Intrinsic::x86_wrpkru: {
27775 SDLoc dl(Op);
27776 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27777 // to the EDX and ECX parameters.
27778 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27779 Op.getOperand(0), Op.getOperand(2),
27780 DAG.getConstant(0, dl, MVT::i32),
27781 DAG.getConstant(0, dl, MVT::i32));
27782 }
27783 case llvm::Intrinsic::asan_check_memaccess: {
27784 // Mark this as adjustsStack because it will be lowered to a call.
27785       DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27786       // Don't do anything here, we will expand these intrinsics out later.
27787 return Op;
27788 }
27789 case llvm::Intrinsic::x86_flags_read_u32:
27790 case llvm::Intrinsic::x86_flags_read_u64:
27791 case llvm::Intrinsic::x86_flags_write_u32:
27792 case llvm::Intrinsic::x86_flags_write_u64: {
27793 // We need a frame pointer because this will get lowered to a PUSH/POP
27794 // sequence.
27795       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27796       MFI.setHasCopyImplyingStackAdjustment(true);
27797       // Don't do anything here, we will expand these intrinsics out later
27798 // during FinalizeISel in EmitInstrWithCustomInserter.
27799 return Op;
27800 }
27801 case Intrinsic::x86_lwpins32:
27802 case Intrinsic::x86_lwpins64:
27803 case Intrinsic::x86_umwait:
27804 case Intrinsic::x86_tpause: {
27805 SDLoc dl(Op);
27806 SDValue Chain = Op->getOperand(0);
27807 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27808 unsigned Opcode;
27809
27810 switch (IntNo) {
27811 default: llvm_unreachable("Impossible intrinsic");
27812 case Intrinsic::x86_umwait:
27813 Opcode = X86ISD::UMWAIT;
27814 break;
27815 case Intrinsic::x86_tpause:
27816 Opcode = X86ISD::TPAUSE;
27817 break;
27818 case Intrinsic::x86_lwpins32:
27819 case Intrinsic::x86_lwpins64:
27820 Opcode = X86ISD::LWPINS;
27821 break;
27822 }
27823
27824       SDValue Operation =
27825           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27826 Op->getOperand(3), Op->getOperand(4));
27827 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27828 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27829 Operation.getValue(1));
27830 }
27831 case Intrinsic::x86_enqcmd:
27832 case Intrinsic::x86_enqcmds: {
27833 SDLoc dl(Op);
27834 SDValue Chain = Op.getOperand(0);
27835 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27836 unsigned Opcode;
27837 switch (IntNo) {
27838 default: llvm_unreachable("Impossible intrinsic!");
27839 case Intrinsic::x86_enqcmd:
27840 Opcode = X86ISD::ENQCMD;
27841 break;
27842 case Intrinsic::x86_enqcmds:
27843 Opcode = X86ISD::ENQCMDS;
27844 break;
27845 }
27846 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27847 Op.getOperand(3));
27848 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27849 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27850 Operation.getValue(1));
27851 }
27852 case Intrinsic::x86_aesenc128kl:
27853 case Intrinsic::x86_aesdec128kl:
27854 case Intrinsic::x86_aesenc256kl:
27855 case Intrinsic::x86_aesdec256kl: {
27856 SDLoc DL(Op);
27857 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27858 SDValue Chain = Op.getOperand(0);
27859 unsigned Opcode;
27860
27861 switch (IntNo) {
27862 default: llvm_unreachable("Impossible intrinsic");
27863 case Intrinsic::x86_aesenc128kl:
27864 Opcode = X86ISD::AESENC128KL;
27865 break;
27866 case Intrinsic::x86_aesdec128kl:
27867 Opcode = X86ISD::AESDEC128KL;
27868 break;
27869 case Intrinsic::x86_aesenc256kl:
27870 Opcode = X86ISD::AESENC256KL;
27871 break;
27872 case Intrinsic::x86_aesdec256kl:
27873 Opcode = X86ISD::AESDEC256KL;
27874 break;
27875 }
27876
27877       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27878       MachineMemOperand *MMO = MemIntr->getMemOperand();
27879 EVT MemVT = MemIntr->getMemoryVT();
27880       SDValue Operation = DAG.getMemIntrinsicNode(
27881           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27882 MMO);
27883 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27884
27885 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27886 {ZF, Operation.getValue(0), Operation.getValue(2)});
27887 }
27888 case Intrinsic::x86_aesencwide128kl:
27889 case Intrinsic::x86_aesdecwide128kl:
27890 case Intrinsic::x86_aesencwide256kl:
27891 case Intrinsic::x86_aesdecwide256kl: {
27892 SDLoc DL(Op);
27893 SDVTList VTs = DAG.getVTList(
27894 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27895 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27896 SDValue Chain = Op.getOperand(0);
27897 unsigned Opcode;
27898
27899 switch (IntNo) {
27900 default: llvm_unreachable("Impossible intrinsic");
27901 case Intrinsic::x86_aesencwide128kl:
27902 Opcode = X86ISD::AESENCWIDE128KL;
27903 break;
27904 case Intrinsic::x86_aesdecwide128kl:
27905 Opcode = X86ISD::AESDECWIDE128KL;
27906 break;
27907 case Intrinsic::x86_aesencwide256kl:
27908 Opcode = X86ISD::AESENCWIDE256KL;
27909 break;
27910 case Intrinsic::x86_aesdecwide256kl:
27911 Opcode = X86ISD::AESDECWIDE256KL;
27912 break;
27913 }
27914
27915       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27916       MachineMemOperand *MMO = MemIntr->getMemOperand();
27917 EVT MemVT = MemIntr->getMemoryVT();
27918       SDValue Operation = DAG.getMemIntrinsicNode(
27919           Opcode, DL, VTs,
27920 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27921 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27922 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27923 MemVT, MMO);
27924 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27925
27926 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27927 {ZF, Operation.getValue(1), Operation.getValue(2),
27928 Operation.getValue(3), Operation.getValue(4),
27929 Operation.getValue(5), Operation.getValue(6),
27930 Operation.getValue(7), Operation.getValue(8),
27931 Operation.getValue(9)});
27932 }
27933 case Intrinsic::x86_testui: {
27934 SDLoc dl(Op);
27935 SDValue Chain = Op.getOperand(0);
27936 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27937 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27938 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27939 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27940 Operation.getValue(1));
27941 }
27942 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27943 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27944 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27945 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27946 case Intrinsic::x86_t2rpntlvwz0_internal:
27947 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27948 case Intrinsic::x86_t2rpntlvwz1_internal:
27949 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27950 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27951       X86MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
27952       unsigned IntNo = Op.getConstantOperandVal(1);
27953 unsigned Opc = 0;
27954 switch (IntNo) {
27955 default:
27956 llvm_unreachable("Unexpected intrinsic!");
27957 case Intrinsic::x86_t2rpntlvwz0_internal:
27958 Opc = X86::PT2RPNTLVWZ0V;
27959 break;
27960 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27961 Opc = X86::PT2RPNTLVWZ0T1V;
27962 break;
27963 case Intrinsic::x86_t2rpntlvwz1_internal:
27964 Opc = X86::PT2RPNTLVWZ1V;
27965 break;
27966 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27967 Opc = X86::PT2RPNTLVWZ1T1V;
27968 break;
27969 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27970 Opc = X86::PT2RPNTLVWZ0RSV;
27971 break;
27972 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27973 Opc = X86::PT2RPNTLVWZ0RST1V;
27974 break;
27975 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27976 Opc = X86::PT2RPNTLVWZ1RSV;
27977 break;
27978 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27979 Opc = X86::PT2RPNTLVWZ1RST1V;
27980 break;
27981 }
27982
27983 SDLoc DL(Op);
27984 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27985
27986 SDValue Ops[] = {Op.getOperand(2), // Row
27987 Op.getOperand(3), // Col0
27988 Op.getOperand(4), // Col1
27989 Op.getOperand(5), // Base
27990 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27991 Op.getOperand(6), // Index
27992 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27993 DAG.getRegister(0, MVT::i16), // Segment
27994 Op.getOperand(0)}; // Chain
27995
27996 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27997 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27998 SDValue(Res, 0));
27999 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
28000 SDValue(Res, 0));
28001 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28002 }
28003 case Intrinsic::x86_atomic_bts_rm:
28004 case Intrinsic::x86_atomic_btc_rm:
28005 case Intrinsic::x86_atomic_btr_rm: {
28006 SDLoc DL(Op);
28007 MVT VT = Op.getSimpleValueType();
28008 SDValue Chain = Op.getOperand(0);
28009 SDValue Op1 = Op.getOperand(2);
28010 SDValue Op2 = Op.getOperand(3);
28011 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28012 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28013                                                               : X86ISD::LBTR_RM;
28014       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28015 SDValue Res =
28016 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28017 {Chain, Op1, Op2}, VT, MMO);
28018 Chain = Res.getValue(1);
28019 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28020 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28021 }
28022 case Intrinsic::x86_atomic_bts:
28023 case Intrinsic::x86_atomic_btc:
28024 case Intrinsic::x86_atomic_btr: {
28025 SDLoc DL(Op);
28026 MVT VT = Op.getSimpleValueType();
28027 SDValue Chain = Op.getOperand(0);
28028 SDValue Op1 = Op.getOperand(2);
28029 SDValue Op2 = Op.getOperand(3);
28030 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28031 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28032 : X86ISD::LBTR;
28033 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28034 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28035 SDValue Res =
28036 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28037 {Chain, Op1, Op2, Size}, VT, MMO);
28038 Chain = Res.getValue(1);
28039 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28040 unsigned Imm = Op2->getAsZExtVal();
28041 if (Imm)
28042 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28043 DAG.getShiftAmountConstant(Imm, VT, DL));
28044 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28045 }
28046 case Intrinsic::x86_cmpccxadd32:
28047 case Intrinsic::x86_cmpccxadd64: {
28048 SDLoc DL(Op);
28049 SDValue Chain = Op.getOperand(0);
28050 SDValue Addr = Op.getOperand(2);
28051 SDValue Src1 = Op.getOperand(3);
28052 SDValue Src2 = Op.getOperand(4);
28053 SDValue CC = Op.getOperand(5);
28054 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28055       SDValue Operation = DAG.getMemIntrinsicNode(
28056           X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28057 MVT::i32, MMO);
28058 return Operation;
28059 }
28060 case Intrinsic::x86_aadd32:
28061 case Intrinsic::x86_aadd64:
28062 case Intrinsic::x86_aand32:
28063 case Intrinsic::x86_aand64:
28064 case Intrinsic::x86_aor32:
28065 case Intrinsic::x86_aor64:
28066 case Intrinsic::x86_axor32:
28067 case Intrinsic::x86_axor64: {
28068 SDLoc DL(Op);
28069 SDValue Chain = Op.getOperand(0);
28070 SDValue Op1 = Op.getOperand(2);
28071 SDValue Op2 = Op.getOperand(3);
28072 MVT VT = Op2.getSimpleValueType();
28073 unsigned Opc = 0;
28074 switch (IntNo) {
28075 default:
28076 llvm_unreachable("Unknown Intrinsic");
28077 case Intrinsic::x86_aadd32:
28078 case Intrinsic::x86_aadd64:
28079 Opc = X86ISD::AADD;
28080 break;
28081 case Intrinsic::x86_aand32:
28082 case Intrinsic::x86_aand64:
28083 Opc = X86ISD::AAND;
28084 break;
28085 case Intrinsic::x86_aor32:
28086 case Intrinsic::x86_aor64:
28087 Opc = X86ISD::AOR;
28088 break;
28089 case Intrinsic::x86_axor32:
28090 case Intrinsic::x86_axor64:
28091 Opc = X86ISD::AXOR;
28092 break;
28093 }
28094 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28095 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28096 {Chain, Op1, Op2}, VT, MMO);
28097 }
28098 case Intrinsic::x86_atomic_add_cc:
28099 case Intrinsic::x86_atomic_sub_cc:
28100 case Intrinsic::x86_atomic_or_cc:
28101 case Intrinsic::x86_atomic_and_cc:
28102 case Intrinsic::x86_atomic_xor_cc: {
28103 SDLoc DL(Op);
28104 SDValue Chain = Op.getOperand(0);
28105 SDValue Op1 = Op.getOperand(2);
28106 SDValue Op2 = Op.getOperand(3);
28107 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28108 MVT VT = Op2.getSimpleValueType();
28109 unsigned Opc = 0;
28110 switch (IntNo) {
28111 default:
28112 llvm_unreachable("Unknown Intrinsic");
28113 case Intrinsic::x86_atomic_add_cc:
28114 Opc = X86ISD::LADD;
28115 break;
28116 case Intrinsic::x86_atomic_sub_cc:
28117 Opc = X86ISD::LSUB;
28118 break;
28119 case Intrinsic::x86_atomic_or_cc:
28120 Opc = X86ISD::LOR;
28121 break;
28122 case Intrinsic::x86_atomic_and_cc:
28123 Opc = X86ISD::LAND;
28124 break;
28125 case Intrinsic::x86_atomic_xor_cc:
28126 Opc = X86ISD::LXOR;
28127 break;
28128 }
28129 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28130 SDValue LockArith =
28131 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28132 {Chain, Op1, Op2}, VT, MMO);
28133 Chain = LockArith.getValue(1);
28134 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28135 }
28136 }
28137 return SDValue();
28138 }
28139
28140 SDLoc dl(Op);
28141 switch(IntrData->Type) {
28142 default: llvm_unreachable("Unknown Intrinsic Type");
28143 case RDSEED:
28144 case RDRAND: {
28145 // Emit the node with the right value type.
28146 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28147 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28148
28149 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28150     // Otherwise return the value from the instruction, which is always 0, cast
28150     // to i32.
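    // In other words, isValid = CF ? 1 : zext(Result); the hardware zeroes the
    // destination register when CF is clear, so a failed RDRAND/RDSEED reports 0.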
28151 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28152 DAG.getConstant(1, dl, Op->getValueType(1)),
28153 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28154 SDValue(Result.getNode(), 1)};
28155 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28156
28157 // Return { result, isValid, chain }.
28158 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28159 SDValue(Result.getNode(), 2));
28160 }
28161 case GATHER_AVX2: {
28162 SDValue Chain = Op.getOperand(0);
28163 SDValue Src = Op.getOperand(2);
28164 SDValue Base = Op.getOperand(3);
28165 SDValue Index = Op.getOperand(4);
28166 SDValue Mask = Op.getOperand(5);
28167 SDValue Scale = Op.getOperand(6);
28168 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28169 Scale, Chain, Subtarget);
28170 }
28171 case GATHER: {
28172 //gather(v1, mask, index, base, scale);
28173 SDValue Chain = Op.getOperand(0);
28174 SDValue Src = Op.getOperand(2);
28175 SDValue Base = Op.getOperand(3);
28176 SDValue Index = Op.getOperand(4);
28177 SDValue Mask = Op.getOperand(5);
28178 SDValue Scale = Op.getOperand(6);
28179 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28180 Chain, Subtarget);
28181 }
28182 case SCATTER: {
28183 //scatter(base, mask, index, v1, scale);
28184 SDValue Chain = Op.getOperand(0);
28185 SDValue Base = Op.getOperand(2);
28186 SDValue Mask = Op.getOperand(3);
28187 SDValue Index = Op.getOperand(4);
28188 SDValue Src = Op.getOperand(5);
28189 SDValue Scale = Op.getOperand(6);
28190 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28191 Scale, Chain, Subtarget);
28192 }
28193 case PREFETCH: {
28194 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28195 assert((HintVal == 2 || HintVal == 3) &&
28196 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28197 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28198 SDValue Chain = Op.getOperand(0);
28199 SDValue Mask = Op.getOperand(2);
28200 SDValue Index = Op.getOperand(3);
28201 SDValue Base = Op.getOperand(4);
28202 SDValue Scale = Op.getOperand(5);
28203 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28204 Subtarget);
28205 }
28206 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28207 case RDTSC: {
28208     SmallVector<SDValue, 2> Results;
28209     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28210 Results);
28211 return DAG.getMergeValues(Results, dl);
28212 }
28213 // Read Performance Monitoring Counters.
28214 case RDPMC:
28215 // Read Processor Register.
28216 case RDPRU:
28217 // GetExtended Control Register.
28218 case XGETBV: {
28219     SmallVector<SDValue, 2> Results;
28220
28221 // RDPMC uses ECX to select the index of the performance counter to read.
28222 // RDPRU uses ECX to select the processor register to read.
28223 // XGETBV uses ECX to select the index of the XCR register to return.
28224 // The result is stored into registers EDX:EAX.
28225 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28226 Subtarget, Results);
28227 return DAG.getMergeValues(Results, dl);
28228 }
28229 // XTEST intrinsics.
28230 case XTEST: {
28231 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28232 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28233
28234 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28235 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28236 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28237 Ret, SDValue(InTrans.getNode(), 1));
28238 }
28239   case TRUNCATE_TO_MEM_VI8:
28240   case TRUNCATE_TO_MEM_VI16:
28241   case TRUNCATE_TO_MEM_VI32: {
28242 SDValue Mask = Op.getOperand(4);
28243 SDValue DataToTruncate = Op.getOperand(3);
28244 SDValue Addr = Op.getOperand(2);
28245 SDValue Chain = Op.getOperand(0);
28246
28247     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28248     assert(MemIntr && "Expected MemIntrinsicSDNode!");
28249
28250 EVT MemVT = MemIntr->getMemoryVT();
28251
28252 uint16_t TruncationOp = IntrData->Opc0;
28253 switch (TruncationOp) {
28254 case X86ISD::VTRUNC: {
28255 if (isAllOnesConstant(Mask)) // return just a truncate store
28256 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28257 MemIntr->getMemOperand());
28258
28259 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28260 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28261 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28262
28263 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28264 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28265 true /* truncating */);
28266 }
28267 case X86ISD::VTRUNCUS:
28268 case X86ISD::VTRUNCS: {
28269 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28270 if (isAllOnesConstant(Mask))
28271 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28272 MemIntr->getMemOperand(), DAG);
28273
28274 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28275 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28276
28277 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28278 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28279 }
28280 default:
28281 llvm_unreachable("Unsupported truncstore intrinsic");
28282 }
28283 }
28284 case INTR_TYPE_CAST_MMX:
28285 return SDValue(); // handled in combineINTRINSIC_*
28286 }
28287}
28288
28289SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28290 SelectionDAG &DAG) const {
28291 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28292 MFI.setReturnAddressIsTaken(true);
28293
28294 unsigned Depth = Op.getConstantOperandVal(0);
28295 SDLoc dl(Op);
28296 EVT PtrVT = Op.getValueType();
28297
28298 if (Depth > 0) {
28299 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28300 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28301 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28302 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28303 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28304 MachinePointerInfo());
28305 }
28306
28307 // Just load the return address.
28308 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28309 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28310 MachinePointerInfo());
28311}
28312
28313SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28314 SelectionDAG &DAG) const {
28315   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28316   return getReturnAddressFrameIndex(DAG);
28317}
28318
28319SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28320 MachineFunction &MF = DAG.getMachineFunction();
28321 MachineFrameInfo &MFI = MF.getFrameInfo();
28322 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28323 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28324 EVT VT = Op.getValueType();
28325
28326 MFI.setFrameAddressIsTaken(true);
28327
28328 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28329 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28330 // is not possible to crawl up the stack without looking at the unwind codes
28331 // simultaneously.
28332 int FrameAddrIndex = FuncInfo->getFAIndex();
28333 if (!FrameAddrIndex) {
28334 // Set up a frame object for the return address.
28335 unsigned SlotSize = RegInfo->getSlotSize();
28336 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28337 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28338 FuncInfo->setFAIndex(FrameAddrIndex);
28339 }
28340 return DAG.getFrameIndex(FrameAddrIndex, VT);
28341 }
28342
28343 Register FrameReg =
28344 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28345 SDLoc dl(Op); // FIXME probably not meaningful
28346 unsigned Depth = Op.getConstantOperandVal(0);
28347 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28348 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28349 "Invalid Frame Register!");
28350 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28351 while (Depth--)
28352 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28353 MachinePointerInfo());
28354 return FrameAddr;
28355}
28356
28357// FIXME? Maybe this could be a TableGen attribute on some registers and
28358// this table could be generated automatically from RegInfo.
28359 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28360                                               const MachineFunction &MF) const {
28361 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28362
28363   Register Reg = StringSwitch<unsigned>(RegName)
28364                        .Case("esp", X86::ESP)
28365 .Case("rsp", X86::RSP)
28366 .Case("ebp", X86::EBP)
28367 .Case("rbp", X86::RBP)
28368 .Case("r14", X86::R14)
28369 .Case("r15", X86::R15)
28370 .Default(0);
28371
28372 if (Reg == X86::EBP || Reg == X86::RBP) {
28373 if (!TFI.hasFP(MF))
28374 report_fatal_error("register " + StringRef(RegName) +
28375 " is allocatable: function has no frame pointer");
28376#ifndef NDEBUG
28377 else {
28378 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28379 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28380 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28381 "Invalid Frame Register!");
28382 }
28383#endif
28384 }
28385
28386 return Reg;
28387}
28388
28389SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28390 SelectionDAG &DAG) const {
28391 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28392 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28393}
28394
28395 Register X86TargetLowering::getExceptionPointerRegister(
28396     const Constant *PersonalityFn) const {
28397 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28398 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28399
28400 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28401}
28402
28403 Register X86TargetLowering::getExceptionSelectorRegister(
28404     const Constant *PersonalityFn) const {
28405 // Funclet personalities don't use selectors (the runtime does the selection).
28406   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28407     return X86::NoRegister;
28408 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28409}
28410
28411 bool X86TargetLowering::needsFixedCatchObjects() const {
28412   return Subtarget.isTargetWin64();
28413}
28414
28415SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28416 SDValue Chain = Op.getOperand(0);
28417 SDValue Offset = Op.getOperand(1);
28418 SDValue Handler = Op.getOperand(2);
28419 SDLoc dl (Op);
28420
28421 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28422 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28423 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28424 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28425 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28426 "Invalid Frame Register!");
28427 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28428 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28429
28430 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28431 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28432 dl));
28433 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28434 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28435 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28436
28437 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28438 DAG.getRegister(StoreAddrReg, PtrVT));
28439}
28440
28441SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28442 SelectionDAG &DAG) const {
28443 SDLoc DL(Op);
28444 // If the subtarget is not 64bit, we may need the global base reg
28445 // after isel expand pseudo, i.e., after CGBR pass ran.
28446 // Therefore, ask for the GlobalBaseReg now, so that the pass
28447 // inserts the code for us in case we need it.
28448 // Otherwise, we will end up in a situation where we will
28449 // reference a virtual register that is not defined!
28450 if (!Subtarget.is64Bit()) {
28451 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28452 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28453 }
28454 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28455 DAG.getVTList(MVT::i32, MVT::Other),
28456 Op.getOperand(0), Op.getOperand(1));
28457}
28458
28459SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28460 SelectionDAG &DAG) const {
28461 SDLoc DL(Op);
28462 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28463 Op.getOperand(0), Op.getOperand(1));
28464}
28465
28466SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28467 SelectionDAG &DAG) const {
28468 SDLoc DL(Op);
28469 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28470 Op.getOperand(0));
28471}
28472
28473 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28474   return Op.getOperand(0);
28475}
28476
28477SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28478 SelectionDAG &DAG) const {
28479 SDValue Root = Op.getOperand(0);
28480 SDValue Trmp = Op.getOperand(1); // trampoline
28481 SDValue FPtr = Op.getOperand(2); // nested function
28482 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28483 SDLoc dl (Op);
28484
28485 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28486 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28487
28488 if (Subtarget.is64Bit()) {
28489 SDValue OutChains[6];
28490
28491 // Large code-model.
28492 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28493 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28494
28495 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28496 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28497
28498 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28499
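    // The 23-byte trampoline written below is, roughly:
    //   0:  49 bb <imm64>   movabsq $<nested function>, %r11
    //   10: 49 ba <imm64>   movabsq $<nest value>, %r10
    //   20: 49 ff e3        jmpq    *%r11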
28500 // Load the pointer to the nested function into R11.
28501 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28502 SDValue Addr = Trmp;
28503 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28504 Addr, MachinePointerInfo(TrmpAddr));
28505
28506 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28507 DAG.getConstant(2, dl, MVT::i64));
28508 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28509 MachinePointerInfo(TrmpAddr, 2), Align(2));
28510
28511 // Load the 'nest' parameter value into R10.
28512 // R10 is specified in X86CallingConv.td
28513 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28514 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28515 DAG.getConstant(10, dl, MVT::i64));
28516 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28517 Addr, MachinePointerInfo(TrmpAddr, 10));
28518
28519 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28520 DAG.getConstant(12, dl, MVT::i64));
28521 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28522 MachinePointerInfo(TrmpAddr, 12), Align(2));
28523
28524 // Jump to the nested function.
28525 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28526 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28527 DAG.getConstant(20, dl, MVT::i64));
28528 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28529 Addr, MachinePointerInfo(TrmpAddr, 20));
28530
28531 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28532 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28533 DAG.getConstant(22, dl, MVT::i64));
28534 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28535 Addr, MachinePointerInfo(TrmpAddr, 22));
28536
28537 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28538 } else {
28539 const Function *Func =
28540 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28541 CallingConv::ID CC = Func->getCallingConv();
28542 unsigned NestReg;
28543
28544 switch (CC) {
28545 default:
28546 llvm_unreachable("Unsupported calling convention");
28547 case CallingConv::C:
28548     case CallingConv::X86_StdCall: {
28549       // Pass 'nest' parameter in ECX.
28550 // Must be kept in sync with X86CallingConv.td
28551 NestReg = X86::ECX;
28552
28553 // Check that ECX wasn't needed by an 'inreg' parameter.
28554 FunctionType *FTy = Func->getFunctionType();
28555 const AttributeList &Attrs = Func->getAttributes();
28556
28557 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28558 unsigned InRegCount = 0;
28559 unsigned Idx = 0;
28560
28561 for (FunctionType::param_iterator I = FTy->param_begin(),
28562 E = FTy->param_end(); I != E; ++I, ++Idx)
28563 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28564 const DataLayout &DL = DAG.getDataLayout();
28565 // FIXME: should only count parameters that are lowered to integers.
28566 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28567 }
28568
28569 if (InRegCount > 2) {
28570 report_fatal_error("Nest register in use - reduce number of inreg"
28571 " parameters!");
28572 }
28573 }
28574 break;
28575 }
28576     case CallingConv::X86_FastCall:
28577     case CallingConv::X86_ThisCall:
28578     case CallingConv::Fast:
28579 case CallingConv::Tail:
28580     case CallingConv::SwiftTail:
28581       // Pass 'nest' parameter in EAX.
28582 // Must be kept in sync with X86CallingConv.td
28583 NestReg = X86::EAX;
28584 break;
28585 }
28586
28587 SDValue OutChains[4];
28588 SDValue Addr, Disp;
28589
28590 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28591 DAG.getConstant(10, dl, MVT::i32));
28592 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28593
28594 // This is storing the opcode for MOV32ri.
28595 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28596 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28597 OutChains[0] =
28598 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28599 Trmp, MachinePointerInfo(TrmpAddr));
28600
28601 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28602 DAG.getConstant(1, dl, MVT::i32));
28603 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28604 MachinePointerInfo(TrmpAddr, 1), Align(1));
28605
28606 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28607 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28608 DAG.getConstant(5, dl, MVT::i32));
28609 OutChains[2] =
28610 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28611 MachinePointerInfo(TrmpAddr, 5), Align(1));
28612
28613 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28614 DAG.getConstant(6, dl, MVT::i32));
28615 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28616 MachinePointerInfo(TrmpAddr, 6), Align(1));
28617
28618 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28619 }
28620}
28621
28622SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28623 SelectionDAG &DAG) const {
28624 /*
28625   The rounding mode is in bits 11:10 of the FP control word (FPCW), with the following
28626 settings:
28627 00 Round to nearest
28628 01 Round to -inf
28629 10 Round to +inf
28630 11 Round to 0
28631
28632 GET_ROUNDING, on the other hand, expects the following:
28633 -1 Undefined
28634 0 Round to 0
28635 1 Round to nearest
28636 2 Round to +inf
28637 3 Round to -inf
28638
28639 To perform the conversion, we use a packed lookup table of the four 2-bit
28640   values that we can index by FPCW[11:10]
28641   0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28642
28643   (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
28644 */
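  // Worked example: with the FPCW rounding field set to "round to +inf"
  // (bits 11:10 == 10b), FPCW & 0xc00 == 0x800, shifting right by 9 gives 4,
  // and (0x2d >> 4) & 3 == 2, which is GET_ROUNDING's "Round to +inf".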
28645
28646 MachineFunction &MF = DAG.getMachineFunction();
28647 MVT VT = Op.getSimpleValueType();
28648 SDLoc DL(Op);
28649
28650 // Save FP Control Word to stack slot
28651 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28652 SDValue StackSlot =
28653 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28654
28655 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28656
28657 SDValue Chain = Op.getOperand(0);
28658 SDValue Ops[] = {Chain, StackSlot};
28660 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28662
28663 // Load FP Control Word from stack slot
28664 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28665 Chain = CWD.getValue(1);
28666
28667 // Mask and turn the control bits into a shift for the lookup table.
28668 SDValue Shift =
28669 DAG.getNode(ISD::SRL, DL, MVT::i16,
28670 DAG.getNode(ISD::AND, DL, MVT::i16,
28671 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28672 DAG.getConstant(9, DL, MVT::i8));
28673 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28674
28675 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28676 SDValue RetVal =
28677 DAG.getNode(ISD::AND, DL, MVT::i32,
28678 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28679 DAG.getConstant(3, DL, MVT::i32));
28680
28681 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28682
28683 return DAG.getMergeValues({RetVal, Chain}, DL);
28684}
28685
28686SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28687 SelectionDAG &DAG) const {
28688 MachineFunction &MF = DAG.getMachineFunction();
28689 SDLoc DL(Op);
28690 SDValue Chain = Op.getNode()->getOperand(0);
28691
28692 // The FP control word may be set only from memory, so we need to allocate
28693 // stack space to save/load the FP control word.
28694 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28695 SDValue StackSlot =
28696 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28697 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28698 MachineMemOperand *MMO =
28700
28701 // Store FP control word into memory.
28702 SDValue Ops[] = {Chain, StackSlot};
28703 Chain = DAG.getMemIntrinsicNode(
28704 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28705
28706 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28707 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28708 Chain = CWD.getValue(1);
28709 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28710 DAG.getConstant(0xf3ff, DL, MVT::i16));
28711
28712 // Calculate new rounding mode.
28713 SDValue NewRM = Op.getNode()->getOperand(1);
28714 SDValue RMBits;
28715 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28716 uint64_t RM = CVal->getZExtValue();
28717 int FieldVal = X86::getRoundingModeX86(RM);
28718
28719 if (FieldVal == X86::rmInvalid) {
28720 FieldVal = X86::rmToNearest;
28721 LLVMContext &C = MF.getFunction().getContext();
28722 C.diagnose(DiagnosticInfoUnsupported(
28723 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28724 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28725 }
28726 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28727 } else {
28728 // Need to convert argument into bits of control word:
28729 // 0 Round to 0 -> 11
28730 // 1 Round to nearest -> 00
28731 // 2 Round to +inf -> 10
28732 // 3 Round to -inf -> 01
28733 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28734 // To make the conversion, put all these values into a value 0xc9 and shift
28735 // it left depending on the rounding mode:
28736 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28737 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28738 // ...
28739 // (0xc9 << (2 * NewRM + 4)) & 0xc00
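// Worked example (illustrative): for NewRM == 2 ("round to +inf") the shift
// amount is 2*2 + 4 == 8, so (0xc9 << 8) & 0xc00 == 0x800, i.e. x87 RC == 10
// (round up); for NewRM == 0 ("round to 0") the shift is 4 and
// (0xc9 << 4) & 0xc00 == 0xc00, i.e. RC == 11 (round toward zero).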
28740 SDValue ShiftValue =
28741 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28742 DAG.getNode(ISD::ADD, DL, MVT::i32,
28743 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28744 DAG.getConstant(1, DL, MVT::i8)),
28745 DAG.getConstant(4, DL, MVT::i32)));
28746 SDValue Shifted =
28747 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28748 ShiftValue);
28749 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28750 DAG.getConstant(0xc00, DL, MVT::i16));
28751 }
28752
28753 // Update rounding mode bits and store the new FP Control Word into stack.
28754 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28755 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28756
28757 // Load FP control word from the slot.
28758 SDValue OpsLD[] = {Chain, StackSlot};
28759 MachineMemOperand *MMOL =
28761 Chain = DAG.getMemIntrinsicNode(
28762 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28763
28764 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28765 // same way but in bits 14:13.
28766 if (Subtarget.hasSSE1()) {
28767 // Store MXCSR into memory.
28768 Chain = DAG.getNode(
28769 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28770 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28771 StackSlot);
28772
28773 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28774 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28775 Chain = CWD.getValue(1);
28776 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28777 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28778
28779 // Shift X87 RM bits from 11:10 to 14:13.
28780 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28781 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28782 DAG.getConstant(3, DL, MVT::i8));
28783
28784 // Update rounding mode bits and store the new MXCSR value into the stack slot.
28785 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28786 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28787
28788 // Load MXCSR from the slot.
28789 Chain = DAG.getNode(
28790 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28791 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28792 StackSlot);
28793 }
28794
28795 return Chain;
28796}
28797
28798const unsigned X87StateSize = 28;
28799const unsigned FPStateSize = 32;
28800[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
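// For reference (descriptive, not from the original comments): 28 bytes is
// the size of the x87 FNSTENV/FLDENV environment image in 32-bit protected
// mode; the helpers below place MXCSR right after it (at offset
// X87StateSize), so the whole FP environment fits in FPStateSize (32) bytes.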
28801
28802SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28803 SelectionDAG &DAG) const {
28805 SDLoc DL(Op);
28806 SDValue Chain = Op->getOperand(0);
28807 SDValue Ptr = Op->getOperand(1);
28809 EVT MemVT = Node->getMemoryVT();
28811 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28812
28813 // Get the x87 state, if it is present.
28814 if (Subtarget.hasX87()) {
28815 Chain =
28816 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28817 {Chain, Ptr}, MemVT, MMO);
28818
28819 // FNSTENV changes the exception mask, so load back the stored environment.
28820 MachineMemOperand::Flags NewFlags =
28823 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28824 Chain =
28825 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28826 {Chain, Ptr}, MemVT, MMO);
28827 }
28828
28829 // If target supports SSE, get MXCSR as well.
28830 if (Subtarget.hasSSE1()) {
28831 // Get pointer to the MXCSR location in memory.
28833 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28834 DAG.getConstant(X87StateSize, DL, PtrVT));
28835 // Store MXCSR into memory.
28836 Chain = DAG.getNode(
28837 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28838 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28839 MXCSRAddr);
28840 }
28841
28842 return Chain;
28843}
28844
28846 EVT MemVT, MachineMemOperand *MMO,
28847 SelectionDAG &DAG,
28848 const X86Subtarget &Subtarget) {
28849 // Set the x87 state, if it is present.
28850 if (Subtarget.hasX87())
28851 Chain =
28852 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28853 {Chain, Ptr}, MemVT, MMO);
28854 // If target supports SSE, set MXCSR as well.
28855 if (Subtarget.hasSSE1()) {
28856 // Get pointer to the MXCSR location in memory.
28858 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28859 DAG.getConstant(X87StateSize, DL, PtrVT));
28860 // Load MXCSR from memory.
28861 Chain = DAG.getNode(
28862 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28863 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28864 MXCSRAddr);
28865 }
28866 return Chain;
28867}
28868
28869SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28870 SelectionDAG &DAG) const {
28871 SDLoc DL(Op);
28872 SDValue Chain = Op->getOperand(0);
28873 SDValue Ptr = Op->getOperand(1);
28875 EVT MemVT = Node->getMemoryVT();
28877 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28878 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28879}
28880
28881SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28882 SelectionDAG &DAG) const {
28883 MachineFunction &MF = DAG.getMachineFunction();
28884 SDLoc DL(Op);
28885 SDValue Chain = Op.getNode()->getOperand(0);
28886
28887 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28888 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28890
28891 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28892 // nearest. FPU precision is set to 53 bits on Windows and to 64 bits otherwise
28893 // for compatibility with glibc.
28894 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28895 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28896 Constant *Zero = ConstantInt::get(ItemTy, 0);
28897 for (unsigned I = 0; I < 6; ++I)
28898 FPEnvVals.push_back(Zero);
28899
28900 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28901 // all exception flags, and sets DAZ and FTZ to 0.
28902 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28903 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28904 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28905 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28906 MachinePointerInfo MPI =
28908 MachineMemOperand *MMO = MF.getMachineMemOperand(
28910
28911 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28912}
28913
28914// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28915uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28916 assert((Amt < 8) && "Shift/Rotation amount out of range");
28917 switch (Opcode) {
28918 case ISD::BITREVERSE:
28919 return 0x8040201008040201ULL;
28920 case ISD::SHL:
28921 return ((0x0102040810204080ULL >> (Amt)) &
28922 (0x0101010101010101ULL * (0xFF >> (Amt))));
28923 case ISD::SRL:
28924 return ((0x0102040810204080ULL << (Amt)) &
28925 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28926 case ISD::SRA:
28927 return (getGFNICtrlImm(ISD::SRL, Amt) |
28928 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28929 case ISD::ROTL:
28930 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28931 case ISD::ROTR:
28932 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28933 }
28934 llvm_unreachable("Unsupported GFNI opcode");
28935}
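// Worked evaluation of getGFNICtrlImm (illustrative): for ISD::SHL with
// Amt == 1 the formula gives
//   (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F)
//     == 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F == 0x0001020408102040,
// the gf2p8affine matrix that shifts every byte left by one bit.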
28936
28937// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28938SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28939 MVT VT, unsigned Amt = 0) {
28940 assert(VT.getVectorElementType() == MVT::i8 &&
28941 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28942 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28943 SmallVector<SDValue> MaskBits;
28944 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28945 uint64_t Bits = (Imm >> (I % 64)) & 255;
28946 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28947 }
28948 return DAG.getBuildVector(VT, DL, MaskBits);
28949}
28950
28951/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28952//
28953// i8/i16 vector implemented using dword LZCNT vector instruction
28954// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28955// split the vector, perform operation on it's Lo a Hi part and
28956// concatenate the results.
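// Worked example (illustrative): for an i8 element x == 0x10, zext32(x) ==
// 0x00000010, vplzcntd yields 27, and subtracting the 32 - 8 == 24 extra
// leading zeros introduced by the extension gives ctlz8(0x10) == 3.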
28958 const X86Subtarget &Subtarget) {
28959 assert(Op.getOpcode() == ISD::CTLZ);
28960 SDLoc dl(Op);
28961 MVT VT = Op.getSimpleValueType();
28962 MVT EltVT = VT.getVectorElementType();
28963 unsigned NumElems = VT.getVectorNumElements();
28964
28965 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28966 "Unsupported element type");
28967
28968 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28969 if (NumElems > 16 ||
28970 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28971 return splitVectorIntUnary(Op, DAG, dl);
28972
28973 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28974 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28975 "Unsupported value type for operation");
28976
28977 // Use the natively supported vector instruction vplzcntd.
28978 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28979 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28980 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28981 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28982
28983 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28984}
28985
28986// Lower CTLZ using a PSHUFB lookup table implementation.
28988 const X86Subtarget &Subtarget,
28989 SelectionDAG &DAG) {
28990 MVT VT = Op.getSimpleValueType();
28991 int NumElts = VT.getVectorNumElements();
28992 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28993 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28994
28995 // Per-nibble leading zero PSHUFB lookup table.
28996 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28997 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28998 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28999 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29000
29002 for (int i = 0; i < NumBytes; ++i)
29003 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29004 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29005
29006 // Begin by bitcasting the input to a byte vector, then split those bytes
29007 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29008 // If the hi input nibble is zero then we add both results together, otherwise
29009 // we just take the hi result (by masking the lo result to zero before the
29010 // add).
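// Worked example (illustrative): for the byte 0x05 the hi nibble is 0, so HiZ
// is all-ones and the result is LUT[0] + LUT[5] == 4 + 1 == 5 == ctlz8(0x05);
// for the byte 0x1C the hi nibble is 1 (non-zero), the lo result is masked to
// zero, and the result is LUT[1] == 3 == ctlz8(0x1C).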
29011 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29012 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29013
29014 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29015 SDValue Lo = Op0;
29016 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29017 SDValue HiZ;
29018 if (CurrVT.is512BitVector()) {
29019 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29020 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29021 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29022 } else {
29023 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29024 }
29025
29026 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29027 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29028 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29029 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29030
29031 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29032 // of the current vector width in the same way we did for the nibbles.
29033 // If the upper half of the input element is zero then add the halves'
29034 // leading zero counts together, otherwise just use the upper half's.
29035 // Double the width of the result until we are at target width.
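// Worked example (illustrative): when widening two i8 counts into one i16
// count, the element 0x0013 has a zero upper byte, so the result is 8 (the
// upper half's count) + ctlz8(0x13) == 8 + 3 == 11 == ctlz16(0x0013).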
29036 while (CurrVT != VT) {
29037 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29038 int CurrNumElts = CurrVT.getVectorNumElements();
29039 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29040 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29041 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29042
29043 // Check if the upper half of the input element is zero.
29044 if (CurrVT.is512BitVector()) {
29045 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29046 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29047 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29048 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29049 } else {
29050 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29051 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29052 }
29053 HiZ = DAG.getBitcast(NextVT, HiZ);
29054
29055 // Move the upper/lower halves to the lower bits as we'll be extending to
29056 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29057 // together.
29058 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29059 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29060 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29061 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29062 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29063 CurrVT = NextVT;
29064 }
29065
29066 return Res;
29067}
29068
29070 const X86Subtarget &Subtarget,
29071 SelectionDAG &DAG) {
29072 MVT VT = Op.getSimpleValueType();
29073
29074 if (Subtarget.hasCDI() &&
29075 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29076 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29077 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29078
29079 // Decompose 256-bit ops into smaller 128-bit ops.
29080 if (VT.is256BitVector() && !Subtarget.hasInt256())
29081 return splitVectorIntUnary(Op, DAG, DL);
29082
29083 // Decompose 512-bit ops into smaller 256-bit ops.
29084 if (VT.is512BitVector() && !Subtarget.hasBWI())
29085 return splitVectorIntUnary(Op, DAG, DL);
29086
29087 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29088 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29089}
29090
29092 SelectionDAG &DAG,
29093 const X86Subtarget &Subtarget) {
29094 MVT VT = Op.getSimpleValueType();
29095 SDValue Input = Op.getOperand(0);
29096
29097 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29098 "Expected vXi8 input for GFNI-based CTLZ lowering");
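// High-level sketch of the approach below: BITREVERSE turns the leading-zero
// count into a trailing-zero count, "x & -x" isolates the lowest set bit, and
// the GF2P8AFFINEQB with the constant matrix plus an imm8 of 8 then, in
// effect, encodes that one-hot byte as its bit index (with an all-zero byte,
// i.e. an original input of 0, mapping to 8).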
29099
29100 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29101
29102 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29103 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29104
29105 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29106 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29107 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29108
29109 SDValue LZCNT =
29110 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29111 DAG.getTargetConstant(8, DL, MVT::i8));
29112 return LZCNT;
29113}
29114
29115static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29116 SelectionDAG &DAG) {
29117 MVT VT = Op.getSimpleValueType();
29118 MVT OpVT = VT;
29119 unsigned NumBits = VT.getSizeInBits();
29120 SDLoc dl(Op);
29121 unsigned Opc = Op.getOpcode();
29122
29123 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29124 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29125
29126 if (VT.isVector())
29127 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29128
29129 Op = Op.getOperand(0);
29130 if (VT == MVT::i8) {
29131 // Zero extend to i32 since there is no i8 bsr.
29132 OpVT = MVT::i32;
29133 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29134 }
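// The scalar path below relies on BSR returning the index of the highest set
// bit, so XOR-ing that index with NumBits-1 gives the leading-zero count.
// Worked example (illustrative): for i32 x == 0x00010000, BSR yields 16 and
// 16 ^ 31 == 15 == ctlz32(x).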
29135
29136 // Check if we can safely pass a result through BSR for zero sources.
29137 SDValue PassThru = DAG.getUNDEF(OpVT);
29138 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29139 !DAG.isKnownNeverZero(Op))
29140 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29141
29142 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29143 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29144 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29145
29146 // Skip CMOV if we're using a pass through value.
29147 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29148 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29149 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29150 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29151 Op.getValue(1)};
29152 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29153 }
29154
29155 // Finally xor with NumBits-1.
29156 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29157 DAG.getConstant(NumBits - 1, dl, OpVT));
29158
29159 if (VT == MVT::i8)
29160 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29161 return Op;
29162}
29163
29164static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29165 SelectionDAG &DAG) {
29166 MVT VT = Op.getSimpleValueType();
29167 unsigned NumBits = VT.getScalarSizeInBits();
29168 SDValue N0 = Op.getOperand(0);
29169 SDLoc dl(Op);
29170 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29171
29172 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29173 "Only scalar CTTZ requires custom lowering");
29174
29175 // Check if we can safely pass a result through BSF for zero sources.
29176 SDValue PassThru = DAG.getUNDEF(VT);
29177 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29178 PassThru = DAG.getConstant(NumBits, dl, VT);
29179
29180 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29181 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29182 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29183
29184 // Skip CMOV if src is never zero or we're using a pass through value.
29185 if (NonZeroSrc || !PassThru.isUndef())
29186 return Op;
29187
29188 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29189 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29190 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29191 Op.getValue(1)};
29192 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29193}
29194
29196 const X86Subtarget &Subtarget) {
29197 MVT VT = Op.getSimpleValueType();
29198 SDLoc DL(Op);
29199
29200 if (VT == MVT::i16 || VT == MVT::i32)
29201 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29202
29203 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29204 return splitVectorIntBinary(Op, DAG, DL);
29205
29206 assert(Op.getSimpleValueType().is256BitVector() &&
29207 Op.getSimpleValueType().isInteger() &&
29208 "Only handle AVX 256-bit vector integer operation");
29209 return splitVectorIntBinary(Op, DAG, DL);
29210}
29211
29213 const X86Subtarget &Subtarget) {
29214 MVT VT = Op.getSimpleValueType();
29215 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29216 unsigned Opcode = Op.getOpcode();
29217 SDLoc DL(Op);
29218
29219 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29220 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29221 assert(Op.getSimpleValueType().isInteger() &&
29222 "Only handle AVX vector integer operation");
29223 return splitVectorIntBinary(Op, DAG, DL);
29224 }
29225
29226 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29227 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29228 EVT SetCCResultType =
29229 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29230
29231 unsigned BitWidth = VT.getScalarSizeInBits();
29232 if (Opcode == ISD::USUBSAT) {
29233 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29234 // Handle a special-case with a bit-hack instead of cmp+select:
29235 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29236 // If the target can use VPTERNLOG, DAGToDAG will match this as
29237 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29238 // "broadcast" constant load.
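// Worked example (illustrative) with i16 elements, where SMIN == 0x8000:
//   X == 0xC000: (X ^ 0x8000) & (X s>> 15) == 0x4000 & 0xFFFF == 0x4000
//                == usubsat(0xC000, 0x8000)
//   X == 0x3000: (X ^ 0x8000) & (X s>> 15) == 0xB000 & 0x0000 == 0
//                == usubsat(0x3000, 0x8000)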
29240 if (C && C->getAPIntValue().isSignMask()) {
29241 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29242 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29243 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29244 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29245 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29246 }
29247 }
29248 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29249 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29250 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29251 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29252 // TODO: Move this to DAGCombiner?
29253 if (SetCCResultType == VT &&
29254 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29255 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29256 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29257 }
29258 }
29259
29260 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29261 (!VT.isVector() || VT == MVT::v2i64)) {
29264 SDValue Zero = DAG.getConstant(0, DL, VT);
29265 SDValue Result =
29266 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29267 DAG.getVTList(VT, SetCCResultType), X, Y);
29268 SDValue SumDiff = Result.getValue(0);
29269 SDValue Overflow = Result.getValue(1);
29270 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29271 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29272 SDValue SumNeg =
29273 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29274 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29275 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29276 }
29277
29278 // Use default expansion.
29279 return SDValue();
29280}
29281
29282static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29283 SelectionDAG &DAG) {
29284 MVT VT = Op.getSimpleValueType();
29285 SDLoc DL(Op);
29286
29287 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29288 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29289 // 8-bit integer abs to NEG and CMOV.
29290 SDValue N0 = Op.getOperand(0);
29291 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29292 DAG.getConstant(0, DL, VT), N0);
29293 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29294 SDValue(Neg.getNode(), 1)};
29295 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29296 }
29297
29298 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29299 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29300 SDValue Src = Op.getOperand(0);
29301 SDValue Neg = DAG.getNegative(Src, DL, VT);
29302 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29303 }
29304
29305 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29306 assert(VT.isInteger() &&
29307 "Only handle AVX 256-bit vector integer operation");
29308 return splitVectorIntUnary(Op, DAG, DL);
29309 }
29310
29311 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29312 return splitVectorIntUnary(Op, DAG, DL);
29313
29314 // Default to expand.
29315 return SDValue();
29316}
29317
29318static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29319 SelectionDAG &DAG) {
29320 MVT VT = Op.getSimpleValueType();
29321 SDLoc DL(Op);
29322
29323 // For AVX1 cases, split to use legal ops.
29324 if (VT.is256BitVector() && !Subtarget.hasInt256())
29325 return splitVectorIntBinary(Op, DAG, DL);
29326
29327 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29328 return splitVectorIntBinary(Op, DAG, DL);
29329
29330 // Default to expand.
29331 return SDValue();
29332}
29333
29334static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29335 SelectionDAG &DAG) {
29336 MVT VT = Op.getSimpleValueType();
29337 SDLoc DL(Op);
29338
29339 // For AVX1 cases, split to use legal ops.
29340 if (VT.is256BitVector() && !Subtarget.hasInt256())
29341 return splitVectorIntBinary(Op, DAG, DL);
29342
29343 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29344 return splitVectorIntBinary(Op, DAG, DL);
29345
29346 // Default to expand.
29347 return SDValue();
29348}
29349
29351 SelectionDAG &DAG) {
29352 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29353 EVT VT = Op.getValueType();
29354 SDValue X = Op.getOperand(0);
29355 SDValue Y = Op.getOperand(1);
29356 SDLoc DL(Op);
29357 bool IsMaxOp =
29358 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29359 bool IsNum =
29360 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29361 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29362 unsigned Opc = 0;
29363 if (VT.isVector())
29365 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29367
29368 if (Opc) {
29369 SDValue Imm =
29370 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29371 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29372 }
29373 }
29374
29375 uint64_t SizeInBits = VT.getScalarSizeInBits();
29376 APInt PreferredZero = APInt::getZero(SizeInBits);
29377 APInt OppositeZero = PreferredZero;
29378 EVT IVT = VT.changeTypeToInteger();
29379 X86ISD::NodeType MinMaxOp;
29380 if (IsMaxOp) {
29381 MinMaxOp = X86ISD::FMAX;
29382 OppositeZero.setSignBit();
29383 } else {
29384 PreferredZero.setSignBit();
29385 MinMaxOp = X86ISD::FMIN;
29386 }
29387 EVT SetCCType =
29388 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29389
29390 // The tables below show the expected result of Max in cases of NaN and
29391 // signed zeros.
29392 //
29393 // Y Y
29394 // Num xNaN +0 -0
29395 // --------------- ---------------
29396 // Num | Max | Y | +0 | +0 | +0 |
29397 // X --------------- X ---------------
29398 // xNaN | X | X/Y | -0 | +0 | -0 |
29399 // --------------- ---------------
29400 //
29401 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29402 // reordering.
29403 //
29404 // We check if any of the operands is NaN and return NaN. Then we check if any
29405 // of the operands is zero or negative zero (for fmaximum and fminimum respectively)
29406 // to ensure the correct zero is returned.
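// ISA reminder (descriptive): x86 MAXPS/MINPS and their scalar forms return
// the second source operand both when the operands compare equal (e.g. +0.0
// vs -0.0) and when either operand is a NaN, which is why the operand
// ordering chosen below matters for signed zeros and NaNs.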
29407 auto MatchesZero = [](SDValue Op, APInt Zero) {
29409 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29410 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29411 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29412 return CstOp->getAPIntValue() == Zero;
29413 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29414 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29415 for (const SDValue &OpVal : Op->op_values()) {
29416 if (OpVal.isUndef())
29417 continue;
29418 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29419 if (!CstOp)
29420 return false;
29421 if (!CstOp->getValueAPF().isZero())
29422 continue;
29423 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29424 return false;
29425 }
29426 return true;
29427 }
29428 return false;
29429 };
29430
29431 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29432 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29433 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29434 Op->getFlags().hasNoSignedZeros() ||
29435 DAG.isKnownNeverZeroFloat(X) ||
29437 SDValue NewX, NewY;
29438 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29439 MatchesZero(X, OppositeZero)) {
29440 // Operands are already in the right order, or the order does not matter.
29441 NewX = X;
29442 NewY = Y;
29443 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29444 NewX = Y;
29445 NewY = X;
29446 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29447 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29448 if (IsXNeverNaN)
29449 std::swap(X, Y);
29450 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29451 // to an xmm register.
29452 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29454 // Bits of classes:
29455 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29456 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29457 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29458 DL, MVT::i32);
29459 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29460 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29461 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29462 DAG.getVectorIdxConstant(0, DL));
29463 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29464 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29465 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29466 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29467 } else {
29468 SDValue IsXSigned;
29469 if (Subtarget.is64Bit() || VT != MVT::f64) {
29470 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29471 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29472 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29473 } else {
29474 assert(VT == MVT::f64);
29475 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29476 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29477 DAG.getVectorIdxConstant(0, DL));
29478 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29479 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29480 DAG.getVectorIdxConstant(1, DL));
29481 Hi = DAG.getBitcast(MVT::i32, Hi);
29482 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29483 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29484 *DAG.getContext(), MVT::i32);
29485 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29486 }
29487 if (MinMaxOp == X86ISD::FMAX) {
29488 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29489 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29490 } else {
29491 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29492 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29493 }
29494 }
29495
29496 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29497 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29498
29499 // If we did not reorder the operands for signed zero handling, and we need
29500 // to handle NaN and know that one of the operands is not NaN, then:
29501 // - For minimum/maximum, put it in the first operand,
29502 // - For minimumnum/maximumnum, put it in the second operand,
29503 // and we will not need to post-process NaN after the max/min.
29504 if (IgnoreSignedZero && !IgnoreNaN &&
29505 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29506 std::swap(NewX, NewY);
29507
29508 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29509
29510 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29511 return MinMax;
29512
29513 if (DAG.isKnownNeverNaN(NewX))
29514 NewX = NewY;
29515
29516 SDValue IsNaN =
29517 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29518
29519 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29520}
29521
29522static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29523 SelectionDAG &DAG) {
29524 MVT VT = Op.getSimpleValueType();
29525 SDLoc dl(Op);
29526
29527 // For AVX1 cases, split to use legal ops.
29528 if (VT.is256BitVector() && !Subtarget.hasInt256())
29529 return splitVectorIntBinary(Op, DAG, dl);
29530
29531 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29532 return splitVectorIntBinary(Op, DAG, dl);
29533
29534 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29535 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29536
29537 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29538 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29539 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29540
29541 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29542 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29543 if (VT.bitsGE(MVT::i32)) {
29544 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29545 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29546 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29547 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29548 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29549 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29550 DAG.getTargetConstant(CC, dl, MVT::i8),
29551 Diff1.getValue(1));
29552 }
29553
29554 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29555 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29556 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29557 MVT WideVT = MVT::getIntegerVT(WideBits);
29558 if (TLI.isTypeLegal(WideVT)) {
29559 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29560 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29561 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29562 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29563 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29564 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29565 DAG.getTargetConstant(CC, dl, MVT::i8),
29566 Diff1.getValue(1));
29567 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29568 }
29569 }
29570
29571 // Default to expand.
29572 return SDValue();
29573}
29574
29575static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29576 SelectionDAG &DAG) {
29577 SDLoc dl(Op);
29578 MVT VT = Op.getSimpleValueType();
29579
29580 // Decompose 256-bit ops into 128-bit ops.
29581 if (VT.is256BitVector() && !Subtarget.hasInt256())
29582 return splitVectorIntBinary(Op, DAG, dl);
29583
29584 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29585 return splitVectorIntBinary(Op, DAG, dl);
29586
29587 SDValue A = Op.getOperand(0);
29588 SDValue B = Op.getOperand(1);
29589
29590 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29591 // vector pairs, multiply and truncate.
29592 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29593 unsigned NumElts = VT.getVectorNumElements();
29594 unsigned NumLanes = VT.getSizeInBits() / 128;
29595 unsigned NumEltsPerLane = NumElts / NumLanes;
29596
29597 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29598 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29599 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29600 return DAG.getNode(
29601 ISD::TRUNCATE, dl, VT,
29602 DAG.getNode(ISD::MUL, dl, ExVT,
29603 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29604 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29605 }
29606
29607 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29608
29609 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29610 // Don't do this if we only need to unpack one half.
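// Why this is safe (sketch): with the odd (resp. even) bytes of B zeroed,
// each 16-bit PMADDUBSW lane holds a single u8*s8 product plus zero, so the
// signed saturation can never trigger, and the low 8 bits of that product
// equal the low 8 bits of the i8 multiply regardless of signedness.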
29611 if (Subtarget.hasSSSE3()) {
29612 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29613 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29614 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29615 if (BIsBuildVector) {
29616 for (auto [Idx, Val] : enumerate(B->ops())) {
29617 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29618 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29619 else
29620 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29621 }
29622 }
29623 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29624 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29625 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29626 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29627 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29628 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29629 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29630 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29631 DAG.getTargetConstant(8, dl, MVT::i8));
29632 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29633 }
29634 }
29635
29636 // Extract the lo/hi parts and any-extend them to i16.
29637 // We're going to mask off the low byte of each result element of the
29638 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29639 // element.
29640 SDValue Undef = DAG.getUNDEF(VT);
29641 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29642 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29643
29644 SDValue BLo, BHi;
29645 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29646 // If the RHS is a constant, manually unpackl/unpackh.
29647 SmallVector<SDValue, 16> LoOps, HiOps;
29648 for (unsigned i = 0; i != NumElts; i += 16) {
29649 for (unsigned j = 0; j != 8; ++j) {
29650 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29651 MVT::i16));
29652 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29653 MVT::i16));
29654 }
29655 }
29656
29657 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29658 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29659 } else {
29660 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29661 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29662 }
29663
29664 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29665 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29666 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29667 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29668 }
29669
29670 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29671 if (VT == MVT::v4i32) {
29672 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29673 "Should not custom lower when pmulld is available!");
29674
29675 // Extract the odd parts.
29676 static const int UnpackMask[] = {1, 1, 3, 3};
29677 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29678 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29679
29680 // Multiply the even parts.
29681 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29682 DAG.getBitcast(MVT::v2i64, A),
29683 DAG.getBitcast(MVT::v2i64, B));
29684 // Now multiply odd parts.
29685 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29686 DAG.getBitcast(MVT::v2i64, Aodds),
29687 DAG.getBitcast(MVT::v2i64, Bodds));
29688
29689 Evens = DAG.getBitcast(VT, Evens);
29690 Odds = DAG.getBitcast(VT, Odds);
29691
29692 // Merge the two vectors back together with a shuffle. This expands into 2
29693 // shuffles.
29694 static const int ShufMask[] = { 0, 4, 2, 6 };
29695 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29696 }
29697
29698 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29699 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29700 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29701
29702 // Ahi = psrlqi(a, 32);
29703 // Bhi = psrlqi(b, 32);
29704 //
29705 // AloBlo = pmuludq(a, b);
29706 // AloBhi = pmuludq(a, Bhi);
29707 // AhiBlo = pmuludq(Ahi, b);
29708 //
29709 // Hi = psllqi(AloBhi + AhiBlo, 32);
29710 // return AloBlo + Hi;
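// This is the usual 32x32 decomposition, taken modulo 2^64:
//   (Alo + 2^32*Ahi) * (Blo + 2^32*Bhi)
//     == Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)   (the 2^64 term vanishes)
// with each partial product computed by pmuludq on 32-bit halves.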
29711 KnownBits AKnown = DAG.computeKnownBits(A);
29712 KnownBits BKnown = DAG.computeKnownBits(B);
29713
29714 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29715 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29716 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29717
29718 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29719 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29720 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29721
29722 SDValue Zero = DAG.getConstant(0, dl, VT);
29723
29724 // Only multiply lo/hi halves that aren't known to be zero.
29725 SDValue AloBlo = Zero;
29726 if (!ALoIsZero && !BLoIsZero)
29727 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29728
29729 SDValue AloBhi = Zero;
29730 if (!ALoIsZero && !BHiIsZero) {
29731 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29732 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29733 }
29734
29735 SDValue AhiBlo = Zero;
29736 if (!AHiIsZero && !BLoIsZero) {
29737 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29738 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29739 }
29740
29741 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29742 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29743
29744 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29745}
29746
29748 MVT VT, bool IsSigned,
29749 const X86Subtarget &Subtarget,
29750 SelectionDAG &DAG,
29751 SDValue *Low = nullptr) {
29752 unsigned NumElts = VT.getVectorNumElements();
29753
29754 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29755 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29756 // lane results back together.
29757
29758 // We'll take different approaches for signed and unsigned.
29759 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29760 // and use pmullw to calculate the full 16-bit product.
29761 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29762 // shift them left into the upper byte of each word. This allows us to use
29763 // pmulhw to calculate the full 16-bit product. This trick means we don't
29764 // need to sign extend the bytes to use pmullw.
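// Why the signed trick works (sketch): the unpack places each byte a into the
// high byte of a word, so the word value is a*256; pmulhw then returns the
// high 16 bits of (a*256)*(b*256) == (a*b) << 16, which is exactly the full
// signed 16-bit product a*b.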
29765
29766 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29767 SDValue Zero = DAG.getConstant(0, dl, VT);
29768
29769 SDValue ALo, AHi;
29770 if (IsSigned) {
29771 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29772 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29773 } else {
29774 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29775 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29776 }
29777
29778 SDValue BLo, BHi;
29779 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29780 // If the RHS is a constant, manually unpackl/unpackh and extend.
29781 SmallVector<SDValue, 16> LoOps, HiOps;
29782 for (unsigned i = 0; i != NumElts; i += 16) {
29783 for (unsigned j = 0; j != 8; ++j) {
29784 SDValue LoOp = B.getOperand(i + j);
29785 SDValue HiOp = B.getOperand(i + j + 8);
29786
29787 if (IsSigned) {
29788 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29789 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29790 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29791 DAG.getConstant(8, dl, MVT::i16));
29792 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29793 DAG.getConstant(8, dl, MVT::i16));
29794 } else {
29795 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29796 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29797 }
29798
29799 LoOps.push_back(LoOp);
29800 HiOps.push_back(HiOp);
29801 }
29802 }
29803
29804 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29805 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29806 } else if (IsSigned) {
29807 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29808 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29809 } else {
29810 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29811 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29812 }
29813
29814 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29815 // pack back to vXi8.
29816 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29817 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29818 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29819
29820 if (Low)
29821 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29822
29823 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29824}
29825
29826static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29827 SelectionDAG &DAG) {
29828 SDLoc dl(Op);
29829 MVT VT = Op.getSimpleValueType();
29830 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29831 unsigned NumElts = VT.getVectorNumElements();
29832 SDValue A = Op.getOperand(0);
29833 SDValue B = Op.getOperand(1);
29834
29835 // Decompose 256-bit ops into 128-bit ops.
29836 if (VT.is256BitVector() && !Subtarget.hasInt256())
29837 return splitVectorIntBinary(Op, DAG, dl);
29838
29839 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29840 return splitVectorIntBinary(Op, DAG, dl);
29841
29842 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29843 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29844 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29845 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29846
29847 // PMULxD operations multiply each even value (starting at 0) of LHS with
29848 // the related value of RHS and produce a widened result.
29849 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29850 // => <2 x i64> <ae|cg>
29851 //
29852 // In other words, to have all the results, we need to perform two PMULxD:
29853 // 1. one with the even values.
29854 // 2. one with the odd values.
29855 // To achieve #2, we need to place the odd values at an even position.
29856 //
29857 // Place the odd value at an even position (basically, shift all values 1
29858 // step to the left):
29859 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29860 9, -1, 11, -1, 13, -1, 15, -1};
29861 // <a|b|c|d> => <b|undef|d|undef>
29862 SDValue Odd0 =
29863 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29864 // <e|f|g|h> => <f|undef|h|undef>
29865 SDValue Odd1 =
29866 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29867
29868 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29869 // ints.
29870 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29871 unsigned Opcode =
29872 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29873 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29874 // => <2 x i64> <ae|cg>
29875 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29876 DAG.getBitcast(MulVT, A),
29877 DAG.getBitcast(MulVT, B)));
29878 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29879 // => <2 x i64> <bf|dh>
29880 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29881 DAG.getBitcast(MulVT, Odd0),
29882 DAG.getBitcast(MulVT, Odd1)));
29883
29884 // Shuffle it back into the right order.
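// For example, with NumElts == 4 the mask computed below is {1, 5, 3, 7}:
// element 1 of Mul1 is the high half of a*e, element 5 (element 1 of Mul2) is
// the high half of b*f, and so on, yielding <hi(ae), hi(bf), hi(cg), hi(dh)>.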
29885 SmallVector<int, 16> ShufMask(NumElts);
29886 for (int i = 0; i != (int)NumElts; ++i)
29887 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29888
29889 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29890
29891 // If we have a signed multiply but no PMULDQ, fix up the result of an
29892 // unsigned multiply.
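// The fixup below uses the identity (sketch):
//   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
// which the two setcc-masked AND terms and the final subtraction compute.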
29893 if (IsSigned && !Subtarget.hasSSE41()) {
29894 SDValue Zero = DAG.getConstant(0, dl, VT);
29895 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29896 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29897 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29898 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29899
29900 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29901 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29902 }
29903
29904 return Res;
29905 }
29906
29907 // Only i8 vectors should need custom lowering after this.
29908 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29909 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29910 "Unsupported vector type");
29911
29912 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29913 // logical shift down the upper half and pack back to i8.
29914
29915 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29916 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29917
29918 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29919 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29920 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29921 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29922 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29923 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29924 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29925 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29926 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29927 }
29928
29929 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29930}
29931
29932// Custom lowering for SMULO/UMULO.
29933static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29934 SelectionDAG &DAG) {
29935 MVT VT = Op.getSimpleValueType();
29936
29937 // Scalars defer to LowerXALUO.
29938 if (!VT.isVector())
29939 return LowerXALUO(Op, DAG);
29940
29941 SDLoc dl(Op);
29942 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29943 SDValue A = Op.getOperand(0);
29944 SDValue B = Op.getOperand(1);
29945 EVT OvfVT = Op->getValueType(1);
29946
29947 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29948 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29949 // Extract the LHS Lo/Hi vectors
29950 SDValue LHSLo, LHSHi;
29951 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29952
29953 // Extract the RHS Lo/Hi vectors
29954 SDValue RHSLo, RHSHi;
29955 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29956
29957 EVT LoOvfVT, HiOvfVT;
29958 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29959 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29960 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29961
29962 // Issue the split operations.
29963 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29964 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29965
29966 // Join the separate data results and the overflow results.
29967 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29968 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29969 Hi.getValue(1));
29970
29971 return DAG.getMergeValues({Res, Ovf}, dl);
29972 }
29973
29974 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29975 EVT SetccVT =
29976 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29977
29978 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29979 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29980 unsigned NumElts = VT.getVectorNumElements();
29981 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29982 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29983 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29984 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29985 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29986
29987 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29988
29989 SDValue Ovf;
29990 if (IsSigned) {
29991 SDValue High, LowSign;
29992 if (OvfVT.getVectorElementType() == MVT::i1 &&
29993 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29994 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29995 // Shift the high down filling with sign bits.
29996 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29997 // Fill all 16 bits with the sign bit from the low.
29998 LowSign =
29999 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30000 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30001 15, DAG);
30002 SetccVT = OvfVT;
30003 if (!Subtarget.hasBWI()) {
30004 // We can't do a vXi16 compare so sign extend to v16i32.
30005 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30006 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30007 }
30008 } else {
30009 // Otherwise do the compare at vXi8.
30010 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30011 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30012 LowSign =
30013 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30014 }
30015
30016 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30017 } else {
30018 SDValue High =
30019 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30020 if (OvfVT.getVectorElementType() == MVT::i1 &&
30021 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30022 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30023 SetccVT = OvfVT;
30024 if (!Subtarget.hasBWI()) {
30025 // We can't do a vXi16 compare so sign extend to v16i32.
30026 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30027 }
30028 } else {
30029 // Otherwise do the compare at vXi8.
30030 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30031 }
30032
30033 Ovf =
30034 DAG.getSetCC(dl, SetccVT, High,
30035 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30036 }
30037
30038 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30039
30040 return DAG.getMergeValues({Low, Ovf}, dl);
30041 }
30042
30043 SDValue Low;
30044 SDValue High =
30045 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30046
30047 SDValue Ovf;
30048 if (IsSigned) {
30049 // SMULO overflows if the high bits don't match the sign of the low.
30050 SDValue LowSign =
30051 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30052 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30053 } else {
30054 // UMULO overflows if the high bits are non-zero.
30055 Ovf =
30056 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30057 }
30058
30059 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30060
30061 return DAG.getMergeValues({Low, Ovf}, dl);
30062}
30063
30064SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30065 assert(Subtarget.isTargetWin64() && "Unexpected target");
30066 EVT VT = Op.getValueType();
30067 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30068 "Unexpected return type for lowering");
30069
30070 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30071    SmallVector<SDValue> Result;
30072    if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30073 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30074 }
30075
30076 RTLIB::Libcall LC;
30077 bool isSigned;
30078 switch (Op->getOpcode()) {
30079 // clang-format off
30080 default: llvm_unreachable("Unexpected request for libcall!");
30081 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30082 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30083 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30084 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30085 // clang-format on
30086 }
30087
30088 SDLoc dl(Op);
30089 SDValue InChain = DAG.getEntryNode();
30090
30091  TargetLowering::ArgListTy Args;
30092  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30093 EVT ArgVT = Op->getOperand(i).getValueType();
30094 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30095 "Unexpected argument type for lowering");
30096 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30097 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30098 MachinePointerInfo MPI =
30099        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30100    InChain =
30101 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30102 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30103 }
30104
30105  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30106                                         getPointerTy(DAG.getDataLayout()));
30107
30108 TargetLowering::CallLoweringInfo CLI(DAG);
30109 CLI.setDebugLoc(dl)
30110 .setChain(InChain)
30111 .setLibCallee(
30112          getLibcallCallingConv(LC),
30113          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30114 std::move(Args))
30115 .setInRegister()
30116 .setSExtResult(isSigned)
30117 .setZExtResult(!isSigned);
30118
30119 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30120 return DAG.getBitcast(VT, CallInfo.first);
30121}
30122
30123SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30124 SelectionDAG &DAG,
30125 SDValue &Chain) const {
30126 assert(Subtarget.isTargetWin64() && "Unexpected target");
30127 EVT VT = Op.getValueType();
30128 bool IsStrict = Op->isStrictFPOpcode();
30129
30130 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30131 EVT ArgVT = Arg.getValueType();
30132
30133 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30134 "Unexpected return type for lowering");
30135
30136 RTLIB::Libcall LC;
30137 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30138 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30139 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30140 else
30141 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30142 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30143
30144 SDLoc dl(Op);
30145 MakeLibCallOptions CallOptions;
30146 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30147
30148  SDValue Result;
30149  // The i128 result is returned as a v2i64 in xmm0; cast it back to the
30150  // expected VT (i128).
30151 std::tie(Result, Chain) =
30152 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30153 Result = DAG.getBitcast(VT, Result);
30154 return Result;
30155}
30156
30157SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30158 SelectionDAG &DAG) const {
30159 assert(Subtarget.isTargetWin64() && "Unexpected target");
30160 EVT VT = Op.getValueType();
30161 bool IsStrict = Op->isStrictFPOpcode();
30162
30163 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30164 EVT ArgVT = Arg.getValueType();
30165
30166 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30167 "Unexpected argument type for lowering");
30168
30169 RTLIB::Libcall LC;
30170 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30171 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30172 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30173 else
30174 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30175 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30176
30177 SDLoc dl(Op);
30178 MakeLibCallOptions CallOptions;
30179 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30180
30181 // Pass the i128 argument as an indirect argument on the stack.
30182 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30183 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30184 MachinePointerInfo MPI =
30185      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30186  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30187
30188  SDValue Result;
30189  std::tie(Result, Chain) =
30190 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30191 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30192}
30193
30194// Return true if the required (according to Opcode) shift-imm form is natively
30195// supported by the Subtarget
30196static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30197 unsigned Opcode) {
30198 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30199 "Unexpected shift opcode");
30200
30201 if (!VT.isSimple())
30202 return false;
30203
30204 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30205 return false;
30206
30207 if (VT.getScalarSizeInBits() < 16)
30208 return false;
30209
30210 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30211 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30212 return true;
30213
30214 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30215 (VT.is256BitVector() && Subtarget.hasInt256());
30216
30217 bool AShift = LShift && (Subtarget.hasAVX512() ||
30218 (VT != MVT::v2i64 && VT != MVT::v4i64));
30219 return (Opcode == ISD::SRA) ? AShift : LShift;
30220}
30221
30222// The shift amount is a variable, but it is the same for all vector lanes.
30223// These instructions are defined together with shift-immediate.
30224static
30225 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30226                                       unsigned Opcode) {
30227 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30228}
30229
30230// Return true if the required (according to Opcode) variable-shift form is
30231// natively supported by the Subtarget
30232static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30233 unsigned Opcode) {
30234 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30235 "Unexpected shift opcode");
30236
30237 if (!VT.isSimple())
30238 return false;
30239
30240 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30241 return false;
30242
30243 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30244 return false;
30245
30246 // vXi16 supported only on AVX-512, BWI
30247 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30248 return false;
30249
30250 if (Subtarget.hasAVX512() &&
30251 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30252 return true;
30253
30254 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30255 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30256 return (Opcode == ISD::SRA) ? AShift : LShift;
30257}
30258
30259 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30260                                            const X86Subtarget &Subtarget) {
30261 MVT VT = Op.getSimpleValueType();
30262 SDLoc dl(Op);
30263 SDValue R = Op.getOperand(0);
30264 SDValue Amt = Op.getOperand(1);
30265 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30266 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30267
30268 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30269 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30270 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30271 SDValue Ex = DAG.getBitcast(ExVT, R);
30272
30273 // ashr(R, 63) === cmp_slt(R, 0)
30274 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30275 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30276 "Unsupported PCMPGT op");
30277 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30278 }
30279
30280 if (ShiftAmt >= 32) {
30281 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30282 SDValue Upper =
30283 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30284      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30285                                                 ShiftAmt - 32, DAG);
30286 if (VT == MVT::v2i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30288 if (VT == MVT::v4i64)
30289 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30290 {9, 1, 11, 3, 13, 5, 15, 7});
30291 } else {
30292 // SRA upper i32, SRL whole i64 and select lower i32.
30293      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30294                                                 ShiftAmt, DAG);
30295 SDValue Lower =
30296 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30297 Lower = DAG.getBitcast(ExVT, Lower);
30298 if (VT == MVT::v2i64)
30299 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30300 if (VT == MVT::v4i64)
30301 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30302 {8, 1, 10, 3, 12, 5, 14, 7});
30303 }
30304 return DAG.getBitcast(VT, Ex);
30305 };
30306
30307 // Optimize shl/srl/sra with constant shift amount.
30308 APInt APIntShiftAmt;
30309 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30310 return SDValue();
30311
30312 // If the shift amount is out of range, return undef.
30313 if (APIntShiftAmt.uge(EltSizeInBits))
30314 return DAG.getUNDEF(VT);
30315
30316 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30317
30318 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30319 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30320
30321 // i64 SRA needs to be performed as partial shifts.
30322 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30323 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30324 Op.getOpcode() == ISD::SRA)
30325 return ArithmeticShiftRight64(ShiftAmt);
30326
30327  // If we're logical shifting an all-signbits value then we can just perform it
30328  // as a mask.
30329 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30330 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30331 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30332 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30333 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30334 }
30335
30336 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30337 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30338 unsigned NumElts = VT.getVectorNumElements();
30339 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30340
30341 // Simple i8 add case
30342 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30343 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30344 // must be 0). (add undef, undef) however can be any value. To make this
30345 // safe, we must freeze R to ensure that register allocation uses the same
30346 // register for an undefined value. This ensures that the result will
30347 // still be even and preserves the original semantics.
30348 R = DAG.getFreeze(R);
30349 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30350 }
30351
30352 // ashr(R, 7) === cmp_slt(R, 0)
30353 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30354 SDValue Zeros = DAG.getConstant(0, dl, VT);
30355 if (VT.is512BitVector()) {
30356 assert(VT == MVT::v64i8 && "Unexpected element type!");
30357 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30358 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30359 }
30360 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30361 }
30362
30363 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30364 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30365 return SDValue();
30366
30367 if (Subtarget.hasGFNI()) {
30368 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30369 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30370 DAG.getTargetConstant(0, dl, MVT::i8));
30371 }
30372
30373 if (Op.getOpcode() == ISD::SHL) {
30374 // Make a large shift.
30375 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30376 ShiftAmt, DAG);
30377 SHL = DAG.getBitcast(VT, SHL);
30378 // Zero out the rightmost bits.
30379 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30380 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30381 }
30382 if (Op.getOpcode() == ISD::SRL) {
30383 // Make a large shift.
30384 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30385 ShiftAmt, DAG);
30386 SRL = DAG.getBitcast(VT, SRL);
30387 // Zero out the leftmost bits.
30388 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30389 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30390 }
30391 if (Op.getOpcode() == ISD::SRA) {
30392 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30393 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30394
30395 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30396 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30397 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30398 return Res;
30399 }
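    // Example of the xor/sub sign fix-up above: for R = 0xF0 (-16) and
    // ShiftAmt = 4, lshr gives 0x0F and Mask = 0x08, so the xor yields 0x07 and
    // the subtract yields 0xFF (-1), matching ashr(-16, 4). For a non-negative
    // input such as 0x70 the xor/sub pair cancels out, leaving 0x07.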
30400 llvm_unreachable("Unknown shift opcode.");
30401 }
30402
30403 return SDValue();
30404}
30405
30406 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30407                                           const X86Subtarget &Subtarget) {
30408 MVT VT = Op.getSimpleValueType();
30409 SDLoc dl(Op);
30410 SDValue R = Op.getOperand(0);
30411 SDValue Amt = Op.getOperand(1);
30412 unsigned Opcode = Op.getOpcode();
30413 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30414
30415 int BaseShAmtIdx = -1;
30416 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30417 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30418 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30419 Subtarget, DAG);
30420
30421 // vXi8 shifts - shift as v8i16 + mask result.
30422 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30423 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30424 VT == MVT::v64i8) &&
30425 !Subtarget.hasXOP()) {
30426 unsigned NumElts = VT.getVectorNumElements();
30427 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30428 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30429 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30430 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30431
30432 // Create the mask using vXi16 shifts. For shift-rights we need to move
30433 // the upper byte down before splatting the vXi8 mask.
30434 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30435 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30436 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30437 if (Opcode != ISD::SHL)
30438 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30439 8, DAG);
30440 BitMask = DAG.getBitcast(VT, BitMask);
30441 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30442 SmallVector<int, 64>(NumElts, 0));
30443
30444 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30445 DAG.getBitcast(ExtVT, R), BaseShAmt,
30446 BaseShAmtIdx, Subtarget, DAG);
30447 Res = DAG.getBitcast(VT, Res);
30448 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30449
30450 if (Opcode == ISD::SRA) {
30451 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30452 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30453 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30454 SignMask =
30455 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30456 BaseShAmtIdx, Subtarget, DAG);
30457 SignMask = DAG.getBitcast(VT, SignMask);
30458 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30459 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30460 }
30461 return Res;
30462 }
30463 }
30464 }
30465
30466 return SDValue();
30467}
30468
30469// Convert a shift/rotate left amount to a multiplication scale factor.
30470 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30471                                        const X86Subtarget &Subtarget,
30472 SelectionDAG &DAG) {
30473 MVT VT = Amt.getSimpleValueType();
30474 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30475 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30476 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30477 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30478 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30479 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30480 return SDValue();
30481
30482 MVT SVT = VT.getVectorElementType();
30483 unsigned SVTBits = SVT.getSizeInBits();
30484 unsigned NumElems = VT.getVectorNumElements();
30485
30486 APInt UndefElts;
30487 SmallVector<APInt> EltBits;
30488 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30489 APInt One(SVTBits, 1);
30490 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30491 for (unsigned I = 0; I != NumElems; ++I) {
30492 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30493 continue;
30494 uint64_t ShAmt = EltBits[I].getZExtValue();
30495 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30496 }
30497 return DAG.getBuildVector(VT, dl, Elts);
30498 }
30499
30500 // If the target doesn't support variable shifts, use either FP conversion
30501 // or integer multiplication to avoid shifting each element individually.
30502 if (VT == MVT::v4i32) {
30503 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30504 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30505 DAG.getConstant(0x3f800000U, dl, VT));
30506 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30507 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30508 }
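  // Example: for an amount of 3, (3 << 23) + 0x3f800000 = 0x41000000, which is
  // the IEEE-754 encoding of 8.0f, so FP_TO_SINT produces 8 = 1 << 3. The shift
  // amount is simply added to the exponent field of 1.0f, giving 2^Amt per lane.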
30509
30510 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30511 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30512 SDValue Z = DAG.getConstant(0, dl, VT);
30513 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30514 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30515 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30516 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30517 if (Subtarget.hasSSE41())
30518 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30519 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30520 }
30521
30522 return SDValue();
30523}
30524
30525static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30526 SelectionDAG &DAG) {
30527 MVT VT = Op.getSimpleValueType();
30528 SDLoc dl(Op);
30529 SDValue R = Op.getOperand(0);
30530 SDValue Amt = Op.getOperand(1);
30531 unsigned NumElts = VT.getVectorNumElements();
30532 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30533 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30534
30535 unsigned Opc = Op.getOpcode();
30536 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30537 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30538
30539 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30540 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30541
30542 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30543 return V;
30544
30545 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30546 return V;
30547
30548 if (supportedVectorVarShift(VT, Subtarget, Opc))
30549 return Op;
30550
30551 // i64 vector arithmetic shift can be emulated with the transform:
30552 // M = lshr(SIGN_MASK, Amt)
30553 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30554 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30555 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30556 Opc == ISD::SRA) {
30557 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30558 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30559 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30560 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30561 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30562 return R;
30563 }
30564
30565 // XOP has 128-bit variable logical/arithmetic shifts.
30566 // +ve/-ve Amt = shift left/right.
30567 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30568 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30569 if (Opc == ISD::SRL || Opc == ISD::SRA)
30570 Amt = DAG.getNegative(Amt, dl, VT);
30571 if (Opc == ISD::SHL || Opc == ISD::SRL)
30572 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30573 if (Opc == ISD::SRA)
30574 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30575 }
30576
30577  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30578 // shifts per-lane and then shuffle the partial results back together.
30579 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30580 // Splat the shift amounts so the scalar shifts above will catch it.
30581 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30582 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30583 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30584 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30585 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30586 }
30587
30588  // Build a map of in-range constant amounts with an element mask of where they occur.
30589  SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30590  if (ConstantAmt) {
30591 for (unsigned I = 0; I != NumElts; ++I) {
30592 SDValue A = Amt.getOperand(I);
30593 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30594 continue;
30595 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30596 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30597 if (!Inserted) {
30598 It->second.setBit(I);
30599 continue;
30600 }
30601 It->second = APInt::getOneBitSet(NumElts, I);
30602 }
30603 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30604 }
30605
30606 // If possible, lower this shift as a sequence of two shifts by
30607 // constant plus a BLENDing shuffle instead of scalarizing it.
30608 // Example:
30609 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30610 //
30611 // Could be rewritten as:
30612 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30613 //
30614 // The advantage is that the two shifts from the example would be
30615 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30616 if (UniqueCstAmt.size() == 2 &&
30617 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30618 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30619 unsigned AmtA = UniqueCstAmt.begin()->first;
30620 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30621 const APInt &MaskA = UniqueCstAmt.begin()->second;
30622 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30623 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30624 for (unsigned I = 0; I != NumElts; ++I) {
30625 if (MaskA[I])
30626 ShuffleMask[I] = I;
30627 if (MaskB[I])
30628 ShuffleMask[I] = I + NumElts;
30629 }
30630
30631 // Only perform this blend if we can perform it without loading a mask.
30632 if ((VT != MVT::v16i16 ||
30633 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30634 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30635 canWidenShuffleElements(ShuffleMask))) {
30636 SDValue Shift1 =
30637 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30638 SDValue Shift2 =
30639 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30640 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30641 }
30642 }
30643
30644 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30645 // using vYiM vector operations where X*N == Y*M and M > N.
30646 if (ConstantAmt &&
30647 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30648 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30649 !Subtarget.hasXOP()) {
30650 MVT NarrowScalarVT = VT.getScalarType();
30651 // We can do this extra fast if each pair of narrow elements is shifted by
30652 // the same amount by doing this SWAR style: use a shift to move the valid
30653 // bits to the right position, mask out any bits which crossed from one
30654 // element to the other.
30655 // This optimized lowering is only valid if the elements in a pair can
30656 // be treated identically.
30657 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30658 SmallVector<SDValue, 32> TmpAmtWideElts;
30659 int WideEltSizeInBits = EltSizeInBits;
30660 while (WideEltSizeInBits < 32) {
30661 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30662 // unprofitable.
30663 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30664 break;
30665 }
30666 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30667 bool SameShifts = true;
30668 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30669 unsigned DstI = SrcI / 2;
30670 // Both elements are undef? Make a note and keep going.
30671 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30672 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30673 continue;
30674 }
30675 // Even element is undef? We will shift it by the same shift amount as
30676 // the odd element.
30677 if (AmtWideElts[SrcI].isUndef()) {
30678 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30679 continue;
30680 }
30681 // Odd element is undef? We will shift it by the same shift amount as
30682 // the even element.
30683 if (AmtWideElts[SrcI + 1].isUndef()) {
30684 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30685 continue;
30686 }
30687 // Both elements are equal.
30688 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30689 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30690 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30691 continue;
30692 }
30693 // One of the provisional wide elements will not have the same shift
30694 // amount. Let's bail.
30695 SameShifts = false;
30696 break;
30697 }
30698 if (!SameShifts) {
30699 break;
30700 }
30701 WideEltSizeInBits *= 2;
30702 std::swap(TmpAmtWideElts, AmtWideElts);
30703 }
30704 APInt APIntShiftAmt;
30705 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30706 bool Profitable = WidenShift;
30707 // AVX512BW brings support for vpsllvw.
30708 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30709 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30710 Profitable = false;
30711 }
30712 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30713 // fairly cheaply in other ways.
30714 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30715 Profitable = false;
30716 }
30717 // Leave it up to GFNI if we have it around.
30718 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30719 // is probably a win to use other strategies in some cases.
30720 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30721 Profitable = false;
30722 }
30723
30724 // AVX1 does not have vpand which makes our masking impractical. It does
30725 // have vandps but that is an FP instruction and crossing FP<->int typically
30726 // has some cost.
30727 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30728 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30729 Profitable = false;
30730 }
30731 unsigned WideNumElts = AmtWideElts.size();
30732 // We are only dealing with identical pairs.
30733 if (Profitable && WideNumElts != NumElts) {
30734 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30735 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30736 // Cast the operand to vXiM.
30737 SDValue RWide = DAG.getBitcast(WideVT, R);
30738 // Create our new vector of shift amounts.
30739 SDValue AmtWide = DAG.getBuildVector(
30740 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30741 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30742 // Perform the actual shift.
30743 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30744 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30745 // Now we need to construct a mask which will "drop" bits that get
30746 // shifted past the LSB/MSB. For a logical shift left, it will look
30747 // like:
30748 // FullMask = (1 << EltSizeInBits) - 1
30749 // Mask = FullMask << Amt
30750 //
30751 // This masking ensures that bits cannot migrate from one narrow lane to
30752 // another. The construction of this mask will be constant folded.
30753 // The mask for a logical right shift is nearly identical, the only
30754 // difference is that the all ones mask is shifted right instead of left.
30755 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30756 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30757 Mask = DAG.getBitcast(WideVT, Mask);
30758 // Finally, we mask the shifted vector with the SWAR mask.
30759 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30760 Masked = DAG.getBitcast(VT, Masked);
30761 if (Opc != ISD::SRA) {
30762 // Logical shifts are complete at this point.
30763 return Masked;
30764 }
30765 // At this point, we have done a *logical* shift right. We now need to
30766 // sign extend the result so that we get behavior equivalent to an
30767 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30768 // are `EltSizeInBits-AmtWide` bits wide.
30769 //
30770 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30771 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30772 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30773 // can use the following trick to accomplish this:
30774 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30775 // (Masked ^ SignBitMask) - SignBitMask
30776 //
30777 // When the sign bit is already clear, this will compute:
30778 // Masked + SignBitMask - SignBitMask
30779 //
30780 // This is equal to Masked which is what we want: the sign bit was clear
30781 // so sign extending should be a no-op.
30782 //
30783 // When the sign bit is set, this will compute:
30784 // Masked - SignBitmask - SignBitMask
30785 //
30786 // This is equal to Masked - 2*SignBitMask which will correctly sign
30787 // extend our result.
30788 SDValue SplatHighBit =
30789 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30790 // This does not induce recursion, all operands are constants.
30791 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30792 SDValue FlippedSignBit =
30793 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30794 SDValue Subtraction =
30795 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30796 return Subtraction;
30797 }
30798 }
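  // Example: a vXi8 srl by 3 performed on vXi16 pairs lets bits from the high
  // byte spill into the low byte; the per-byte mask 0xFF >> 3 = 0x1F clears
  // exactly those spilled bits. For sra, SignBitMask = 0x80 >> 3 = 0x10, so an
  // input byte of 0xF0 (-16) becomes (0x1E ^ 0x10) - 0x10 = 0xFE (-2), which
  // matches ashr(-16, 3).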
30799
30800 // If possible, lower this packed shift into a vector multiply instead of
30801 // expanding it into a sequence of scalar shifts.
30802 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30803 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30804 Subtarget.canExtendTo512BW())))
30805 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30806 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30807
30808 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30809 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30810 if (Opc == ISD::SRL && ConstantAmt &&
30811 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30812 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30813 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30814 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30815 SDValue Zero = DAG.getConstant(0, dl, VT);
30816 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30817 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30818 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30819 }
30820 }
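  // Example: srl x, 3 on vXi16 becomes mulhu(x, 1 << (16 - 3)); for x = 0xFFFF
  // the full product is 0x1FFFE000 and its high half 0x1FFF equals x >> 3.
  // Lanes with a zero shift amount would need a scale of 1 << 16, which does
  // not fit in i16, so the select above returns R unchanged for those lanes.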
30821
30822 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30823 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30824 // TODO: Special case handling for shift by 0/1, really we can afford either
30825 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30826 if (Opc == ISD::SRA && ConstantAmt &&
30827 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30828 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30829 !Subtarget.hasAVX512()) ||
30830 DAG.isKnownNeverZero(Amt))) {
30831 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30832 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30833 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30834 SDValue Amt0 =
30835 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30836 SDValue Amt1 =
30837 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30838 SDValue Sra1 =
30839 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30840 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30841 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30842 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30843 }
30844 }
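  // Example: sra x, 3 becomes mulhs(x, 1 << (16 - 3)); for x = 0xFFFF (-1) the
  // product is -8192 = 0xFFFFE000 and its high half 0xFFFF is -1 as required.
  // Shift-by-1 is special cased because its scale 1 << 15 is negative as an
  // i16 and would flip the product's sign, and shift-by-0 would need 1 << 16;
  // those lanes are repaired by the VSRAI-by-1 and original-R selects above.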
30845
30846 // v4i32 Non Uniform Shifts.
30847 // If the shift amount is constant we can shift each lane using the SSE2
30848 // immediate shifts, else we need to zero-extend each lane to the lower i64
30849 // and shift using the SSE2 variable shifts.
30850 // The separate results can then be blended together.
30851 if (VT == MVT::v4i32) {
30852 SDValue Amt0, Amt1, Amt2, Amt3;
30853 if (ConstantAmt) {
30854 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30855 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30856 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30857 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30858 } else {
30859 // The SSE2 shifts use the lower i64 as the same shift amount for
30860 // all lanes and the upper i64 is ignored. On AVX we're better off
30861 // just zero-extending, but for SSE just duplicating the top 16-bits is
30862 // cheaper and has the same effect for out of range values.
30863 if (Subtarget.hasAVX()) {
30864 SDValue Z = DAG.getConstant(0, dl, VT);
30865 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30866 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30867 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30868 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30869 } else {
30870 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30871 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30872 {4, 5, 6, 7, -1, -1, -1, -1});
30873 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30874 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30875 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30876 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30877 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30878 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30879 }
30880 }
30881
30882 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30883 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30884 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30885 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30886 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30887
30888 // Merge the shifted lane results optimally with/without PBLENDW.
30889 // TODO - ideally shuffle combining would handle this.
30890 if (Subtarget.hasSSE41()) {
30891 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30892 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30893 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30894 }
30895 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30896 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30897 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30898 }
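  // Example: for R = <a,b,c,d> and Amt = <w,x,y,z> this builds four whole-vector
  // shifts R0..R3 by w, x, y and z respectively, and the final shuffles keep
  // lane 0 of R0, lane 1 of R1, lane 2 of R2 and lane 3 of R3, i.e.
  // <a<<w, b<<x, c<<y, d<<z> for SHL.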
30899
30900 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30901 // look up the pre-computed shift values.
30902 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30903 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30904 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30905 unsigned NumLanes = VT.getSizeInBits() / 128u;
30906 unsigned NumEltsPerLane = NumElts / NumLanes;
30907    SmallVector<APInt, 16> LUT;
30908    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30909 unsigned LoElt = Lane * NumEltsPerLane;
30910 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30911 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30912 if (!KnownLane.isConstant())
30913 break;
30914 const APInt &LaneSplat = KnownLane.getConstant();
30915 for (unsigned I = 0; I != 8; ++I) {
30916 if (Opc == ISD::SHL)
30917 LUT.push_back(LaneSplat.shl(I));
30918 else if (Opc == ISD::SRL)
30919 LUT.push_back(LaneSplat.lshr(I));
30920 else if (Opc == ISD::SRA)
30921 LUT.push_back(LaneSplat.ashr(I));
30922 }
30923 LUT.append(8, APInt::getZero(8));
30924 }
30925 if (LUT.size() == NumElts) {
30926 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30927 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30928 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30929 }
30930 }
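  // Example: if a 128-bit lane of R is the splat <1,1,...,1> and Opc is
  // ISD::SHL, the lane's constant becomes <1,2,4,8,16,32,64,128,0,...> and
  // PSHUFB uses each byte of Amt as an index into it, so a lane amount of 5
  // selects 32 = 1 << 5 without doing any shift at run time.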
30931
30932 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30933 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30934 // make the existing SSE solution better.
30935  // NOTE: We honor preferred vector width before promoting to 512-bits.
30936 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30937 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30938 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30939 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30940 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30941 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30942 "Unexpected vector type");
30943 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30944 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30945 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30946 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30947 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30948 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30949 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30950 }
30951
30952 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30953 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30954 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30955 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30956 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30957 !Subtarget.hasXOP()) {
30958 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30959 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30960
30961 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30962 // isn't legal).
30963 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30964 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30965 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30966 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30967    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30968           "Constant build vector expected");
30969
30970 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30971 bool IsSigned = Opc == ISD::SRA;
30972 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30973 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30974 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30975 return DAG.getZExtOrTrunc(R, dl, VT);
30976 }
30977
30978 SmallVector<SDValue, 16> LoAmt, HiAmt;
30979 for (unsigned i = 0; i != NumElts; i += 16) {
30980 for (int j = 0; j != 8; ++j) {
30981 LoAmt.push_back(Amt.getOperand(i + j));
30982 HiAmt.push_back(Amt.getOperand(i + j + 8));
30983 }
30984 }
30985
30986 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30987 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30988
30989 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30990 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30991 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30992 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30993 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30994 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30995 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30996 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30997 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30998 }
30999
31000 if (VT == MVT::v16i8 ||
31001 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31002 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31003 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31004
31005 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31006 if (VT.is512BitVector()) {
31007 // On AVX512BW targets we make use of the fact that VSELECT lowers
31008 // to a masked blend which selects bytes based just on the sign bit
31009 // extracted to a mask.
31010 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31011 V0 = DAG.getBitcast(VT, V0);
31012 V1 = DAG.getBitcast(VT, V1);
31013 Sel = DAG.getBitcast(VT, Sel);
31014 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31015 ISD::SETGT);
31016 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31017 } else if (Subtarget.hasSSE41()) {
31018 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31019 // on the sign bit.
31020 V0 = DAG.getBitcast(VT, V0);
31021 V1 = DAG.getBitcast(VT, V1);
31022 Sel = DAG.getBitcast(VT, Sel);
31023 return DAG.getBitcast(SelVT,
31024 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31025 }
31026 // On pre-SSE41 targets we test for the sign bit by comparing to
31027 // zero - a negative value will set all bits of the lanes to true
31028 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31029 SDValue Z = DAG.getConstant(0, dl, SelVT);
31030 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31031 return DAG.getSelect(dl, SelVT, C, V0, V1);
31032 };
31033
31034 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31035 // We can safely do this using i16 shifts as we're only interested in
31036 // the 3 lower bits of each byte.
31037 Amt = DAG.getBitcast(ExtVT, Amt);
31038 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31039 Amt = DAG.getBitcast(VT, Amt);
31040
31041 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31042 // r = VSELECT(r, shift(r, 4), a);
31043 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31044 R = SignBitSelect(VT, Amt, M, R);
31045
31046 // a += a
31047 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31048
31049 // r = VSELECT(r, shift(r, 2), a);
31050 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31051 R = SignBitSelect(VT, Amt, M, R);
31052
31053 // a += a
31054 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31055
31056 // return VSELECT(r, shift(r, 1), a);
31057 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31058 R = SignBitSelect(VT, Amt, M, R);
31059 return R;
31060 }
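    // Example of the blend ladder above for a lane amount of 6 (0b110): after
    // the << 5, bit 2 sits in the byte's sign bit, so the first blend takes the
    // shift-by-4 result; the a += a step moves bit 1 into the sign bit, so the
    // next blend adds a shift by 2; bit 0 is clear, so the final blend keeps
    // the value, for a total shift of 6.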
31061
31062 if (Opc == ISD::SRA) {
31063 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31064 // so we can correctly sign extend. We don't care what happens to the
31065 // lower byte.
31066 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31067 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31068 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31069 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31070 ALo = DAG.getBitcast(ExtVT, ALo);
31071 AHi = DAG.getBitcast(ExtVT, AHi);
31072 RLo = DAG.getBitcast(ExtVT, RLo);
31073 RHi = DAG.getBitcast(ExtVT, RHi);
31074
31075 // r = VSELECT(r, shift(r, 4), a);
31076 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31077 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31078 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31079 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31080
31081 // a += a
31082 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31083 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31084
31085 // r = VSELECT(r, shift(r, 2), a);
31086 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31087 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31088 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31089 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31090
31091 // a += a
31092 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31093 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31094
31095 // r = VSELECT(r, shift(r, 1), a);
31096 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31097 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31098 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31099 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31100
31101 // Logical shift the result back to the lower byte, leaving a zero upper
31102 // byte meaning that we can safely pack with PACKUSWB.
31103 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31104 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31105 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31106 }
31107 }
31108
31109 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31110 MVT ExtVT = MVT::v8i32;
31111 SDValue Z = DAG.getConstant(0, dl, VT);
31112 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31113 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31114 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31115 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31116 ALo = DAG.getBitcast(ExtVT, ALo);
31117 AHi = DAG.getBitcast(ExtVT, AHi);
31118 RLo = DAG.getBitcast(ExtVT, RLo);
31119 RHi = DAG.getBitcast(ExtVT, RHi);
31120 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31121 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31122 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31123 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31124 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31125 }
31126
31127 if (VT == MVT::v8i16) {
31128 // If we have a constant shift amount, the non-SSE41 path is best as
31129    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31130 bool UseSSE41 = Subtarget.hasSSE41() &&
31131                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31132
31133 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31134 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31135 // the sign bit.
31136 if (UseSSE41) {
31137 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31138 V0 = DAG.getBitcast(ExtVT, V0);
31139 V1 = DAG.getBitcast(ExtVT, V1);
31140 Sel = DAG.getBitcast(ExtVT, Sel);
31141 return DAG.getBitcast(
31142 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31143 }
31144 // On pre-SSE41 targets we splat the sign bit - a negative value will
31145 // set all bits of the lanes to true and VSELECT uses that in
31146 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31147 SDValue C =
31148 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31149 return DAG.getSelect(dl, VT, C, V0, V1);
31150 };
31151
31152 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31153 if (UseSSE41) {
31154 // On SSE41 targets we need to replicate the shift mask in both
31155 // bytes for PBLENDVB.
31156 Amt = DAG.getNode(
31157 ISD::OR, dl, VT,
31158 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31159 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31160 } else {
31161 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31162 }
31163
31164 // r = VSELECT(r, shift(r, 8), a);
31165 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31166 R = SignBitSelect(Amt, M, R);
31167
31168 // a += a
31169 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31170
31171 // r = VSELECT(r, shift(r, 4), a);
31172 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31173 R = SignBitSelect(Amt, M, R);
31174
31175 // a += a
31176 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31177
31178 // r = VSELECT(r, shift(r, 2), a);
31179 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31180 R = SignBitSelect(Amt, M, R);
31181
31182 // a += a
31183 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31184
31185 // return VSELECT(r, shift(r, 1), a);
31186 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31187 R = SignBitSelect(Amt, M, R);
31188 return R;
31189 }
31190
31191 // Decompose 256-bit shifts into 128-bit shifts.
31192 if (VT.is256BitVector())
31193 return splitVectorIntBinary(Op, DAG, dl);
31194
31195 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31196 return splitVectorIntBinary(Op, DAG, dl);
31197
31198 return SDValue();
31199}
31200
31201 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31202                                 SelectionDAG &DAG) {
31203 MVT VT = Op.getSimpleValueType();
31204 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31205 "Unexpected funnel shift opcode!");
31206
31207 SDLoc DL(Op);
31208 SDValue Op0 = Op.getOperand(0);
31209 SDValue Op1 = Op.getOperand(1);
31210 SDValue Amt = Op.getOperand(2);
31211 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31212 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31213
31214 if (VT.isVector()) {
31215 APInt APIntShiftAmt;
31216 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31217 unsigned NumElts = VT.getVectorNumElements();
31218
31219 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31220
31221 if (IsCstSplat) {
31222 if (IsFSHR)
31223 std::swap(Op0, Op1);
31224 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31225 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31226 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31227 {Op0, Op1, Imm}, DAG, Subtarget);
31228 }
31229 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31230 {Op0, Op1, Amt}, DAG, Subtarget);
31231 }
31232 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31233 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31234 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31235 "Unexpected funnel shift type!");
31236
31237 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31238 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
31239 if (IsCstSplat) {
31240 // TODO: Can't use generic expansion as UNDEF amt elements can be
31241 // converted to other values when folded to shift amounts, losing the
31242 // splat.
31243 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31244 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31245 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31246 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31247 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31248
31249 if (EltSizeInBits == 8 &&
31250 (Subtarget.hasXOP() ||
31251 (useVPTERNLOG(Subtarget, VT) &&
31252 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31253 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31254 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31255 // the original vector width to handle cases where we split.
31256 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31257 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31258 SDValue ShX =
31259 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31260 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31261 SDValue ShY =
31262 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31263 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31264 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31265 DAG.getConstant(MaskX, DL, VT));
31266 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31267 DAG.getConstant(MaskY, DL, VT));
31268 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31269 }
31270
31271 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31272 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31273 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31274 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31275 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31276 }
31277
31278 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31279 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31280 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31281
31282 // Constant vXi16 funnel shifts can be efficiently handled by default.
31283 if (IsCst && EltSizeInBits == 16)
31284 return SDValue();
31285
31286 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31287 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31288 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31289
31290 // Split 256-bit integers on XOP/pre-AVX2 targets.
31291 // Split 512-bit integers on non 512-bit BWI targets.
31292 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31293 !Subtarget.hasAVX2())) ||
31294 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31295 EltSizeInBits < 32)) {
31296 // Pre-mask the amount modulo using the wider vector.
31297 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31298 return splitVectorOp(Op, DAG, DL);
31299 }
31300
31301 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31302 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31303 int ScalarAmtIdx = -1;
31304 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31305 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31306 if (EltSizeInBits == 16)
31307 return SDValue();
31308
31309 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31310 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31311 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31312 ScalarAmtIdx, Subtarget, DAG);
31313 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31314 ScalarAmtIdx, Subtarget, DAG);
31315 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31316 }
31317 }
31318
31319 MVT WideSVT = MVT::getIntegerVT(
31320 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31321 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31322
31323 // If per-element shifts are legal, fallback to generic expansion.
31324 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31325 return SDValue();
31326
31327 // Attempt to fold as:
31328 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31329 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31330 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31331 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31332 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31333 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31334 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31335 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31336 EltSizeInBits, DAG);
31337 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31338 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31339 if (!IsFSHR)
31340 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31341 EltSizeInBits, DAG);
31342 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31343 }
31344
31345 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31346 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31347 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31348 SDValue Z = DAG.getConstant(0, DL, VT);
31349 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31350 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31351 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31352 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31353 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31354 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31355 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31356 }
31357
31358 // Fallback to generic expansion.
31359 return SDValue();
31360 }
31361 assert(
31362 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31363 "Unexpected funnel shift type!");
31364
31365 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31366 bool OptForSize = DAG.shouldOptForSize();
31367 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31368
31369 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31370 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31371 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31372 !isa<ConstantSDNode>(Amt)) {
31373 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31374 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31375 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31376 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31377 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31378 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31379 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31380 if (IsFSHR) {
31381 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31382 } else {
31383 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31384 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31385 }
31386 return DAG.getZExtOrTrunc(Res, DL, VT);
31387 }
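  // Example: i8 fshl(0xAB, 0xCD, 3): the concatenated value is 0xABCD, shifting
  // it left by 3 gives 0x55E68, and the >> 8 plus truncation yields 0x5E, which
  // is exactly (0xAB << 3) | (0xCD >> 5).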
31388
31389 if (VT == MVT::i8 || ExpandFunnel)
31390 return SDValue();
31391
31392  // i16 needs an explicit modulo of the shift amount, but i32/i64 have an implicit modulo.
31393 if (VT == MVT::i16) {
31394 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31395 DAG.getConstant(15, DL, Amt.getValueType()));
31396 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31397 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31398 }
31399
31400 return Op;
31401}
31402
31403static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31404 SelectionDAG &DAG) {
31405 MVT VT = Op.getSimpleValueType();
31406 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31407
31408 SDLoc DL(Op);
31409 SDValue R = Op.getOperand(0);
31410 SDValue Amt = Op.getOperand(1);
31411 unsigned Opcode = Op.getOpcode();
31412 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31413 int NumElts = VT.getVectorNumElements();
31414 bool IsROTL = Opcode == ISD::ROTL;
31415
31416 // Check for constant splat rotation amount.
31417 APInt CstSplatValue;
31418 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31419
31420 // Check for splat rotate by zero.
31421 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31422 return R;
31423
31424 // AVX512 implicitly uses modulo rotation amounts.
31425 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31426 // Attempt to rotate by immediate.
31427 if (IsCstSplat) {
31428 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31429 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31430 return DAG.getNode(RotOpc, DL, VT, R,
31431 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31432 }
31433
31434 // Else, fall-back on VPROLV/VPRORV.
31435 return Op;
31436 }
31437
31438 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31439 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31440 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31441 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31442 }
31443
31444 SDValue Z = DAG.getConstant(0, DL, VT);
31445
31446 if (!IsROTL) {
31447 // If the ISD::ROTR amount is constant, we're always better off converting
31448 // to ISD::ROTL.
31449 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31450 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31451
31452 // XOP targets always prefer ISD::ROTL.
31453 if (Subtarget.hasXOP())
31454 return DAG.getNode(ISD::ROTL, DL, VT, R,
31455 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31456 }
31457
31458 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31459 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31460 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31461 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31462 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31463 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31464 DAG.getTargetConstant(0, DL, MVT::i8));
31465 }
31466
31467 // Split 256-bit integers on XOP/pre-AVX2 targets.
31468 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31469 return splitVectorIntBinary(Op, DAG, DL);
31470
31471 // XOP has 128-bit vector variable + immediate rotates.
31472 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31473 // XOP implicitly uses modulo rotation amounts.
31474 if (Subtarget.hasXOP()) {
31475 assert(IsROTL && "Only ROTL expected");
31476 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31477
31478 // Attempt to rotate by immediate.
31479 if (IsCstSplat) {
31480 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31481 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31482 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31483 }
31484
31485 // Use general rotate by variable (per-element).
31486 return Op;
31487 }
31488
31489 // Rotate by a uniform constant - expand back to shifts.
31490 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31491 // to other values when folded to shift amounts, losing the splat.
31492 if (IsCstSplat) {
31493 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31494 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31495 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31496 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31497 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31498 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31499 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31500 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31501 }
31502
31503 // Split 512-bit integers on non 512-bit BWI targets.
31504 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31505 return splitVectorIntBinary(Op, DAG, DL);
31506
31507 assert(
31508 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31509 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31510 Subtarget.hasAVX2()) ||
31511 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31512 "Only vXi32/vXi16/vXi8 vector rotates supported");
31513
31514 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31515 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31516
31517 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31518 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31519
31520 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31521 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31522 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
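// Unpacking R with itself places two copies of each element side by side, so
// a double-width left shift by the splatted amount leaves ROTL in the upper
// half of each wide lane, while a right shift leaves ROTR in the lower half;
// getPack then keeps the matching halves.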
31523 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31524 int BaseRotAmtIdx = -1;
31525 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31526 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31527 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31528 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31529 }
31530 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31531 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31532 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31533 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31534 BaseRotAmtIdx, Subtarget, DAG);
31535 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31536 BaseRotAmtIdx, Subtarget, DAG);
31537 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31538 }
31539 }
31540
31541 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31542 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31543
31544 // Attempt to fold as unpack(x,x) << zext(y):
31545 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31546 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31547 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31548 if (!(ConstantAmt && EltSizeInBits != 8) &&
31549 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31550 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31551 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31552 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31553 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31554 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31555 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31556 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31557 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31558 }
31559
31560 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31561 // the amount bit.
31562 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
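// Conceptually this is a shift-by-bits select: bit 2 of the amount
// conditionally rotates by 4, bit 1 by 2 and bit 0 by 1, so any amount 0-7 is
// reached in three blend stages.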
31563 if (EltSizeInBits == 8) {
31564 MVT WideVT =
31565 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31566
31567 // Attempt to fold as:
31568 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31569 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
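// i.e. each byte b is widened to (b << 8) | b; a left shift by the amount
// brings rotl(b) into bits 15:8 (recovered by the >> 8 and truncate), while a
// right shift leaves rotr(b) in the low byte.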
31570 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31571 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31572 // If we're rotating by constant, just use default promotion.
31573 if (ConstantAmt)
31574 return SDValue();
31575 // See if we can perform this by widening to vXi16 or vXi32.
31576 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31577 R = DAG.getNode(
31578 ISD::OR, DL, WideVT, R,
31579 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31580 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31581 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31582 if (IsROTL)
31583 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31584 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31585 }
31586
31587 // We don't need ModuloAmt here as we just peek at individual bits.
31588 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31589 if (Subtarget.hasSSE41()) {
31590 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31591 // on the sign bit.
31592 V0 = DAG.getBitcast(VT, V0);
31593 V1 = DAG.getBitcast(VT, V1);
31594 Sel = DAG.getBitcast(VT, Sel);
31595 return DAG.getBitcast(SelVT,
31596 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31597 }
31598 // On pre-SSE41 targets we test for the sign bit by comparing to
31599 // zero - a negative value will set all bits of the lanes to true
31600 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31601 SDValue Z = DAG.getConstant(0, DL, SelVT);
31602 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31603 return DAG.getSelect(DL, SelVT, C, V0, V1);
31604 };
31605
31606 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31607 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31608 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31609 IsROTL = true;
31610 }
31611
31612 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31613 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31614
31615 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31616 // We can safely do this using i16 shifts as we're only interested in
31617 // the 3 lower bits of each byte.
31618 Amt = DAG.getBitcast(ExtVT, Amt);
31619 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31620 Amt = DAG.getBitcast(VT, Amt);
31621
31622 // r = VSELECT(r, rot(r, 4), a);
31623 SDValue M;
31624 M = DAG.getNode(
31625 ISD::OR, DL, VT,
31626 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31627 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31628 R = SignBitSelect(VT, Amt, M, R);
31629
31630 // a += a
31631 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31632
31633 // r = VSELECT(r, rot(r, 2), a);
31634 M = DAG.getNode(
31635 ISD::OR, DL, VT,
31636 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31637 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31638 R = SignBitSelect(VT, Amt, M, R);
31639
31640 // a += a
31641 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31642
31643 // return VSELECT(r, rot(r, 1), a);
31644 M = DAG.getNode(
31645 ISD::OR, DL, VT,
31646 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31647 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31648 return SignBitSelect(VT, Amt, M, R);
31649 }
31650
31651 bool IsSplatAmt = DAG.isSplatValue(Amt);
31652 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31653 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31654
31655 // Fallback for splats + all supported variable shifts.
31656 // Fallback for non-constants AVX2 vXi16 as well.
31657 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31658 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31659 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31660 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31661 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31662 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31663 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31664 }
31665
31666 // Everything below assumes ISD::ROTL.
31667 if (!IsROTL) {
31668 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31669 IsROTL = true;
31670 }
31671
31672 // ISD::ROT* uses modulo rotate amounts.
31673 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31674
31675 assert(IsROTL && "Only ROTL supported");
31676
31677 // As with shifts, attempt to convert the rotation amount to a multiplication
31678 // factor, fallback to general expansion.
31679 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31680 if (!Scale)
31681 return SDValue();
31682
31683 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31684 if (EltSizeInBits == 16) {
31685 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31686 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31687 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31688 }
31689
31690 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31691 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31692 // that can then be OR'd with the lower 32-bits.
31693 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31694 static const int OddMask[] = {1, 1, 3, 3};
31695 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31696 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31697
31698 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31699 DAG.getBitcast(MVT::v2i64, R),
31700 DAG.getBitcast(MVT::v2i64, Scale));
31701 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31702 DAG.getBitcast(MVT::v2i64, R13),
31703 DAG.getBitcast(MVT::v2i64, Scale13));
31704 Res02 = DAG.getBitcast(VT, Res02);
31705 Res13 = DAG.getBitcast(VT, Res13);
31706
31707 return DAG.getNode(ISD::OR, DL, VT,
31708 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31709 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31710}
31711
31712/// Returns true if the operand type is exactly twice the native width, and
31713/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31714/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31715/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31716bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31717 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31718
31719 if (OpWidth == 64)
31720 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31721 if (OpWidth == 128)
31722 return Subtarget.canUseCMPXCHG16B();
31723
31724 return false;
31725}
31726
31727TargetLowering::AtomicExpansionKind
31728X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31729 Type *MemType = SI->getValueOperand()->getType();
31730
31731 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31732 !Subtarget.useSoftFloat()) {
31733 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31734 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31735 return AtomicExpansionKind::None;
31736
31737 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31738 Subtarget.hasAVX())
31739 return AtomicExpansionKind::None;
31740 }
31741
31742 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31743 : AtomicExpansionKind::None;
31744}
31745
31746// Note: this turns large loads into lock cmpxchg8b/16b.
31747TargetLowering::AtomicExpansionKind
31748X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31749 Type *MemType = LI->getType();
31750
31751 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31752 !Subtarget.useSoftFloat()) {
31753 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31754 // can use movq to do the load. If we have X87 we can load into an 80-bit
31755 // X87 register and store it to a stack temporary.
31756 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31757 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31758 return AtomicExpansionKind::None;
31759
31760 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31761 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31762 Subtarget.hasAVX())
31763 return AtomicExpansionKind::None;
31764 }
31765
31766 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31767 : AtomicExpansionKind::None;
31768}
31769
31770enum BitTestKind : unsigned {
31771 UndefBit,
31772 ConstantBit,
31773 NotConstantBit,
31774 ShiftBit,
31775 NotShiftBit
31776};
31777
31778static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31779 using namespace llvm::PatternMatch;
31780 BitTestKind BTK = UndefBit;
31781 if (auto *C = dyn_cast<ConstantInt>(V)) {
31782 // Check if V is a power of 2 or NOT power of 2.
31783 if (isPowerOf2_64(C->getZExtValue()))
31784 BTK = ConstantBit;
31785 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31786 BTK = NotConstantBit;
31787 return {V, BTK};
31788 }
31789
31790 // Check if V is some power of 2 pattern known to be non-zero
31791 if (auto *I = dyn_cast<Instruction>(V)) {
31792 bool Not = false;
31793 // Check if we have a NOT
31794 Value *PeekI;
31795 if (match(I, m_Not(m_Value(PeekI))) ||
31796 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31797 Not = true;
31798 I = dyn_cast<Instruction>(PeekI);
31799
31800 // If I is constant, it will fold and we can evaluate later. If it's an
31801 // argument or something of that nature, we can't analyze.
31802 if (I == nullptr)
31803 return {nullptr, UndefBit};
31804 }
31805 // We can only use 1 << X without more sophisticated analysis. C << X where
31806 // C is a power of 2 but not 1 can result in zero which cannot be translated
31807 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31808 if (I->getOpcode() == Instruction::Shl) {
31809 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31810 // -X` and some other provable power of 2 patterns that we can use CTZ on
31811 // may be profitable.
31812 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31813 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31814 // be provably a non-zero power of 2.
31815 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31816 // transformable to bittest.
31817 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31818 if (!ShiftVal)
31819 return {nullptr, UndefBit};
31820 if (ShiftVal->equalsInt(1))
31821 BTK = Not ? NotShiftBit : ShiftBit;
31822
31823 if (BTK == UndefBit)
31824 return {nullptr, UndefBit};
31825
31826 Value *BitV = I->getOperand(1);
31827
31828 // Read past a shiftmask instruction to find count
31829 Value *AndOp;
31830 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31831 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31832 BitV = AndOp;
31833
31834 return {BitV, BTK};
31835 }
31836 }
31837 return {nullptr, UndefBit};
31838}
31839
31840TargetLowering::AtomicExpansionKind
31841X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31842 using namespace llvm::PatternMatch;
31843 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31844 // prefix to a normal instruction for these operations.
31845 if (AI->use_empty())
31846 return AtomicExpansionKind::None;
31847
31848 if (AI->getOperation() == AtomicRMWInst::Xor) {
31849 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31850 // preferable to both `cmpxchg` and `btc`.
31851 if (match(AI->getOperand(1), m_SignMask()))
31852 return AtomicExpansionKind::Expand;
31853 }
31854
31855 // If the atomicrmw's result is used by a single-bit AND, we may use
31856 // bts/btr/btc instructions for these operations.
31857 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31858 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31859 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31860 // detect it.
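// As an illustration, a sequence such as
//   %old = atomicrmw or ptr %p, i32 8 seq_cst
//   %m = and i32 %old, 8
//   %c = icmp ne i32 %m, 0
// can be selected down to a single `lock bts` that sets bit 3 and returns the
// tested bit via the carry flag.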
31861 Instruction *I = AI->user_back();
31862 auto BitChange = FindSingleBitChange(AI->getValOperand());
31863 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31864 I->getOpcode() != Instruction::And ||
31865 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31866 AI->getParent() != I->getParent())
31867 return AtomicExpansionKind::CmpXChg;
31868
31869 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31870
31871 // This is a redundant AND, it should get cleaned up elsewhere.
31872 if (AI == I->getOperand(OtherIdx))
31873 return AtomicExpansionKind::CmpXChg;
31874
31875 // The following instruction must be an AND of a single bit.
31876 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31877 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31878 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31879 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31880 return AtomicExpansionKind::CmpXChg;
31881 }
31882 if (AI->getOperation() == AtomicRMWInst::And) {
31883 return ~C1->getValue() == C2->getValue()
31884 ? AtomicExpansionKind::BitTestIntrinsic
31885 : AtomicExpansionKind::CmpXChg;
31886 }
31887 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31888 : AtomicExpansionKind::CmpXChg;
31889 }
31890
31891 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31892
31893 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31894 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31895 return AtomicExpansionKind::CmpXChg;
31896
31897 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31898
31899 // If shift amounts are not the same we can't use BitTestIntrinsic.
31900 if (BitChange.first != BitTested.first)
31901 return AtomicExpansionKind::CmpXChg;
31902
31903 // For atomic AND, the value must mask out exactly one bit (the NOT of a
31904 // single bit) and the following AND must test the bit unset in that mask.
31905 if (AI->getOperation() == AtomicRMWInst::And)
31906 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31907 ? AtomicExpansionKind::BitTestIntrinsic
31908 : AtomicExpansionKind::CmpXChg;
31909
31910 // For atomic XOR/OR, the value must set and the AND must test the same bit.
31911 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31912 ? AtomicExpansionKind::BitTestIntrinsic
31913 : AtomicExpansionKind::CmpXChg;
31914}
31915
31916void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31917 IRBuilder<> Builder(AI);
31918 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31919 Intrinsic::ID IID_C;
31920 Intrinsic::ID IID_I;
31921 switch (AI->getOperation()) {
31922 default:
31923 llvm_unreachable("Unknown atomic operation");
31924 case AtomicRMWInst::Or:
31925 IID_C = Intrinsic::x86_atomic_bts;
31926 IID_I = Intrinsic::x86_atomic_bts_rm;
31927 break;
31928 case AtomicRMWInst::Xor:
31929 IID_C = Intrinsic::x86_atomic_btc;
31930 IID_I = Intrinsic::x86_atomic_btc_rm;
31931 break;
31932 case AtomicRMWInst::And:
31933 IID_C = Intrinsic::x86_atomic_btr;
31934 IID_I = Intrinsic::x86_atomic_btr_rm;
31935 break;
31936 }
31937 Instruction *I = AI->user_back();
31938 LLVMContext &Ctx = AI->getContext();
31939 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31940 PointerType::getUnqual(Ctx));
31941 Value *Result = nullptr;
31942 auto BitTested = FindSingleBitChange(AI->getValOperand());
31943 assert(BitTested.first != nullptr);
31944
31945 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31946 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31947
31948 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31949 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31950 {Addr, Builder.getInt8(Imm)});
31951 } else {
31952 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31953
31954 Value *SI = BitTested.first;
31955 assert(SI != nullptr);
31956
31957 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31958 // need to mask it.
31959 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31960 Value *BitPos =
31961 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31962 // Todo(1): In many cases it may be provable that SI is less than
31963 // ShiftBits in which case this mask is unnecessary
31964 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31965 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31966 // favor of just a raw BT{S|R|C}.
31967
31968 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31969 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31970
31971 // If the result is only used for zero/non-zero status then we don't need
31972 // to shift the value back. Otherwise do so.
31973 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31974 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31975 if (ICmp->isEquality()) {
31976 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31977 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31978 if (C0 || C1) {
31979 assert(C0 == nullptr || C1 == nullptr);
31980 if ((C0 ? C0 : C1)->isZero())
31981 continue;
31982 }
31983 }
31984 }
31985 Result = Builder.CreateShl(Result, BitPos);
31986 break;
31987 }
31988 }
31989
31990 I->replaceAllUsesWith(Result);
31991 I->eraseFromParent();
31992 AI->eraseFromParent();
31993}
31994
31995static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31996 using namespace llvm::PatternMatch;
31997 if (!AI->hasOneUse())
31998 return false;
31999
32000 Value *Op = AI->getOperand(1);
32001 CmpPredicate Pred;
32002 Instruction *I = AI->user_back();
32003 AtomicRMWInst::BinOp Opc = AI->getOperation();
32004 if (Opc == AtomicRMWInst::Add) {
32005 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32006 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32007 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32008 if (match(I->user_back(),
32009 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32010 return true;
32011 if (match(I->user_back(),
32012 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32013 return true;
32014 }
32015 return false;
32016 }
32017 if (Opc == AtomicRMWInst::Sub) {
32018 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32019 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32020 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32021 if (match(I->user_back(),
32022 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32023 return true;
32024 if (match(I->user_back(),
32025 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32026 return true;
32027 }
32028 return false;
32029 }
32030 if ((Opc == AtomicRMWInst::Or &&
32031 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32032 (Opc == AtomicRMWInst::And &&
32033 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32034 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32035 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32036 Pred == CmpInst::ICMP_SLT;
32037 if (match(I->user_back(),
32038 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32039 return true;
32040 return false;
32041 }
32042 if (Opc == AtomicRMWInst::Xor) {
32043 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32044 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32045 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32046 if (match(I->user_back(),
32047 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32048 return true;
32049 if (match(I->user_back(),
32050 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32051 return true;
32052 }
32053 return false;
32054 }
32055
32056 return false;
32057}
32058
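// For example, matching
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %c = icmp eq i32 %old, %v
// lets the comparison be answered directly from the flags of a `lock sub`
// (ZF is set exactly when the old value equals %v), avoiding a cmpxchg loop
// or an extra xadd + compare.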
32059void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32060 AtomicRMWInst *AI) const {
32061 IRBuilder<> Builder(AI);
32062 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32063 Instruction *TempI = nullptr;
32064 LLVMContext &Ctx = AI->getContext();
32065 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32066 if (!ICI) {
32067 TempI = AI->user_back();
32068 assert(TempI->hasOneUse() && "Must have one use");
32069 ICI = cast<ICmpInst>(TempI->user_back());
32070 }
32071 X86::CondCode CC;
32072 ICmpInst::Predicate Pred = ICI->getPredicate();
32073 switch (Pred) {
32074 default:
32075 llvm_unreachable("Not supported Pred");
32076 case CmpInst::ICMP_EQ:
32077 CC = X86::COND_E;
32078 break;
32079 case CmpInst::ICMP_NE:
32080 CC = X86::COND_NE;
32081 break;
32082 case CmpInst::ICMP_SLT:
32083 CC = X86::COND_S;
32084 break;
32085 case CmpInst::ICMP_SGT:
32086 CC = X86::COND_NS;
32087 break;
32088 }
32089 Intrinsic::ID IID;
32090 switch (AI->getOperation()) {
32091 default:
32092 llvm_unreachable("Unknown atomic operation");
32093 case AtomicRMWInst::Add:
32094 IID = Intrinsic::x86_atomic_add_cc;
32095 break;
32096 case AtomicRMWInst::Sub:
32097 IID = Intrinsic::x86_atomic_sub_cc;
32098 break;
32099 case AtomicRMWInst::Or:
32100 IID = Intrinsic::x86_atomic_or_cc;
32101 break;
32102 case AtomicRMWInst::And:
32103 IID = Intrinsic::x86_atomic_and_cc;
32104 break;
32105 case AtomicRMWInst::Xor:
32106 IID = Intrinsic::x86_atomic_xor_cc;
32107 break;
32108 }
32109 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32110 PointerType::getUnqual(Ctx));
32111 Value *Call = Builder.CreateIntrinsic(
32112 IID, AI->getType(),
32113 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32114 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32115 ICI->replaceAllUsesWith(Result);
32116 ICI->eraseFromParent();
32117 if (TempI)
32118 TempI->eraseFromParent();
32119 AI->eraseFromParent();
32120}
32121
32122TargetLowering::AtomicExpansionKind
32123X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32124 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32125 Type *MemType = AI->getType();
32126
32127 // If the operand is too big, we must see if cmpxchg8/16b is available
32128 // and default to library calls otherwise.
32129 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32130 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32131 : AtomicExpansionKind::None;
32132 }
32133
32134 AtomicRMWInst::BinOp Op = AI->getOperation();
32135 switch (Op) {
32136 case AtomicRMWInst::Xchg:
32137 return AtomicExpansionKind::None;
32138 case AtomicRMWInst::Add:
32139 case AtomicRMWInst::Sub:
32140 if (shouldExpandCmpArithRMWInIR(AI))
32141 return AtomicExpansionKind::CmpArithIntrinsic;
32142 // It's better to use xadd, xsub or xchg for these in other cases.
32143 return AtomicExpansionKind::None;
32144 case AtomicRMWInst::Or:
32145 case AtomicRMWInst::And:
32146 case AtomicRMWInst::Xor:
32147 if (shouldExpandCmpArithRMWInIR(AI))
32148 return AtomicExpansionKind::CmpArithIntrinsic;
32149 return shouldExpandLogicAtomicRMWInIR(AI);
32151 case AtomicRMWInst::Max:
32152 case AtomicRMWInst::Min:
32163 default:
32164 // These always require a non-trivial set of data operations on x86. We must
32165 // use a cmpxchg loop.
32166 return AtomicExpansionKind::CmpXChg;
32167 }
32168}
32169
32170LoadInst *
32171X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32172 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32173 Type *MemType = AI->getType();
32174 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32175 // there is no benefit in turning such RMWs into loads, and it is actually
32176 // harmful as it introduces a mfence.
32177 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32178 return nullptr;
32179
32180 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32181 // lowering available in lowerAtomicArith.
32182 // TODO: push more cases through this path.
32183 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32184 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32185 AI->use_empty())
32186 return nullptr;
32187
32188 IRBuilder<> Builder(AI);
32189 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32190 auto SSID = AI->getSyncScopeID();
32191 // We must restrict the ordering to avoid generating loads with Release or
32192 // ReleaseAcquire orderings.
32194
32195 // Before the load we need a fence. Here is an example lifted from
32196 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32197 // is required:
32198 // Thread 0:
32199 // x.store(1, relaxed);
32200 // r1 = y.fetch_add(0, release);
32201 // Thread 1:
32202 // y.fetch_add(42, acquire);
32203 // r2 = x.load(relaxed);
32204 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32205 // lowered to just a load without a fence. A mfence flushes the store buffer,
32206 // making the optimization clearly correct.
32207 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32208 // otherwise, we might be able to be more aggressive on relaxed idempotent
32209 // rmw. In practice, they do not look useful, so we don't try to be
32210 // especially clever.
32211
32212 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32213 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
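// As an example, a used idempotent RMW such as
//   %v = atomicrmw add ptr %p, i32 0 acquire
// becomes a seq_cst fence followed by an atomic acquire load of %p.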
32214 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32215
32216 // Finally we can emit the atomic load.
32217 LoadInst *Loaded = Builder.CreateAlignedLoad(
32218 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32219 Loaded->setAtomic(Order, SSID);
32220 AI->replaceAllUsesWith(Loaded);
32221 AI->eraseFromParent();
32222 return Loaded;
32223}
32224
32225/// Emit a locked operation on a stack location which does not change any
32226/// memory location, but does involve a lock prefix. Location is chosen to be
32227/// a) very likely accessed only by a single thread to minimize cache traffic,
32228/// and b) definitely dereferenceable. Returns the new Chain result.
32229static SDValue emitLockedStackOp(SelectionDAG &DAG,
32230 const X86Subtarget &Subtarget, SDValue Chain,
32231 const SDLoc &DL) {
32232 // Implementation notes:
32233 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32234 // operations issued by the current processor. As such, the location
32235 // referenced is not relevant for the ordering properties of the instruction.
32236 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32237 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32238 // 2) Using an immediate operand appears to be the best encoding choice
32239 // here since it doesn't require an extra register.
32240 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32241 // is small enough it might just be measurement noise.)
32242 // 4) When choosing offsets, there are several contributing factors:
32243 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32244 // line aligned stack object to improve this case.)
32245 // b) To minimize our chances of introducing a false dependence, we prefer
32246 // to offset the stack usage from TOS slightly.
32247 // c) To minimize concerns about cross thread stack usage - in particular,
32248 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32249 // captures state in the TOS frame and accesses it from many threads -
32250 // we want to use an offset such that the offset is in a distinct cache
32251 // line from the TOS frame.
32252 //
32253 // For a general discussion of the tradeoffs and benchmark results, see:
32254 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
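// The emitted operation is therefore along the lines of
// `lock orl $0x0, -64(%rsp)` (or offset 0 from ESP/RSP when no red zone is
// available), which orders memory like a fence without changing any
// program-visible state.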
32255
32256 auto &MF = DAG.getMachineFunction();
32257 auto &TFL = *Subtarget.getFrameLowering();
32258 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32259
32260 if (Subtarget.is64Bit()) {
32261 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32262 SDValue Ops[] = {
32263 DAG.getRegister(X86::RSP, MVT::i64), // Base
32264 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32265 DAG.getRegister(0, MVT::i64), // Index
32266 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32267 DAG.getRegister(0, MVT::i16), // Segment.
32268 Zero,
32269 Chain};
32270 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32271 MVT::Other, Ops);
32272 return SDValue(Res, 1);
32273 }
32274
32275 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32276 SDValue Ops[] = {
32277 DAG.getRegister(X86::ESP, MVT::i32), // Base
32278 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32279 DAG.getRegister(0, MVT::i32), // Index
32280 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32281 DAG.getRegister(0, MVT::i16), // Segment.
32282 Zero,
32283 Chain
32284 };
32285 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32286 MVT::Other, Ops);
32287 return SDValue(Res, 1);
32288}
32289
32290static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32291 SelectionDAG &DAG) {
32292 SDLoc dl(Op);
32293 AtomicOrdering FenceOrdering =
32294 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32295 SyncScope::ID FenceSSID =
32296 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32297
32298 // The only fence that needs an instruction is a sequentially-consistent
32299 // cross-thread fence.
32300 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32301 FenceSSID == SyncScope::System) {
32302 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32303 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32304
32305 SDValue Chain = Op.getOperand(0);
32306 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32307 }
32308
32309 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32310 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32311}
32312
32313static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32314 SelectionDAG &DAG) {
32315 MVT T = Op.getSimpleValueType();
32316 SDLoc DL(Op);
32317 unsigned Reg = 0;
32318 unsigned size = 0;
32319 switch(T.SimpleTy) {
32320 default: llvm_unreachable("Invalid value type!");
32321 case MVT::i8: Reg = X86::AL; size = 1; break;
32322 case MVT::i16: Reg = X86::AX; size = 2; break;
32323 case MVT::i32: Reg = X86::EAX; size = 4; break;
32324 case MVT::i64:
32325 assert(Subtarget.is64Bit() && "Node not type legal!");
32326 Reg = X86::RAX; size = 8;
32327 break;
32328 }
32329 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32330 Op.getOperand(2), SDValue());
32331 SDValue Ops[] = { cpIn.getValue(0),
32332 Op.getOperand(1),
32333 Op.getOperand(3),
32334 DAG.getTargetConstant(size, DL, MVT::i8),
32335 cpIn.getValue(1) };
32336 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32337 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32338 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32339 Ops, T, MMO);
32340
32341 SDValue cpOut =
32342 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32343 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32344 MVT::i32, cpOut.getValue(2));
32345 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32346
32347 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32348 cpOut, Success, EFLAGS.getValue(1));
32349}
32350
32351// Create MOVMSKB, taking into account whether we need to split for AVX1.
32352static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32353 const X86Subtarget &Subtarget) {
32354 MVT InVT = V.getSimpleValueType();
32355
32356 if (InVT == MVT::v64i8) {
32357 SDValue Lo, Hi;
32358 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32359 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32360 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32361 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32362 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32363 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32364 DAG.getConstant(32, DL, MVT::i8));
32365 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32366 }
32367 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32368 SDValue Lo, Hi;
32369 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32370 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32371 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32372 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32373 DAG.getConstant(16, DL, MVT::i8));
32374 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32375 }
32376
32377 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32378}
32379
32380static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32381 SelectionDAG &DAG) {
32382 SDValue Src = Op.getOperand(0);
32383 MVT SrcVT = Src.getSimpleValueType();
32384 MVT DstVT = Op.getSimpleValueType();
32385
32386 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32387 // half to v32i1 and concatenating the result.
32388 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32389 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32390 assert(Subtarget.hasBWI() && "Expected BWI target");
32391 SDLoc dl(Op);
32392 SDValue Lo, Hi;
32393 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32394 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32395 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32396 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32397 }
32398
32399 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32400 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32401 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32402 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32403 SDLoc DL(Op);
32404 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32405 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32406 return DAG.getZExtOrTrunc(V, DL, DstVT);
32407 }
32408
32409 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32410 SrcVT == MVT::i64) && "Unexpected VT!");
32411
32412 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32413 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32414 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32415 // This conversion needs to be expanded.
32416 return SDValue();
32417
32418 SDLoc dl(Op);
32419 if (SrcVT.isVector()) {
32420 // Widen the vector in input in the case of MVT::v2i32.
32421 // Example: from MVT::v2i32 to MVT::v4i32.
32422 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32423 SrcVT.getVectorNumElements() * 2);
32424 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32425 DAG.getUNDEF(SrcVT));
32426 } else {
32427 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32428 "Unexpected source type in LowerBITCAST");
32429 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32430 }
32431
32432 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32433 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32434
32435 if (DstVT == MVT::x86mmx)
32436 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32437
32438 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32439 DAG.getVectorIdxConstant(0, dl));
32440}
32441
32442/// Compute the horizontal sum of bytes in V for the elements of VT.
32443///
32444/// Requires V to be a byte vector and VT to be an integer vector type with
32445/// wider elements than V's type. The width of the elements of VT determines
32446/// how many bytes of V are summed horizontally to produce each element of the
32447/// result.
32448static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32449 const X86Subtarget &Subtarget,
32450 SelectionDAG &DAG) {
32451 SDLoc DL(V);
32452 MVT ByteVecVT = V.getSimpleValueType();
32453 MVT EltVT = VT.getVectorElementType();
32454 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32455 "Expected value to have byte element type.");
32456 assert(EltVT != MVT::i8 &&
32457 "Horizontal byte sum only makes sense for wider elements!");
32458 unsigned VecSize = VT.getSizeInBits();
32459 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32460
32461 // The PSADBW instruction horizontally adds all bytes and leaves the result
32462 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32463 if (EltVT == MVT::i64) {
32464 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32465 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32466 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32467 return DAG.getBitcast(VT, V);
32468 }
32469
32470 if (EltVT == MVT::i32) {
32471 // We unpack the low half and high half into i32s interleaved with zeros so
32472 // that we can use PSADBW to horizontally sum them. The most useful part of
32473 // this is that it lines up the results of two PSADBW instructions to be
32474 // two v2i64 vectors which concatenated are the 4 population counts. We can
32475 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32476 SDValue Zeros = DAG.getConstant(0, DL, VT);
32477 SDValue V32 = DAG.getBitcast(VT, V);
32478 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32479 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32480
32481 // Do the horizontal sums into two v2i64s.
32482 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32483 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32484 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32485 DAG.getBitcast(ByteVecVT, Low), Zeros);
32486 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32487 DAG.getBitcast(ByteVecVT, High), Zeros);
32488
32489 // Merge them together.
32490 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32491 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32492 DAG.getBitcast(ShortVecVT, Low),
32493 DAG.getBitcast(ShortVecVT, High));
32494
32495 return DAG.getBitcast(VT, V);
32496 }
32497
32498 // The only element type left is i16.
32499 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32500
32501 // To obtain pop count for each i16 element starting from the pop count for
32502 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32503 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32504 // directly supported.
32505 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32506 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32507 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32508 DAG.getBitcast(ByteVecVT, V));
32509 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32510}
32511
32512static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32513 const X86Subtarget &Subtarget,
32514 SelectionDAG &DAG) {
32515 MVT VT = Op.getSimpleValueType();
32516 MVT EltVT = VT.getVectorElementType();
32517 int NumElts = VT.getVectorNumElements();
32518 (void)EltVT;
32519 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32520
32521 // Implement a lookup table in register by using an algorithm based on:
32522 // http://wm.ite.pl/articles/sse-popcount.html
32523 //
32524 // The general idea is that every lower byte nibble in the input vector is an
32525 // index into an in-register pre-computed pop count table. We then split up the
32526 // input vector in two new ones: (1) a vector with only the shifted-right
32527 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32528 // masked out higher ones) for each byte. PSHUFB is used separately with both
32529 // to index the in-register table. Next, both are added and the result is an
32530 // i8 vector where each element contains the pop count for the input byte.
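// e.g. for the byte 0xB7 the lookups give LUT[0x7] = 3 and LUT[0xB] = 3, and
// their sum 6 is the population count of 0xB7.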
32531 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32532 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32533 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32534 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32535
32536 SmallVector<SDValue, 64> LUTVec;
32537 for (int i = 0; i < NumElts; ++i)
32538 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32539 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32540 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32541
32542 // High nibbles
32543 SDValue FourV = DAG.getConstant(4, DL, VT);
32544 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32545
32546 // Low nibbles
32547 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32548
32549 // The input vector is used as the shuffle mask that indexes elements into the
32550 // LUT. After counting low and high nibbles, add the vector to obtain the
32551 // final pop count per i8 element.
32552 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32553 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32554 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32555}
32556
32557// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32558// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32559static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32560 const X86Subtarget &Subtarget,
32561 SelectionDAG &DAG) {
32562 MVT VT = Op.getSimpleValueType();
32563 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32564 "Unknown CTPOP type to handle");
32565 SDValue Op0 = Op.getOperand(0);
32566
32567 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32568 if (Subtarget.hasVPOPCNTDQ()) {
32569 unsigned NumElems = VT.getVectorNumElements();
32570 assert((VT.getVectorElementType() == MVT::i8 ||
32571 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32572 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32573 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32574 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32575 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32576 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32577 }
32578 }
32579
32580 // Decompose 256-bit ops into smaller 128-bit ops.
32581 if (VT.is256BitVector() && !Subtarget.hasInt256())
32582 return splitVectorIntUnary(Op, DAG, DL);
32583
32584 // Decompose 512-bit ops into smaller 256-bit ops.
32585 if (VT.is512BitVector() && !Subtarget.hasBWI())
32586 return splitVectorIntUnary(Op, DAG, DL);
32587
32588 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32589 if (VT.getScalarType() != MVT::i8) {
32590 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32591 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32592 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32593 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32594 }
32595
32596 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32597 if (!Subtarget.hasSSSE3())
32598 return SDValue();
32599
32600 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32601}
32602
32603static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32604 SelectionDAG &DAG) {
32605 MVT VT = N.getSimpleValueType();
32606 SDValue Op = N.getOperand(0);
32607 SDLoc DL(N);
32608
32609 if (VT.isScalarInteger()) {
32610 // Compute the lower/upper bounds of the active bits of the value,
32611 // allowing us to shift the active bits down if necessary to fit into the
32612 // special cases below.
32613 KnownBits Known = DAG.computeKnownBits(Op);
32614 if (Known.isConstant())
32615 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32616 unsigned LZ = Known.countMinLeadingZeros();
32617 unsigned TZ = Known.countMinTrailingZeros();
32618 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32619 unsigned ActiveBits = Known.getBitWidth() - LZ;
32620 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32621
32622 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
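// (e.g. 0b10 -> 2 - 1 = 1, 0b11 -> 3 - 1 = 2)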
32623 if (ShiftedActiveBits <= 2) {
32624 if (ActiveBits > 2)
32625 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32626 DAG.getShiftAmountConstant(TZ, VT, DL));
32627 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32628 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32629 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32630 DAG.getShiftAmountConstant(1, VT, DL)));
32631 return DAG.getZExtOrTrunc(Op, DL, VT);
32632 }
32633
32634 // i3 CTPOP - perform LUT into i32 integer.
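// The constant packs ctpop(0..7) into 2-bit fields, so (LUT >> (2 * x)) & 3 is
// the population count; e.g. x = 5 -> (LUT >> 10) & 3 = 2.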
32635 if (ShiftedActiveBits <= 3) {
32636 if (ActiveBits > 3)
32637 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32638 DAG.getShiftAmountConstant(TZ, VT, DL));
32639 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32640 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32641 DAG.getShiftAmountConstant(1, VT, DL));
32642 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32643 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32644 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32645 DAG.getConstant(0x3, DL, MVT::i32));
32646 return DAG.getZExtOrTrunc(Op, DL, VT);
32647 }
32648
32649 // i4 CTPOP - perform LUT into i64 integer.
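// 0x4332322132212110 stores ctpop(0..15) in 4-bit nibbles, so shifting by
// 4 * x and masking with 0x7 extracts it; e.g. x = 0xB -> nibble 11 -> 3.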
32650 if (ShiftedActiveBits <= 4 &&
32651 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32652 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32653 if (ActiveBits > 4)
32654 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32655 DAG.getShiftAmountConstant(TZ, VT, DL));
32656 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32657 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32658 DAG.getConstant(4, DL, MVT::i32));
32659 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32660 DAG.getShiftAmountOperand(MVT::i64, Op));
32661 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32662 DAG.getConstant(0x7, DL, MVT::i64));
32663 return DAG.getZExtOrTrunc(Op, DL, VT);
32664 }
32665
32666 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
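// Multiplying by 0x08040201 places copies of x at bit offsets 0/9/18/27 with
// no overlap, so after the >> 3 each bit of x sits in a different nibble; the
// 0x11111111 mask keeps one bit per nibble and the second multiply sums the
// nibbles (the population count) into bits 31:28.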
32667 if (ShiftedActiveBits <= 8) {
32668 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32669 if (ActiveBits > 8)
32670 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32671 DAG.getShiftAmountConstant(TZ, VT, DL));
32672 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32673 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32674 DAG.getConstant(0x08040201U, DL, MVT::i32));
32675 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32676 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32677 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32678 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32679 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32680 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32681 return DAG.getZExtOrTrunc(Op, DL, VT);
32682 }
32683
32684 return SDValue(); // fallback to generic expansion.
32685 }
32686
32687 assert(VT.isVector() &&
32688 "We only do custom lowering for vector population count.");
32689 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32690}
32691
32692static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32693 MVT VT = Op.getSimpleValueType();
32694 SDValue In = Op.getOperand(0);
32695 SDLoc DL(Op);
32696
32697 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32698 // perform the BITREVERSE.
32699 if (!VT.isVector()) {
32700 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32701 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32702 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32703 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32704 DAG.getVectorIdxConstant(0, DL));
32705 }
32706
32707 int NumElts = VT.getVectorNumElements();
32708 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32709
32710 // Decompose 256-bit ops into smaller 128-bit ops.
32711 if (VT.is256BitVector())
32712 return splitVectorIntUnary(Op, DAG, DL);
32713
32714 assert(VT.is128BitVector() &&
32715 "Only 128-bit vector bitreverse lowering supported.");
32716
32717 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32718 // perform the BSWAP in the shuffle.
32719 // It's best to shuffle using the second operand as this will implicitly allow
32720 // memory folding for multiple vectors.
32721 SmallVector<SDValue, 16> MaskElts;
32722 for (int i = 0; i != NumElts; ++i) {
32723 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32724 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32725 int PermuteByte = SourceByte | (2 << 5);
32726 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32727 }
32728 }
32729
32730 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32731 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32732 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32733 Res, Mask);
32734 return DAG.getBitcast(VT, Res);
32735}
32736
32737static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32738 SelectionDAG &DAG) {
32739 MVT VT = Op.getSimpleValueType();
32740
32741 if (Subtarget.hasXOP() && !VT.is512BitVector())
32742 return LowerBITREVERSE_XOP(Op, DAG);
32743
32744 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32745 "SSSE3 or GFNI required for BITREVERSE");
32746
32747 SDValue In = Op.getOperand(0);
32748 SDLoc DL(Op);
32749
32750 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32751 if (VT.is512BitVector() && !Subtarget.hasBWI())
32752 return splitVectorIntUnary(Op, DAG, DL);
32753
32754 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32755 if (VT.is256BitVector() && !Subtarget.hasInt256())
32756 return splitVectorIntUnary(Op, DAG, DL);
32757
32758 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32759 if (!VT.isVector()) {
32760 assert(
32761 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32762 "Only tested for i8/i16/i32/i64");
32763 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32764 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32765 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32766 DAG.getBitcast(MVT::v16i8, Res));
32767 Res =
32768 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32769 DAG.getVectorIdxConstant(0, DL));
32770 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32771 }
32772
32773 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32774
32775 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32776 if (VT.getScalarType() != MVT::i8) {
32777 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32778 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32779 Res = DAG.getBitcast(ByteVT, Res);
32780 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32781 return DAG.getBitcast(VT, Res);
32782 }
32783 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32784 "Only byte vector BITREVERSE supported");
32785
32786 unsigned NumElts = VT.getVectorNumElements();
32787
32788 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32789 if (Subtarget.hasGFNI()) {
32790 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32791 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32792 DAG.getTargetConstant(0, DL, MVT::i8));
32793 }
32794
32795 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32796 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32797 // 0-15 value (moved to the other nibble).
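// e.g. for the byte 0x1E: LoLUT[0xE] = 0x70, HiLUT[0x1] = 0x08, and
// 0x70 | 0x08 = 0x78, the bit-reversal of 0x1E.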
32798 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32799 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32800 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32801
32802 const int LoLUT[16] = {
32803 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32804 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32805 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32806 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32807 const int HiLUT[16] = {
32808 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32809 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32810 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32811 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32812
32813 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32814 for (unsigned i = 0; i < NumElts; ++i) {
32815 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32816 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32817 }
32818
32819 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32820 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32821 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32822 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32823 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32824}
32825
32826static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32827 SelectionDAG &DAG) {
32828 SDLoc DL(Op);
32829 SDValue X = Op.getOperand(0);
32830 MVT VT = Op.getSimpleValueType();
32831
32832 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32833 if (VT == MVT::i8 ||
32834 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32835 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32836 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32837 DAG.getConstant(0, DL, MVT::i8));
32838 // Copy the inverse of the parity flag into a register with setcc.
32839 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32840 // Extend to the original type.
32841 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32842 }
32843
32844 // If we have POPCNT, use the default expansion.
32845 if (Subtarget.hasPOPCNT())
32846 return SDValue();
32847
32848 if (VT == MVT::i64) {
32849 // Xor the high and low 32 bits together using a 32-bit operation.
32850 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32851 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32852 DAG.getConstant(32, DL, MVT::i8)));
32853 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32854 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32855 }
32856
32857 if (VT != MVT::i16) {
32858 // Xor the high and low 16-bits together using a 32-bit operation.
32859 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32860 DAG.getConstant(16, DL, MVT::i8));
32861 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32862 } else {
32863 // If the input is 16-bits, we need to extend to use an i32 shift below.
32864 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32865 }
32866
32867 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32868 // This should allow an h-reg to be used to save a shift.
32869 SDValue Hi = DAG.getNode(
32870 ISD::TRUNCATE, DL, MVT::i8,
32871 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32872 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32873 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32874 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32875
32876 // Copy the inverse of the parity flag into a register with setcc.
32877 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32878 // Extend to the original type.
32879 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32880}
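// A minimal illustrative sketch (scalar C++, an assumption rather than this
// DAG form): without POPCNT, parity is computed by xor-folding the value down
// to 8 bits; the PF of that final 8-bit XOR is the inverted parity, hence the
// SETNP above. Equivalent C++:
//
//   bool Parity32(uint32_t X) {            // 1 iff an odd number of set bits
//     X ^= X >> 16;                        // fold 32 -> 16 bits
//     uint8_t Folded = (uint8_t)X ^ (uint8_t)(X >> 8); // fold 16 -> 8 bits
//     return __builtin_parity(Folded);
//   }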
32881
32882 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32883 const X86Subtarget &Subtarget) {
32884 unsigned NewOpc = 0;
32885 switch (N->getOpcode()) {
32886 case ISD::ATOMIC_LOAD_ADD:
32887 NewOpc = X86ISD::LADD;
32888 break;
32889 case ISD::ATOMIC_LOAD_SUB:
32890 NewOpc = X86ISD::LSUB;
32891 break;
32892 case ISD::ATOMIC_LOAD_OR:
32893 NewOpc = X86ISD::LOR;
32894 break;
32895 case ISD::ATOMIC_LOAD_XOR:
32896 NewOpc = X86ISD::LXOR;
32897 break;
32898 case ISD::ATOMIC_LOAD_AND:
32899 NewOpc = X86ISD::LAND;
32900 break;
32901 default:
32902 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32903 }
32904
32905 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32906
32907 return DAG.getMemIntrinsicNode(
32908 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32909 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32910 /*MemVT=*/N->getSimpleValueType(0), MMO);
32911}
32912
32913/// Lower atomic_load_ops into LOCK-prefixed operations.
32914 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32915 const X86Subtarget &Subtarget) {
32916 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32917 SDValue Chain = N->getOperand(0);
32918 SDValue LHS = N->getOperand(1);
32919 SDValue RHS = N->getOperand(2);
32920 unsigned Opc = N->getOpcode();
32921 MVT VT = N->getSimpleValueType(0);
32922 SDLoc DL(N);
32923
32924 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32925 // can only be lowered when the result is unused. They should have already
32926 // been transformed into a cmpxchg loop in AtomicExpand.
32927 if (N->hasAnyUseOfValue(0)) {
32928 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32929 // select LXADD if LOCK_SUB can't be selected.
32930 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32931 // can use LXADD as opposed to cmpxchg.
32932 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32933 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32934 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32935 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32936
32937 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32938 "Used AtomicRMW ops other than Add should have been expanded!");
32939 return N;
32940 }
32941
32942 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32943 // The core idea here is that since the memory location isn't actually
32944 // changing, all we need is a lowering for the *ordering* impacts of the
32945 // atomicrmw. As such, we can choose a different operation and memory
32946 // location to minimize impact on other code.
32947 // The above holds unless the node is marked volatile, in which
32948 // case it needs to be preserved according to the langref.
32949 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32950 // On X86, the only ordering which actually requires an instruction is
32951 // seq_cst which isn't SingleThread, everything just needs to be preserved
32952 // during codegen and then dropped. Note that we expect (but don't assume),
32953 // that orderings other than seq_cst and acq_rel have been canonicalized to
32954 // a store or load.
32955 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32956 AN->getSyncScopeID() == SyncScope::System) {
32957 // Prefer a locked operation against a stack location to minimize cache
32958 // traffic. This assumes that stack locations are very likely to be
32959 // accessed only by the owning thread.
32960 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32961 assert(!N->hasAnyUseOfValue(0));
32962 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32963 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32964 DAG.getUNDEF(VT), NewChain);
32965 }
32966 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32967 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32968 assert(!N->hasAnyUseOfValue(0));
32969 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32970 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32971 DAG.getUNDEF(VT), NewChain);
32972 }
32973
32974 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32975 // RAUW the chain, but don't worry about the result, as it's unused.
32976 assert(!N->hasAnyUseOfValue(0));
32977 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32978 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32979 DAG.getUNDEF(VT), LockOp.getValue(1));
32980}
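// Illustrative sketch (the IR and asm below are an assumption of the typical
// outcome, not taken from this file): an idempotent, non-volatile
//   %unused = atomicrmw or ptr %p, i32 0 seq_cst
// only needs its *ordering* preserved, so when the result is dead it becomes
// either a locked RMW on the thread's own stack slot, e.g.
//   lock orl $0, -64(%rsp)
// for seq_cst at system scope, or a plain compiler barrier (MEMBARRIER) for
// the weaker orderings, instead of a cmpxchg loop touching the line holding %p.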
32981
32983 const X86Subtarget &Subtarget) {
32984 auto *Node = cast<AtomicSDNode>(Op.getNode());
32985 SDLoc dl(Node);
32986 EVT VT = Node->getMemoryVT();
32987
32988 bool IsSeqCst =
32989 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32990 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32991
32992 // If this store is not sequentially consistent and the type is legal
32993 // we can just keep it.
32994 if (!IsSeqCst && IsTypeLegal)
32995 return Op;
32996
32997 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32998 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32999 Attribute::NoImplicitFloat)) {
33000 SDValue Chain;
33001 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33002 // vector store.
33003 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33004 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33005 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33006 Node->getMemOperand());
33007 }
33008
33009 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33010 // is enabled.
33011 if (VT == MVT::i64) {
33012 if (Subtarget.hasSSE1()) {
33013 SDValue SclToVec =
33014 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33015 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33016 SclToVec = DAG.getBitcast(StVT, SclToVec);
33017 SDVTList Tys = DAG.getVTList(MVT::Other);
33018 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33019 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33020 MVT::i64, Node->getMemOperand());
33021 } else if (Subtarget.hasX87()) {
33022 // First load this into an 80-bit X87 register using a stack temporary.
33023 // This will put the whole integer into the significand.
33024 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33026 MachinePointerInfo MPI =
33027 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33028 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33029 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33030 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33031 SDValue LdOps[] = {Chain, StackPtr};
33032 SDValue Value = DAG.getMemIntrinsicNode(
33033 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33034 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33035 Chain = Value.getValue(1);
33036
33037 // Now use an FIST to do the atomic store.
33038 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33039 Chain =
33040 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33041 StoreOps, MVT::i64, Node->getMemOperand());
33042 }
33043 }
33044
33045 if (Chain) {
33046 // If this is a sequentially consistent store, also emit an appropriate
33047 // barrier.
33048 if (IsSeqCst)
33049 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33050
33051 return Chain;
33052 }
33053 }
33054
33055 // Convert seq_cst store -> xchg
33056 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33057 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33058 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33059 Node->getOperand(0), Node->getOperand(2),
33060 Node->getOperand(1), Node->getMemOperand());
33061 return Swap.getValue(1);
33062}
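// Illustrative sketch (assumption: a 32-bit target without SSE): the X87 path
// above makes an i64 store atomic by moving all 8 bytes in one FPU access:
//   movl %eax, (%esp)      ; spill the i64 into a stack temporary
//   movl %edx, 4(%esp)
//   fildll (%esp)          ; X86ISD::FILD - load 64 bits into an x87 register
//   fistpll (%ecx)         ; X86ISD::FIST - single 64-bit store to the object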
33063
33064 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33065 SDNode *N = Op.getNode();
33066 MVT VT = N->getSimpleValueType(0);
33067 unsigned Opc = Op.getOpcode();
33068
33069 // Let legalize expand this if it isn't a legal type yet.
33070 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33071 return SDValue();
33072
33073 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33074 SDLoc DL(N);
33075
33076 // Set the carry flag.
33077 SDValue Carry = Op.getOperand(2);
33078 EVT CarryVT = Carry.getValueType();
33079 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33080 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33081
33082 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33083 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33084 Op.getOperand(0), Op.getOperand(1),
33085 Carry.getValue(1));
33086
33087 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33088 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33089 Sum.getValue(1), DL, DAG);
33090 if (N->getValueType(1) == MVT::i1)
33091 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33092
33093 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33094}
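// Illustrative sketch (pseudo-asm, an assumption about the selected code for
// an i64 uaddo_carry): the incoming i1 carry is turned back into EFLAGS.CF by
// adding -1 to it (0 + 0xFF leaves CF clear, 1 + 0xFF wraps and sets CF), so a
// single ADC both consumes and regenerates the carry:
//   addb $-1, %cl      ; CF <- incoming carry
//   adcq %rsi, %rdi    ; sum, CF <- outgoing carry
//   setb %al           ; COND_B (seto/COND_O for the signed *O_CARRY forms)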
33095
33096static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33097 SelectionDAG &DAG) {
33098 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33099
33100 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33101 // which returns the values as { float, float } (in XMM0) or
33102 // { double, double } (which is returned in XMM0, XMM1).
33103 SDLoc dl(Op);
33104 SDValue Arg = Op.getOperand(0);
33105 EVT ArgVT = Arg.getValueType();
33106 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33107
33108 TargetLowering::ArgListTy Args;
33109 Args.emplace_back(Arg, ArgTy);
33110
33111 bool isF64 = ArgVT == MVT::f64;
33112 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33113 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33114 // the results are returned via SRet in memory.
33115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33116 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33117 const char *LibcallName = TLI.getLibcallName(LC);
33118 SDValue Callee =
33119 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33120
33121 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33122 : (Type *)FixedVectorType::get(ArgTy, 4);
33123
33124 TargetLowering::CallLoweringInfo CLI(DAG);
33125 CLI.setDebugLoc(dl)
33126 .setChain(DAG.getEntryNode())
33127 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33128
33129 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33130
33131 if (isF64)
33132 // Returned in xmm0 and xmm1.
33133 return CallResult.first;
33134
33135 // Returned in bits 0:31 and 32:63 of xmm0.
33136 SDValue SinVal =
33137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33138 DAG.getVectorIdxConstant(0, dl));
33139 SDValue CosVal =
33140 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33141 DAG.getVectorIdxConstant(1, dl));
33142 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33143 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33144}
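// Illustrative sketch (assumption: the Darwin libm prototypes, written as C):
//   struct { float  Sin, Cos; }  __sincosf_stret(float);  // packed in xmm0
//   struct { double Sin, Cos; }  __sincos_stret(double);  // xmm0 / xmm1
// A single libcall produces both results; the f32 pair is then split with the
// two EXTRACT_VECTOR_ELTs above, while the f64 pair is returned directly.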
33145
33146/// Widen a vector input to a vector of NVT. The
33147/// input vector must have the same element type as NVT.
33148 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33149 bool FillWithZeroes = false) {
33150 // Check if InOp already has the right width.
33151 MVT InVT = InOp.getSimpleValueType();
33152 if (InVT == NVT)
33153 return InOp;
33154
33155 if (InOp.isUndef())
33156 return DAG.getUNDEF(NVT);
33157
33158 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33159 "input and widen element type must match");
33160
33161 unsigned InNumElts = InVT.getVectorNumElements();
33162 unsigned WidenNumElts = NVT.getVectorNumElements();
33163 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33164 "Unexpected request for vector widening");
33165
33166 SDLoc dl(InOp);
33167 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33168 SDValue N1 = InOp.getOperand(1);
33169 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33170 N1.isUndef()) {
33171 InOp = InOp.getOperand(0);
33172 InVT = InOp.getSimpleValueType();
33173 InNumElts = InVT.getVectorNumElements();
33174 }
33175 }
33176 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33177 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33178 EVT EltVT = InOp.getOperand(0).getValueType();
33179 SDValue FillVal =
33180 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33181 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33182 Ops.append(WidenNumElts - InNumElts, FillVal);
33183 return DAG.getBuildVector(NVT, dl, Ops);
33184 }
33185 SDValue FillVal =
33186 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33187 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33188 DAG.getVectorIdxConstant(0, dl));
33189}
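// Illustrative sketch (IR-like notation, an assumption): widening a v2i1 mask
// to v8i1 with FillWithZeroes=true yields
//   insert_subvector (zeroinitializer : v8i1), %mask, 0
// i.e. the original lanes in elements 0-1 and known-zero lanes above them,
// which is what the 512-bit gather/scatter widening below relies on.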
33190
33191 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33192 SelectionDAG &DAG) {
33193 assert(Subtarget.hasAVX512() &&
33194 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33195
33196 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33197 SDValue Src = N->getValue();
33198 MVT VT = Src.getSimpleValueType();
33199 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33200 SDLoc dl(Op);
33201
33202 SDValue Scale = N->getScale();
33203 SDValue Index = N->getIndex();
33204 SDValue Mask = N->getMask();
33205 SDValue Chain = N->getChain();
33206 SDValue BasePtr = N->getBasePtr();
33207
33208 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33209 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33210 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33211 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33212 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33213 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33214 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33215 SDVTList VTs = DAG.getVTList(MVT::Other);
33216 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33217 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33218 N->getMemoryVT(), N->getMemOperand());
33219 }
33220 return SDValue();
33221 }
33222
33223 MVT IndexVT = Index.getSimpleValueType();
33224
33225 // If the index is v2i32, we're being called by type legalization and we
33226 // should just let the default handling take care of it.
33227 if (IndexVT == MVT::v2i32)
33228 return SDValue();
33229
33230 // If we don't have VLX and neither the source data nor the index is
33231 // 512 bits, we need to widen until one is.
33232 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33233 !Index.getSimpleValueType().is512BitVector()) {
33234 // Determine how much we need to widen by to get a 512-bit type.
33235 unsigned Factor = std::min(512/VT.getSizeInBits(),
33236 512/IndexVT.getSizeInBits());
33237 unsigned NumElts = VT.getVectorNumElements() * Factor;
33238
33239 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33240 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33241 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33242
33243 Src = ExtendToType(Src, VT, DAG);
33244 Index = ExtendToType(Index, IndexVT, DAG);
33245 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33246 }
33247
33248 SDVTList VTs = DAG.getVTList(MVT::Other);
33249 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33250 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33251 N->getMemoryVT(), N->getMemOperand());
33252}
33253
33254static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33255 SelectionDAG &DAG) {
33256
33257 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33258 MVT VT = Op.getSimpleValueType();
33259 MVT ScalarVT = VT.getScalarType();
33260 SDValue Mask = N->getMask();
33261 MVT MaskVT = Mask.getSimpleValueType();
33262 SDValue PassThru = N->getPassThru();
33263 SDLoc dl(Op);
33264
33265 // Handle AVX masked loads which don't support passthru other than 0.
33266 if (MaskVT.getVectorElementType() != MVT::i1) {
33267 // We also allow undef in the isel pattern.
33268 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33269 return Op;
33270
33271 SDValue NewLoad = DAG.getMaskedLoad(
33272 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33273 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33274 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33275 N->isExpandingLoad());
33276 // Emit a blend.
33277 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33278 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33279 }
33280
33281 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33282 "Expanding masked load is supported on AVX-512 target only!");
33283
33284 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33285 "Expanding masked load is supported for 32 and 64-bit types only!");
33286
33287 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33288 "Cannot lower masked load op.");
33289
33290 assert((ScalarVT.getSizeInBits() >= 32 ||
33291 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33292 ScalarVT == MVT::f16))) &&
33293 "Unsupported masked load op.");
33294
33295 // This operation is legal for targets with VLX, but without
33296 // VLX the vector should be widened to 512 bit
33297 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33298 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33299 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33300
33301 // Mask element has to be i1.
33302 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33303 "Unexpected mask type");
33304
33305 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33306
33307 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33308 SDValue NewLoad = DAG.getMaskedLoad(
33309 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33310 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33311 N->getExtensionType(), N->isExpandingLoad());
33312
33313 SDValue Extract =
33314 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33315 DAG.getVectorIdxConstant(0, dl));
33316 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33317 return DAG.getMergeValues(RetOps, dl);
33318}
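// Illustrative sketch (pseudo-IR, an assumption about the emitted pattern):
// AVX masked loads use a sign-bit vector mask and always zero the inactive
// lanes, so a non-zero passthru is handled as a zero-passthru load plus a
// blend, matching the code above:
//   %l = masked.load %p, %mask, zeroinitializer
//   %r = vselect %mask, %l, %passthru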
33319
33320static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33321 SelectionDAG &DAG) {
33322 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33323 SDValue DataToStore = N->getValue();
33324 MVT VT = DataToStore.getSimpleValueType();
33325 MVT ScalarVT = VT.getScalarType();
33326 SDValue Mask = N->getMask();
33327 SDLoc dl(Op);
33328
33329 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33330 "Compressing masked store is supported on AVX-512 target only!");
33331
33332 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33333 "Compressing masked store is supported for 32 and 64-bit types only!");
33334
33335 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33336 "Cannot lower masked store op.");
33337
33338 assert((ScalarVT.getSizeInBits() >= 32 ||
33339 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33340 ScalarVT == MVT::f16))) &&
33341 "Unsupported masked store op.");
33342
33343 // This operation is legal for targets with VLX, but without
33344 // VLX the vector should be widened to 512 bit
33345 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33346 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33347
33348 // Mask element has to be i1.
33349 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33350 "Unexpected mask type");
33351
33352 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33353
33354 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33355 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33356 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33357 N->getOffset(), Mask, N->getMemoryVT(),
33358 N->getMemOperand(), N->getAddressingMode(),
33359 N->isTruncatingStore(), N->isCompressingStore());
33360}
33361
33362static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33363 SelectionDAG &DAG) {
33364 assert(Subtarget.hasAVX2() &&
33365 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33366
33367 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33368 SDLoc dl(Op);
33369 MVT VT = Op.getSimpleValueType();
33370 SDValue Index = N->getIndex();
33371 SDValue Mask = N->getMask();
33372 SDValue PassThru = N->getPassThru();
33373 MVT IndexVT = Index.getSimpleValueType();
33374
33375 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33376
33377 // If the index is v2i32, we're being called by type legalization.
33378 if (IndexVT == MVT::v2i32)
33379 return SDValue();
33380
33381 // If we don't have VLX and neither the passthru nor the index is
33382 // 512 bits, we need to widen until one is.
33383 MVT OrigVT = VT;
33384 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33385 !IndexVT.is512BitVector()) {
33386 // Determine how much we need to widen by to get a 512-bit type.
33387 unsigned Factor = std::min(512/VT.getSizeInBits(),
33388 512/IndexVT.getSizeInBits());
33389
33390 unsigned NumElts = VT.getVectorNumElements() * Factor;
33391
33392 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33393 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33394 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33395
33396 PassThru = ExtendToType(PassThru, VT, DAG);
33397 Index = ExtendToType(Index, IndexVT, DAG);
33398 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33399 }
33400
33401 // Break dependency on the data register.
33402 if (PassThru.isUndef())
33403 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33404
33405 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33406 N->getScale() };
33407 SDValue NewGather = DAG.getMemIntrinsicNode(
33408 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33409 N->getMemOperand());
33410 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33411 DAG.getVectorIdxConstant(0, dl));
33412 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33413}
33414
33415 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33416 SDLoc dl(Op);
33417 SDValue Src = Op.getOperand(0);
33418 MVT DstVT = Op.getSimpleValueType();
33419
33420 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33421 unsigned SrcAS = N->getSrcAddressSpace();
33422
33423 assert(SrcAS != N->getDestAddressSpace() &&
33424 "addrspacecast must be between different address spaces");
33425
33426 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33427 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33428 } else if (DstVT == MVT::i64) {
33429 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33430 } else if (DstVT == MVT::i32) {
33431 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33432 } else {
33433 report_fatal_error("Bad address space in addrspacecast");
33434 }
33435 return Op;
33436}
33437
33438SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33439 SelectionDAG &DAG) const {
33440 // TODO: Eventually, the lowering of these nodes should be informed by or
33441 // deferred to the GC strategy for the function in which they appear. For
33442 // now, however, they must be lowered to something. Since they are logically
33443 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33444 // require special handling for these nodes), lower them as literal NOOPs for
33445 // the time being.
33446 SmallVector<SDValue, 2> Ops;
33447 Ops.push_back(Op.getOperand(0));
33448 if (Op->getGluedNode())
33449 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33450
33451 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33452 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33453}
33454
33455// Custom split CVTPS2PH with wide types.
33456 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33457 SDLoc dl(Op);
33458 EVT VT = Op.getValueType();
33459 SDValue Lo, Hi;
33460 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33461 EVT LoVT, HiVT;
33462 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33463 SDValue RC = Op.getOperand(1);
33464 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33465 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33466 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33467}
33468
33469 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33470 SelectionDAG &DAG) {
33471 unsigned IsData = Op.getConstantOperandVal(4);
33472
33473 // We don't support non-data prefetch without PREFETCHI.
33474 // Just preserve the chain.
33475 if (!IsData && !Subtarget.hasPREFETCHI())
33476 return Op.getOperand(0);
33477
33478 return Op;
33479}
33480
33481 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33482 SDNode *N = Op.getNode();
33483 SDValue Operand = N->getOperand(0);
33484 EVT VT = Operand.getValueType();
33485 SDLoc dl(N);
33486
33487 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33488
33489 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33490 // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33491 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33492 // promote this operator's result!
33493 SDValue Chain = DAG.getEntryNode();
33494 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33495 {Chain, Operand, One});
33496 return StrictFmul;
33497}
33498
33499 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33500 unsigned OpNo) {
33501 const APInt Operand(32, OpNo);
33502 std::string OpNoStr = llvm::toString(Operand, 10, false);
33503 std::string Str(" $");
33504
33505 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33506 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33507
33508 auto I = StringRef::npos;
33509 for (auto &AsmStr : AsmStrs) {
33510 // Match the OpNo string. We must match exactly, to avoid matching a
33511 // sub-string, e.g. "$12" contains "$1".
33512 if (AsmStr.ends_with(OpNoStr1))
33513 I = AsmStr.size() - OpNoStr1.size();
33514
33515 // Get the index of operand in AsmStr.
33516 if (I == StringRef::npos)
33517 I = AsmStr.find(OpNoStr1 + ",");
33518 if (I == StringRef::npos)
33519 I = AsmStr.find(OpNoStr2);
33520
33521 if (I == StringRef::npos)
33522 continue;
33523
33524 assert(I > 0 && "Unexpected inline asm string!");
33525 // Remove the operand string and label (if it exists).
33526 // For example:
33527 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33528 // ==>
33529 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33530 // ==>
33531 // "call dword ptr "
33532 auto TmpStr = AsmStr.substr(0, I);
33533 I = TmpStr.rfind(':');
33534 if (I != StringRef::npos)
33535 TmpStr = TmpStr.substr(I + 1);
33536 return TmpStr.take_while(llvm::isAlpha);
33537 }
33538
33539 return StringRef();
33540}
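// Illustrative sketch: for the MS inline-asm string
//   ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
// and OpNo == 0, the loop above trims the operand text and the label and
// returns "call", which isInlineAsmTargetBranch() below matches
// case-insensitively to reclassify the operand as TargetLowering::C_Address.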
33541
33542 bool X86TargetLowering::isInlineAsmTargetBranch(
33543 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33544 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33545 // changed from indirect TargetLowering::C_Memory to direct
33546 // TargetLowering::C_Address.
33547 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33548 // location.
33549 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33550 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33551}
33552
33553 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33554 SDValue Mask) {
33555 EVT Ty = MVT::i8;
33556 auto V = DAG.getBitcast(MVT::i1, Mask);
33557 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33558 auto Zero = DAG.getConstant(0, DL, Ty);
33559 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33560 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33561 return SDValue(CmpZero.getNode(), 1);
33562}
33563
33564 SDValue X86TargetLowering::visitMaskedLoad(
33565 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33566 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33567 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33568 // ->
33569 // _, flags = SUB 0, mask
33570 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33571 // bit_cast_to_vector<res>
33572 EVT VTy = PassThru.getValueType();
33573 EVT Ty = VTy.getVectorElementType();
33574 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33575 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33576 : DAG.getBitcast(Ty, PassThru);
33577 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33578 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33579 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33580 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33581 return DAG.getBitcast(VTy, NewLoad);
33582}
33583
33584 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33585 SDValue Chain,
33586 MachineMemOperand *MMO, SDValue Ptr,
33587 SDValue Val, SDValue Mask) const {
33588 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33589 // ->
33590 // _, flags = SUB 0, mask
33591 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33592 EVT Ty = Val.getValueType().getVectorElementType();
33593 SDVTList Tys = DAG.getVTList(MVT::Other);
33594 auto ScalarVal = DAG.getBitcast(Ty, Val);
33595 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33596 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33597 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33598 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33599}
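// Illustrative sketch (assumption: the APX conditional-faulting forms,
// written as pseudo-IR): a single-element masked store such as
//   llvm.masked.store.v1i64(<1 x i64> %v, ptr %p, i32 8, <1 x i1> %m)
// becomes
//   %flags = X86ISD::SUB 0, zext(%m)         ; ZF set iff %m == 0
//   X86ISD::CSTORE %v, %p, COND_NE, %flags   ; store only when %m was 1
// mirroring the CLOAD expansion in visitMaskedLoad above.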
33600
33601/// Provide custom lowering hooks for some operations.
33602 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33603 switch (Op.getOpcode()) {
33604 // clang-format off
33605 default: llvm_unreachable("Should not custom lower this!");
33606 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33607 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33608 return LowerCMP_SWAP(Op, Subtarget, DAG);
33609 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33610 case ISD::ATOMIC_LOAD_ADD:
33611 case ISD::ATOMIC_LOAD_SUB:
33612 case ISD::ATOMIC_LOAD_OR:
33613 case ISD::ATOMIC_LOAD_XOR:
33614 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33615 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33616 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33617 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33618 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33619 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33620 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33621 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33622 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33623 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33624 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33625 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33626 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33627 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33628 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33629 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33630 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33631 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33632 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33633 case ISD::SHL_PARTS:
33634 case ISD::SRA_PARTS:
33635 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33636 case ISD::FSHL:
33637 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33638 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33639 case ISD::STRICT_SINT_TO_FP:
33640 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33641 case ISD::STRICT_UINT_TO_FP:
33642 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33643 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33644 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33645 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33646 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33647 case ISD::SIGN_EXTEND_VECTOR_INREG:
33648 case ISD::ZERO_EXTEND_VECTOR_INREG:
33649 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33650 case ISD::FP_TO_SINT:
33651 case ISD::STRICT_FP_TO_SINT:
33652 case ISD::FP_TO_UINT:
33653 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33654 case ISD::FP_TO_SINT_SAT:
33655 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33656 case ISD::FP_EXTEND:
33657 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33658 case ISD::FP_ROUND:
33659 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33660 case ISD::FP16_TO_FP:
33661 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33662 case ISD::FP_TO_FP16:
33663 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33664 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33665 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33666 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33667 case ISD::FADD:
33668 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33669 case ISD::FROUND: return LowerFROUND(Op, DAG);
33670 case ISD::FABS:
33671 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33672 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33673 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33674 case ISD::LRINT:
33675 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33676 case ISD::SETCC:
33677 case ISD::STRICT_FSETCC:
33678 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33679 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33680 case ISD::SELECT: return LowerSELECT(Op, DAG);
33681 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33682 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33683 case ISD::VASTART: return LowerVASTART(Op, DAG);
33684 case ISD::VAARG: return LowerVAARG(Op, DAG);
33685 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33686 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33687 case ISD::INTRINSIC_VOID:
33688 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33689 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33690 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33691 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33692 case ISD::FRAME_TO_ARGS_OFFSET:
33693 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33694 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33695 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33696 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33697 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33698 case ISD::EH_SJLJ_SETUP_DISPATCH:
33699 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33700 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33701 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33702 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33703 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33704 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33705 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33706 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33707 case ISD::CTLZ:
33708 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33709 case ISD::CTTZ:
33710 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33711 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33712 case ISD::MULHS:
33713 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33714 case ISD::ROTL:
33715 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33716 case ISD::SRA:
33717 case ISD::SRL:
33718 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33719 case ISD::SADDO:
33720 case ISD::UADDO:
33721 case ISD::SSUBO:
33722 case ISD::USUBO: return LowerXALUO(Op, DAG);
33723 case ISD::SMULO:
33724 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33725 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33726 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33727 case ISD::SADDO_CARRY:
33728 case ISD::SSUBO_CARRY:
33729 case ISD::UADDO_CARRY:
33730 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33731 case ISD::ADD:
33732 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33733 case ISD::UADDSAT:
33734 case ISD::SADDSAT:
33735 case ISD::USUBSAT:
33736 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33737 case ISD::SMAX:
33738 case ISD::SMIN:
33739 case ISD::UMAX:
33740 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33741 case ISD::FMINIMUM:
33742 case ISD::FMAXIMUM:
33743 case ISD::FMINIMUMNUM:
33744 case ISD::FMAXIMUMNUM:
33745 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33746 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33747 case ISD::ABDS:
33748 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33749 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33750 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33751 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33752 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33753 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33754 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33755 case ISD::GC_TRANSITION_START:
33756 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33757 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33758 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33759 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33760 // clang-format on
33761 }
33762}
33763
33764/// Replace a node with an illegal result type with a new node built out of
33765/// custom code.
33766 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33767 SmallVectorImpl<SDValue> &Results,
33768 SelectionDAG &DAG) const {
33769 SDLoc dl(N);
33770 unsigned Opc = N->getOpcode();
33771 switch (Opc) {
33772 default:
33773#ifndef NDEBUG
33774 dbgs() << "ReplaceNodeResults: ";
33775 N->dump(&DAG);
33776#endif
33777 llvm_unreachable("Do not know how to custom type legalize this operation!");
33778 case X86ISD::CVTPH2PS: {
33779 EVT VT = N->getValueType(0);
33780 SDValue Lo, Hi;
33781 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33782 EVT LoVT, HiVT;
33783 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33784 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33785 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33786 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33787 Results.push_back(Res);
33788 return;
33789 }
33790 case X86ISD::STRICT_CVTPH2PS: {
33791 EVT VT = N->getValueType(0);
33792 SDValue Lo, Hi;
33793 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33794 EVT LoVT, HiVT;
33795 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33796 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33797 {N->getOperand(0), Lo});
33798 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33799 {N->getOperand(0), Hi});
33800 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33801 Lo.getValue(1), Hi.getValue(1));
33802 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33803 Results.push_back(Res);
33804 Results.push_back(Chain);
33805 return;
33806 }
33807 case X86ISD::CVTPS2PH:
33808 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33809 return;
33810 case ISD::CTPOP: {
33811 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33812 // If we have at most 32 active bits, then perform as i32 CTPOP.
33813 // TODO: Perform this in generic legalizer?
33814 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33815 unsigned LZ = Known.countMinLeadingZeros();
33816 unsigned TZ = Known.countMinTrailingZeros();
33817 if ((LZ + TZ) >= 32) {
33818 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33819 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33820 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33821 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33822 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33823 Results.push_back(Op);
33824 return;
33825 }
33826 // Use a v2i64 if possible.
33827 bool NoImplicitFloatOps =
33828 DAG.getMachineFunction().getFunction().hasFnAttribute(
33829 Attribute::NoImplicitFloat);
33830 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33831 SDValue Wide =
33832 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33833 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33834 // Bit count should fit in 32-bits, extract it as that and then zero
33835 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33836 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33837 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33838 DAG.getVectorIdxConstant(0, dl));
33839 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33840 Results.push_back(Wide);
33841 }
33842 return;
33843 }
33844 case ISD::MUL: {
33845 EVT VT = N->getValueType(0);
33846 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33847 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33848 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33849 // elements are needed.
33850 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33851 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33852 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33853 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33854 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33855 unsigned NumConcats = 16 / VT.getVectorNumElements();
33856 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33857 ConcatOps[0] = Res;
33858 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33859 Results.push_back(Res);
33860 return;
33861 }
33862 case ISD::SMULO:
33863 case ISD::UMULO: {
33864 EVT VT = N->getValueType(0);
33865 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33866 VT == MVT::v2i32 && "Unexpected VT!");
33867 bool IsSigned = Opc == ISD::SMULO;
33868 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33869 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33870 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33871 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33872 // Extract the high 32 bits from each result using PSHUFD.
33873 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33874 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33875 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33876 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33877 DAG.getVectorIdxConstant(0, dl));
33878
33879 // Truncate the low bits of the result. This will become PSHUFD.
33880 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33881
33882 SDValue HiCmp;
33883 if (IsSigned) {
33884 // SMULO overflows if the high bits don't match the sign of the low.
33885 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33886 } else {
33887 // UMULO overflows if the high bits are non-zero.
33888 HiCmp = DAG.getConstant(0, dl, VT);
33889 }
33890 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33891
33892 // Widen the result by padding with undef.
33893 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33894 DAG.getUNDEF(VT));
33895 Results.push_back(Res);
33896 Results.push_back(Ovf);
33897 return;
33898 }
33899 case X86ISD::VPMADDWD: {
33900 // Legalize types for X86ISD::VPMADDWD by widening.
33901 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33902
33903 EVT VT = N->getValueType(0);
33904 EVT InVT = N->getOperand(0).getValueType();
33905 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33906 "Expected a VT that divides into 128 bits.");
33907 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33908 "Unexpected type action!");
33909 unsigned NumConcat = 128 / InVT.getSizeInBits();
33910
33911 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33912 InVT.getVectorElementType(),
33913 NumConcat * InVT.getVectorNumElements());
33914 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33915 VT.getVectorElementType(),
33916 NumConcat * VT.getVectorNumElements());
33917
33918 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33919 Ops[0] = N->getOperand(0);
33920 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33921 Ops[0] = N->getOperand(1);
33922 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33923
33924 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33925 Results.push_back(Res);
33926 return;
33927 }
33928 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33929 case X86ISD::FMINC:
33930 case X86ISD::FMIN:
33931 case X86ISD::FMAXC:
33932 case X86ISD::FMAX:
33933 case X86ISD::STRICT_FMIN:
33934 case X86ISD::STRICT_FMAX: {
33935 EVT VT = N->getValueType(0);
33936 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33937 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33938 SDValue UNDEF = DAG.getUNDEF(VT);
33939 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33940 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33941 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33942 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33943 SDValue Res;
33944 if (IsStrict)
33945 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33946 {N->getOperand(0), LHS, RHS});
33947 else
33948 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33949 Results.push_back(Res);
33950 if (IsStrict)
33951 Results.push_back(Res.getValue(1));
33952 return;
33953 }
33954 case ISD::SDIV:
33955 case ISD::UDIV:
33956 case ISD::SREM:
33957 case ISD::UREM: {
33958 EVT VT = N->getValueType(0);
33959 if (VT.isVector()) {
33960 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33961 "Unexpected type action!");
33962 // If this RHS is a constant splat vector we can widen this and let
33963 // division/remainder by constant optimize it.
33964 // TODO: Can we do something for non-splat?
33965 APInt SplatVal;
33966 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33967 unsigned NumConcats = 128 / VT.getSizeInBits();
33968 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33969 Ops0[0] = N->getOperand(0);
33970 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33971 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33972 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33973 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33974 Results.push_back(Res);
33975 }
33976 return;
33977 }
33978
33979 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33980 Results.push_back(V);
33981 return;
33982 }
33983 case ISD::TRUNCATE: {
33984 MVT VT = N->getSimpleValueType(0);
33985 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33986 return;
33987
33988 // The generic legalizer will try to widen the input type to the same
33989 // number of elements as the widened result type. But this isn't always
33990 // the best thing so do some custom legalization to avoid some cases.
33991 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33992 SDValue In = N->getOperand(0);
33993 EVT InVT = In.getValueType();
33994 EVT InEltVT = InVT.getVectorElementType();
33995 EVT EltVT = VT.getVectorElementType();
33996 unsigned MinElts = VT.getVectorNumElements();
33997 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33998 unsigned InBits = InVT.getSizeInBits();
33999
34000 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
34001 unsigned PackOpcode;
34002 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34003 Subtarget, N->getFlags())) {
34004 if (SDValue Res =
34005 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34006 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34007 Results.push_back(Res);
34008 return;
34009 }
34010 }
34011
34012 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34013 // 128 bit and smaller inputs should avoid truncate altogether and
34014 // use a shuffle.
34015 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34016 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34017 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34018 for (unsigned I = 0; I < MinElts; ++I)
34019 TruncMask[I] = Scale * I;
34020 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34021 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34022 "Illegal vector type in truncation");
34023 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34024 Results.push_back(
34025 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34026 return;
34027 }
34028 }
34029
34030 // With AVX512 there are some cases that can use a target specific
34031 // truncate node to go from 256/512 to less than 128 with zeros in the
34032 // upper elements of the 128 bit result.
34033 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34034 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34035 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34036 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34037 return;
34038 }
34039 // There's one case we can widen to 512 bits and use VTRUNC.
34040 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34041 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34042 DAG.getUNDEF(MVT::v4i64));
34043 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34044 return;
34045 }
34046 }
34047 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34048 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34049 isTypeLegal(MVT::v4i64)) {
34050 // Input needs to be split and output needs to widened. Let's use two
34051 // VTRUNCs, and shuffle their results together into the wider type.
34052 SDValue Lo, Hi;
34053 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34054
34055 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34056 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34057 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34058 { 0, 1, 2, 3, 16, 17, 18, 19,
34059 -1, -1, -1, -1, -1, -1, -1, -1 });
34060 Results.push_back(Res);
34061 return;
34062 }
34063
34064 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34065 // this via type legalization.
34066 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34067 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34068 (!Subtarget.hasSSSE3() ||
34069 (!isTypeLegal(InVT) &&
34070 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34071 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34072 InEltVT.getSizeInBits() * WidenNumElts);
34073 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34074 return;
34075 }
34076
34077 return;
34078 }
34079 case ISD::ANY_EXTEND:
34080 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34081 // It's intended to custom handle the input type.
34082 assert(N->getValueType(0) == MVT::v8i8 &&
34083 "Do not know how to legalize this Node");
34084 return;
34085 case ISD::SIGN_EXTEND:
34086 case ISD::ZERO_EXTEND: {
34087 EVT VT = N->getValueType(0);
34088 SDValue In = N->getOperand(0);
34089 EVT InVT = In.getValueType();
34090 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34091 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34092 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34093 "Unexpected type action!");
34094 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34095 // Custom split this so we can extend i8/i16->i32 invec. This is better
34096 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34097 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
34098 // we allow the sra from the extend to i32 to be shared by the split.
34099 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34100
34101 // Fill a vector with sign bits for each element.
34102 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34103 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34104
34105 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34106 // to v2i64.
34107 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34108 {0, 4, 1, 5});
34109 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34110 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34111 {2, 6, 3, 7});
34112 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34113
34114 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34115 Results.push_back(Res);
34116 return;
34117 }
34118
34119 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34120 if (!InVT.is128BitVector()) {
34121 // Not a 128 bit vector, but maybe type legalization will promote
34122 // it to 128 bits.
34123 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34124 return;
34125 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34126 if (!InVT.is128BitVector())
34127 return;
34128
34129 // Promote the input to 128 bits. Type legalization will turn this into
34130 // zext_inreg/sext_inreg.
34131 In = DAG.getNode(Opc, dl, InVT, In);
34132 }
34133
34134 // Perform custom splitting instead of the two stage extend we would get
34135 // by default.
34136 EVT LoVT, HiVT;
34137 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34138 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34139
34140 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34141
34142 // We need to shift the input over by half the number of elements.
34143 unsigned NumElts = InVT.getVectorNumElements();
34144 unsigned HalfNumElts = NumElts / 2;
34145 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34146 for (unsigned i = 0; i != HalfNumElts; ++i)
34147 ShufMask[i] = i + HalfNumElts;
34148
34149 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34150 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34151
34152 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34153 Results.push_back(Res);
34154 }
34155 return;
34156 }
34157 case ISD::FP_TO_SINT_SAT:
34158 case ISD::FP_TO_UINT_SAT: {
34159 if (!Subtarget.hasAVX10_2())
34160 return;
34161
34162 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34163 EVT VT = N->getValueType(0);
34164 SDValue Op = N->getOperand(0);
34165 EVT OpVT = Op.getValueType();
34166 SDValue Res;
34167
34168 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34169 if (IsSigned)
34170 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34171 else
34172 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34173 Results.push_back(Res);
34174 }
34175 return;
34176 }
34177 case ISD::FP_TO_SINT:
34178 case ISD::STRICT_FP_TO_SINT:
34179 case ISD::FP_TO_UINT:
34180 case ISD::STRICT_FP_TO_UINT: {
34181 bool IsStrict = N->isStrictFPOpcode();
34182 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34183 EVT VT = N->getValueType(0);
34184 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34185 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34186 EVT SrcVT = Src.getValueType();
34187
34188 SDValue Res;
34189 if (isSoftF16(SrcVT, Subtarget)) {
34190 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34191 if (IsStrict) {
34192 Res =
34193 DAG.getNode(Opc, dl, {VT, MVT::Other},
34194 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34195 {NVT, MVT::Other}, {Chain, Src})});
34196 Chain = Res.getValue(1);
34197 } else {
34198 Res =
34199 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34200 }
34201 Results.push_back(Res);
34202 if (IsStrict)
34203 Results.push_back(Chain);
34204
34205 return;
34206 }
34207
34208 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34209 SrcVT.getVectorElementType() == MVT::f16) {
34210 EVT EleVT = VT.getVectorElementType();
34211 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34212
34213 if (SrcVT != MVT::v8f16) {
34214 SDValue Tmp =
34215 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34216 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34217 Ops[0] = Src;
34218 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34219 }
34220
34221 if (IsStrict) {
34222 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34223 Res =
34224 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34225 Chain = Res.getValue(1);
34226 } else {
34227 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34228 Res = DAG.getNode(Opc, dl, ResVT, Src);
34229 }
34230
34231 // TODO: Need to add exception check code for strict FP.
34232 if (EleVT.getSizeInBits() < 16) {
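        // The packed conversion above produced v8i16 lanes, so sub-16-bit
        // result elements are recovered by truncating to the narrow type and
        // then padding back out to a full 128-bit vector below.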
34233 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34234 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34235
34236 // Now widen to 128 bits.
34237 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34238 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34239 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34240 ConcatOps[0] = Res;
34241 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34242 }
34243
34244 Results.push_back(Res);
34245 if (IsStrict)
34246 Results.push_back(Chain);
34247
34248 return;
34249 }
34250
34251    if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34252      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34253             "Unexpected type action!");
34254
34255 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34256 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34257      MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34258                                       VT.getVectorNumElements());
34259 SDValue Res;
34260 SDValue Chain;
34261 if (IsStrict) {
34262 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34263 {N->getOperand(0), Src});
34264 Chain = Res.getValue(1);
34265 } else
34266 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34267
34268 // Preserve what we know about the size of the original result. If the
34269 // result is v2i32, we have to manually widen the assert.
34270 if (PromoteVT == MVT::v2i32)
34271 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34272 DAG.getUNDEF(MVT::v2i32));
34273
34274 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34275                        Res.getValueType(), Res,
34276                        DAG.getValueType(VT.getVectorElementType()));
34277
34278 if (PromoteVT == MVT::v2i32)
34279 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34280 DAG.getVectorIdxConstant(0, dl));
34281
34282 // Truncate back to the original width.
34283 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34284
34285 // Now widen to 128 bits.
34286      unsigned NumConcats = 128 / VT.getSizeInBits();
34287      MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34288 VT.getVectorNumElements() * NumConcats);
34289 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34290 ConcatOps[0] = Res;
34291 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34292 Results.push_back(Res);
34293 if (IsStrict)
34294 Results.push_back(Chain);
34295 return;
34296 }
34297
34298
34299 if (VT == MVT::v2i32) {
34300 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34301 "Strict unsigned conversion requires AVX512");
34302      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34303      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34304             "Unexpected type action!");
34305 if (Src.getValueType() == MVT::v2f64) {
34306 if (!IsSigned && !Subtarget.hasAVX512()) {
34307 SDValue Res =
34308 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34309 Results.push_back(Res);
34310 return;
34311 }
34312
34313        if (IsStrict)
34314          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34315 else
34316 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34317
34318        // If we have VLX we can emit a target specific FP_TO_UINT node.
34319 if (!IsSigned && !Subtarget.hasVLX()) {
34320 // Otherwise we can defer to the generic legalizer which will widen
34321 // the input as well. This will be further widened during op
34322 // legalization to v8i32<-v8f64.
34323 // For strict nodes we'll need to widen ourselves.
34324 // FIXME: Fix the type legalizer to safely widen strict nodes?
34325 if (!IsStrict)
34326 return;
34327 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34328 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34329 Opc = N->getOpcode();
34330 }
34331 SDValue Res;
34332 SDValue Chain;
34333 if (IsStrict) {
34334 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34335 {N->getOperand(0), Src});
34336 Chain = Res.getValue(1);
34337 } else {
34338 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34339 }
34340 Results.push_back(Res);
34341 if (IsStrict)
34342 Results.push_back(Chain);
34343 return;
34344 }
34345
34346 // Custom widen strict v2f32->v2i32 by padding with zeros.
34347 // FIXME: Should generic type legalizer do this?
34348 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34349 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34350 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34351 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34352 {N->getOperand(0), Src});
34353 Results.push_back(Res);
34354 Results.push_back(Res.getValue(1));
34355 return;
34356 }
34357
34358 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34359 // so early out here.
34360 return;
34361 }
34362
34363 assert(!VT.isVector() && "Vectors should have been handled above!");
34364
34365 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34366 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34367 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34368 assert(!Subtarget.is64Bit() && "i64 should be legal");
34369 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34370 // If we use a 128-bit result we might need to use a target specific node.
34371 unsigned SrcElts =
34372 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34373 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34374 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34375 if (NumElts != SrcElts) {
34376        if (IsStrict)
34377          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34378 else
34379 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34380 }
34381
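    // Place the scalar operand in lane 0 of an otherwise-zero vector so the
    // packed conversion can be used; lane 0 of the result is extracted below.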
34382 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34383 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34384 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34385 ZeroIdx);
34386 SDValue Chain;
34387 if (IsStrict) {
34388 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34389 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34390 Chain = Res.getValue(1);
34391 } else
34392 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34393 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34394 Results.push_back(Res);
34395 if (IsStrict)
34396 Results.push_back(Chain);
34397 return;
34398 }
34399
34400 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34401 SDValue Chain;
34402 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34403 Results.push_back(V);
34404 if (IsStrict)
34405 Results.push_back(Chain);
34406 return;
34407 }
34408
34409 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34410 Results.push_back(V);
34411 if (IsStrict)
34412 Results.push_back(Chain);
34413 }
34414 return;
34415 }
34416 case ISD::LRINT:
34417 if (N->getValueType(0) == MVT::v2i32) {
34418 SDValue Src = N->getOperand(0);
34419 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34421 DAG.getUNDEF(MVT::v2f16));
34422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34423 DAG.getUNDEF(MVT::v4f16));
34424 } else if (Src.getValueType() != MVT::v2f64) {
34425 return;
34426 }
34427 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34428 return;
34429 }
34430 [[fallthrough]];
34431 case ISD::LLRINT: {
34432 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34433 Results.push_back(V);
34434 return;
34435 }
34436
34437  case ISD::SINT_TO_FP:
34438  case ISD::STRICT_SINT_TO_FP:
34439  case ISD::UINT_TO_FP:
34440  case ISD::STRICT_UINT_TO_FP: {
34441 bool IsStrict = N->isStrictFPOpcode();
34442 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34443 EVT VT = N->getValueType(0);
34444 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34445 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34446 Subtarget.hasVLX()) {
34447 if (Src.getValueType().getVectorElementType() == MVT::i16)
34448 return;
34449
34450 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34451 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34452 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34453 : DAG.getUNDEF(MVT::v2i32));
34454 if (IsStrict) {
34455        unsigned Opc =
34456            IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34457 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34458 {N->getOperand(0), Src});
34459 Results.push_back(Res);
34460 Results.push_back(Res.getValue(1));
34461 } else {
34462 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34463 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34464 }
34465 return;
34466 }
34467 if (VT != MVT::v2f32)
34468 return;
34469 EVT SrcVT = Src.getValueType();
34470 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34471 if (IsStrict) {
34472        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34473                                : X86ISD::STRICT_CVTUI2P;
34474 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34475 {N->getOperand(0), Src});
34476 Results.push_back(Res);
34477 Results.push_back(Res.getValue(1));
34478 } else {
34479 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34480 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34481 }
34482 return;
34483 }
34484 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34485 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
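      // Unsigned v2i64 values with the sign bit set do not fit a signed
      // conversion. The sequence below halves such values while keeping the
      // rounding bit ((x >> 1) | (x & 1)), converts each lane with a scalar
      // signed conversion, doubles the result with an FADD, and then selects
      // per lane between the direct and the doubled result.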
34486 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34487 SDValue One = DAG.getConstant(1, dl, SrcVT);
34488 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34489 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34490 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34491 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34492 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34493 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34494 for (int i = 0; i != 2; ++i) {
34495 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34496 SignSrc, DAG.getVectorIdxConstant(i, dl));
34497 if (IsStrict)
34498 SignCvts[i] =
34499 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34500 {N->getOperand(0), Elt});
34501 else
34502 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34503 };
34504 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34505 SDValue Slow, Chain;
34506 if (IsStrict) {
34507 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34508 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34509 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34510 {Chain, SignCvt, SignCvt});
34511 Chain = Slow.getValue(1);
34512 } else {
34513 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34514 }
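      // IsNeg holds all-ones/all-zeros v2i64 lanes; reinterpret it as v4i32
      // and pick elements {1, 3} (the high halves) so the mask lines up with
      // the two converted f32 lanes for the final per-element select.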
34515 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34516 IsNeg =
34517 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34518 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34519 Results.push_back(Cvt);
34520 if (IsStrict)
34521 Results.push_back(Chain);
34522 return;
34523 }
34524
34525 if (SrcVT != MVT::v2i32)
34526 return;
34527
34528 if (IsSigned || Subtarget.hasAVX512()) {
34529 if (!IsStrict)
34530 return;
34531
34532 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34533 // FIXME: Should generic type legalizer do this?
34534 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34535 DAG.getConstant(0, dl, MVT::v2i32));
34536 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34537 {N->getOperand(0), Src});
34538 Results.push_back(Res);
34539 Results.push_back(Res.getValue(1));
34540 return;
34541 }
34542
34543 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
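    // Standard double-precision bias trick for v2i32 unsigned -> float: zero
    // extend each i32 into the low half of an i64 lane, OR in the exponent
    // pattern of 2^52 (0x4330000000000000) so the integer sits in the f64
    // mantissa, subtract 2^52 to recover the exact value, then round to f32.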
34544 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34545 SDValue VBias = DAG.getConstantFP(
34546 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34547 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34548 DAG.getBitcast(MVT::v2i64, VBias));
34549 Or = DAG.getBitcast(MVT::v2f64, Or);
34550 if (IsStrict) {
34551 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34552                                {N->getOperand(0), Or, VBias});
34553      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34554 {MVT::v4f32, MVT::Other},
34555 {Sub.getValue(1), Sub});
34556 Results.push_back(Res);
34557 Results.push_back(Res.getValue(1));
34558 } else {
34559 // TODO: Are there any fast-math-flags to propagate here?
34560 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34561 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34562 }
34563 return;
34564 }
34565  case ISD::STRICT_FP_ROUND:
34566  case ISD::FP_ROUND: {
34567 bool IsStrict = N->isStrictFPOpcode();
34568 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34569 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34570 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34571 EVT SrcVT = Src.getValueType();
34572 EVT VT = N->getValueType(0);
34573 SDValue V;
34574 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34575 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34576 : DAG.getUNDEF(MVT::v2f32);
34577 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34578 }
34579 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34580 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34581 if (SrcVT.getVectorElementType() != MVT::f32)
34582 return;
34583
34584 if (IsStrict)
34585 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34586 {Chain, Src, Rnd});
34587 else
34588 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34589
34590 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34591 if (IsStrict)
34592 Results.push_back(V.getValue(1));
34593 return;
34594 }
34595 if (!isTypeLegal(Src.getValueType()))
34596 return;
34597 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34598 if (IsStrict)
34599 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34600 {Chain, Src});
34601 else
34602 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34603 Results.push_back(V);
34604 if (IsStrict)
34605 Results.push_back(V.getValue(1));
34606 return;
34607 }
34608 case ISD::FP_EXTEND:
34609 case ISD::STRICT_FP_EXTEND: {
34610 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34611 // No other ValueType for FP_EXTEND should reach this point.
34612 assert(N->getValueType(0) == MVT::v2f32 &&
34613 "Do not know how to legalize this Node");
34614 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34615 return;
34616 bool IsStrict = N->isStrictFPOpcode();
34617 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34618 if (Src.getValueType().getVectorElementType() != MVT::f16)
34619 return;
34620 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34621 : DAG.getUNDEF(MVT::v2f16);
34622 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34623 if (IsStrict)
34624 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34625 {N->getOperand(0), V});
34626 else
34627 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34628 Results.push_back(V);
34629 if (IsStrict)
34630 Results.push_back(V.getValue(1));
34631 return;
34632 }
34633  case ISD::INTRINSIC_W_CHAIN: {
34634    unsigned IntNo = N->getConstantOperandVal(1);
34635 switch (IntNo) {
34636 default : llvm_unreachable("Do not know how to custom type "
34637 "legalize this intrinsic operation!");
34638 case Intrinsic::x86_rdtsc:
34639 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34640 Results);
34641 case Intrinsic::x86_rdtscp:
34642 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34643 Results);
34644 case Intrinsic::x86_rdpmc:
34645 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34646 Results);
34647 return;
34648 case Intrinsic::x86_rdpru:
34649 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34650 Results);
34651 return;
34652 case Intrinsic::x86_xgetbv:
34653 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34654 Results);
34655 return;
34656 }
34657 }
34658 case ISD::READCYCLECOUNTER: {
34659 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34660 }
34661 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34662 EVT T = N->getValueType(0);
34663 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34664 bool Regs64bit = T == MVT::i128;
34665 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34666 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34667 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
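    // CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
    // the replacement value in ECX:EBX (RCX:RBX); the previous memory contents
    // come back in EDX:EAX and ZF reports whether the exchange happened.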
34668 SDValue cpInL, cpInH;
34669 std::tie(cpInL, cpInH) =
34670 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34671 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34672 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34673 cpInH =
34674 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34675 cpInH, cpInL.getValue(1));
34676 SDValue swapInL, swapInH;
34677 std::tie(swapInL, swapInH) =
34678 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34679 swapInH =
34680 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34681 swapInH, cpInH.getValue(1));
34682
34683 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34684 // until later. So we keep the RBX input in a vreg and use a custom
34685 // inserter.
34686 // Since RBX will be a reserved register the register allocator will not
34687 // make sure its value will be properly saved and restored around this
34688 // live-range.
34689 SDValue Result;
34690 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34691 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34692 if (Regs64bit) {
34693 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34694 swapInH.getValue(1)};
34695 Result =
34696 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34697 } else {
34698 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34699 swapInH.getValue(1));
34700 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34701 swapInL.getValue(1)};
34702 Result =
34703 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34704 }
34705
34706 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34707 Regs64bit ? X86::RAX : X86::EAX,
34708 HalfT, Result.getValue(1));
34709 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34710 Regs64bit ? X86::RDX : X86::EDX,
34711 HalfT, cpOutL.getValue(2));
34712 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34713
34714 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34715 MVT::i32, cpOutH.getValue(2));
34716 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34717 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34718
34719 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34720 Results.push_back(Success);
34721 Results.push_back(EFLAGS.getValue(1));
34722 return;
34723 }
34724 case ISD::ATOMIC_LOAD: {
34725 assert(
34726 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34727 "Unexpected VT!");
34728    bool NoImplicitFloatOps =
34729        DAG.getMachineFunction().getFunction().hasFnAttribute(
34730 Attribute::NoImplicitFloat);
34731 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34732 auto *Node = cast<AtomicSDNode>(N);
34733
34734 if (N->getValueType(0) == MVT::i128) {
34735 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34736 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34737 Node->getBasePtr(), Node->getMemOperand());
34738 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34739 DAG.getVectorIdxConstant(0, dl));
34740 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34741 DAG.getVectorIdxConstant(1, dl));
34742 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34743 {ResL, ResH}));
34744 Results.push_back(Ld.getValue(1));
34745 return;
34746 }
34747 break;
34748 }
34749 if (Subtarget.hasSSE1()) {
34750 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34751 // Then extract the lower 64-bits.
34752 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34753 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34754      SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34755      SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34756 MVT::i64, Node->getMemOperand());
34757 if (Subtarget.hasSSE2()) {
34758 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34759 DAG.getVectorIdxConstant(0, dl));
34760 Results.push_back(Res);
34761 Results.push_back(Ld.getValue(1));
34762 return;
34763 }
34764 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34765 // then casts to i64. This avoids a 128-bit stack temporary being
34766 // created by type legalization if we were to cast v4f32->v2i64.
34767 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34768 DAG.getVectorIdxConstant(0, dl));
34769 Res = DAG.getBitcast(MVT::i64, Res);
34770 Results.push_back(Res);
34771 Results.push_back(Ld.getValue(1));
34772 return;
34773 }
34774 if (Subtarget.hasX87()) {
34775 // First load this into an 80-bit X87 register. This will put the whole
34776 // integer into the significand.
34777 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34778      SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34779      SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34780 dl, Tys, Ops, MVT::i64,
34781 Node->getMemOperand());
34782 SDValue Chain = Result.getValue(1);
34783
34784 // Now store the X87 register to a stack temporary and convert to i64.
34785 // This store is not atomic and doesn't need to be.
34786 // FIXME: We don't need a stack temporary if the result of the load
34787 // is already being stored. We could just directly store there.
34788 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34789 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34790      MachinePointerInfo MPI =
34791          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34792 SDValue StoreOps[] = { Chain, Result, StackPtr };
34793 Chain = DAG.getMemIntrinsicNode(
34794 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34795 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34796
34797 // Finally load the value back from the stack temporary and return it.
34798 // This load is not atomic and doesn't need to be.
34799 // This load will be further type legalized.
34800 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34801 Results.push_back(Result);
34802 Results.push_back(Result.getValue(1));
34803 return;
34804 }
34805 }
34806 // TODO: Use MOVLPS when SSE1 is available?
34807 // Delegate to generic TypeLegalization. Situations we can really handle
34808 // should have already been dealt with by AtomicExpandPass.cpp.
34809 break;
34810 }
34811 case ISD::ATOMIC_SWAP:
34812 case ISD::ATOMIC_LOAD_ADD:
34813 case ISD::ATOMIC_LOAD_SUB:
34814 case ISD::ATOMIC_LOAD_AND:
34815 case ISD::ATOMIC_LOAD_OR:
34816 case ISD::ATOMIC_LOAD_XOR:
34817 case ISD::ATOMIC_LOAD_NAND:
34818 case ISD::ATOMIC_LOAD_MIN:
34819 case ISD::ATOMIC_LOAD_MAX:
34820 case ISD::ATOMIC_LOAD_UMIN:
34821 case ISD::ATOMIC_LOAD_UMAX:
34822 // Delegate to generic TypeLegalization. Situations we can really handle
34823 // should have already been dealt with by AtomicExpandPass.cpp.
34824 break;
34825
34826 case ISD::BITCAST: {
34827 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34828 EVT DstVT = N->getValueType(0);
34829 EVT SrcVT = N->getOperand(0).getValueType();
34830
34831 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34832 // we can split using the k-register rather than memory.
34833 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34834 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34835 SDValue Lo, Hi;
34836 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34837 Lo = DAG.getBitcast(MVT::i32, Lo);
34838 Hi = DAG.getBitcast(MVT::i32, Hi);
34839 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34840 Results.push_back(Res);
34841 return;
34842 }
34843
34844 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34845 // FIXME: Use v4f32 for SSE1?
34846 assert(Subtarget.hasSSE2() && "Requires SSE2");
34847 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34848 "Unexpected type action!");
34849 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34850 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34851 N->getOperand(0));
34852 Res = DAG.getBitcast(WideVT, Res);
34853 Results.push_back(Res);
34854 return;
34855 }
34856
34857 return;
34858 }
34859 case ISD::MGATHER: {
34860 EVT VT = N->getValueType(0);
34861 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34862 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34863 auto *Gather = cast<MaskedGatherSDNode>(N);
34864 SDValue Index = Gather->getIndex();
34865 if (Index.getValueType() != MVT::v2i64)
34866 return;
34867      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34868             "Unexpected type action!");
34869 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34870 SDValue Mask = Gather->getMask();
34871 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34872 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34873 Gather->getPassThru(),
34874 DAG.getUNDEF(VT));
34875 if (!Subtarget.hasVLX()) {
34876 // We need to widen the mask, but the instruction will only use 2
34877 // of its elements. So we can use undef.
34878 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34879 DAG.getUNDEF(MVT::v2i1));
34880 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34881 }
34882 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34883 Gather->getBasePtr(), Index, Gather->getScale() };
34884 SDValue Res = DAG.getMemIntrinsicNode(
34885 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34886 Gather->getMemoryVT(), Gather->getMemOperand());
34887 Results.push_back(Res);
34888 Results.push_back(Res.getValue(1));
34889 return;
34890 }
34891 return;
34892 }
34893 case ISD::LOAD: {
34894 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34895 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34896 // cast since type legalization will try to use an i64 load.
34897 MVT VT = N->getSimpleValueType(0);
34898    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34899    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34900 "Unexpected type action!");
34901 if (!ISD::isNON_EXTLoad(N))
34902 return;
34903 auto *Ld = cast<LoadSDNode>(N);
34904 if (Subtarget.hasSSE2()) {
34905 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34906 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34907 Ld->getPointerInfo(), Ld->getBaseAlign(),
34908 Ld->getMemOperand()->getFlags());
34909 SDValue Chain = Res.getValue(1);
34910 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34911 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34912 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34913 Res = DAG.getBitcast(WideVT, Res);
34914 Results.push_back(Res);
34915 Results.push_back(Chain);
34916 return;
34917 }
34918 assert(Subtarget.hasSSE1() && "Expected SSE");
34919 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34920    SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34921    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34922 MVT::i64, Ld->getMemOperand());
34923 Results.push_back(Res);
34924 Results.push_back(Res.getValue(1));
34925 return;
34926 }
34927 case ISD::ADDRSPACECAST: {
34928 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34929 Results.push_back(V);
34930 return;
34931 }
34932 case ISD::BITREVERSE: {
34933 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34934 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34935 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34936 // We'll need to move the scalar in two i32 pieces.
34937 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34938 return;
34939 }
34940  case ISD::EXTRACT_VECTOR_ELT: {
34941    // f16 = extract vXf16 %vec, i64 %idx
34942 assert(N->getSimpleValueType(0) == MVT::f16 &&
34943 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34944 assert(Subtarget.hasFP16() && "Expected FP16");
34945    SDValue VecOp = N->getOperand(0);
34946    EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34947 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34948 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34949 N->getOperand(1));
34950 Split = DAG.getBitcast(MVT::f16, Split);
34951 Results.push_back(Split);
34952 return;
34953 }
34954 }
34955}
34956
34957const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34958 switch ((X86ISD::NodeType)Opcode) {
34959 case X86ISD::FIRST_NUMBER: break;
34960#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34961 NODE_NAME_CASE(BSF)
34962 NODE_NAME_CASE(BSR)
34963 NODE_NAME_CASE(FSHL)
34964 NODE_NAME_CASE(FSHR)
34965 NODE_NAME_CASE(FAND)
34966 NODE_NAME_CASE(FANDN)
34967 NODE_NAME_CASE(FOR)
34968 NODE_NAME_CASE(FXOR)
34969 NODE_NAME_CASE(FILD)
34970 NODE_NAME_CASE(FIST)
34971 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34972 NODE_NAME_CASE(FLD)
34973 NODE_NAME_CASE(FST)
34974 NODE_NAME_CASE(CALL)
34975 NODE_NAME_CASE(CALL_RVMARKER)
34976 NODE_NAME_CASE(IMP_CALL)
34977  NODE_NAME_CASE(BT)
34978  NODE_NAME_CASE(CMP)
34979 NODE_NAME_CASE(FCMP)
34980 NODE_NAME_CASE(STRICT_FCMP)
34981 NODE_NAME_CASE(STRICT_FCMPS)
34982  NODE_NAME_CASE(COMI)
34983  NODE_NAME_CASE(UCOMI)
34984 NODE_NAME_CASE(COMX)
34985 NODE_NAME_CASE(UCOMX)
34986 NODE_NAME_CASE(CMPM)
34987 NODE_NAME_CASE(CMPMM)
34988 NODE_NAME_CASE(STRICT_CMPM)
34989 NODE_NAME_CASE(CMPMM_SAE)
34990 NODE_NAME_CASE(SETCC)
34991 NODE_NAME_CASE(SETCC_CARRY)
34992 NODE_NAME_CASE(FSETCC)
34993 NODE_NAME_CASE(FSETCCM)
34994 NODE_NAME_CASE(FSETCCM_SAE)
34995 NODE_NAME_CASE(CMOV)
34996 NODE_NAME_CASE(BRCOND)
34997 NODE_NAME_CASE(RET_GLUE)
34998 NODE_NAME_CASE(IRET)
34999 NODE_NAME_CASE(REP_STOS)
35000 NODE_NAME_CASE(REP_MOVS)
35001 NODE_NAME_CASE(GlobalBaseReg)
35002  NODE_NAME_CASE(Wrapper)
35003  NODE_NAME_CASE(WrapperRIP)
35004 NODE_NAME_CASE(MOVQ2DQ)
35005 NODE_NAME_CASE(MOVDQ2Q)
35006 NODE_NAME_CASE(MMX_MOVD2W)
35007 NODE_NAME_CASE(MMX_MOVW2D)
35008 NODE_NAME_CASE(PEXTRB)
35009 NODE_NAME_CASE(PEXTRW)
35010 NODE_NAME_CASE(INSERTPS)
35011 NODE_NAME_CASE(PINSRB)
35012 NODE_NAME_CASE(PINSRW)
35013 NODE_NAME_CASE(PSHUFB)
35014 NODE_NAME_CASE(ANDNP)
35015 NODE_NAME_CASE(BLENDI)
35016  NODE_NAME_CASE(BLENDV)
35017  NODE_NAME_CASE(HADD)
35018 NODE_NAME_CASE(HSUB)
35019 NODE_NAME_CASE(FHADD)
35020 NODE_NAME_CASE(FHSUB)
35021 NODE_NAME_CASE(CONFLICT)
35022 NODE_NAME_CASE(FMAX)
35023 NODE_NAME_CASE(FMAXS)
35024 NODE_NAME_CASE(FMAX_SAE)
35025 NODE_NAME_CASE(FMAXS_SAE)
35026 NODE_NAME_CASE(STRICT_FMAX)
35027 NODE_NAME_CASE(FMIN)
35028 NODE_NAME_CASE(FMINS)
35029 NODE_NAME_CASE(FMIN_SAE)
35030 NODE_NAME_CASE(FMINS_SAE)
35031 NODE_NAME_CASE(STRICT_FMIN)
35032 NODE_NAME_CASE(FMAXC)
35033 NODE_NAME_CASE(FMINC)
35034 NODE_NAME_CASE(FRSQRT)
35035 NODE_NAME_CASE(FRCP)
35036 NODE_NAME_CASE(EXTRQI)
35037 NODE_NAME_CASE(INSERTQI)
35038 NODE_NAME_CASE(TLSADDR)
35039 NODE_NAME_CASE(TLSBASEADDR)
35040 NODE_NAME_CASE(TLSCALL)
35041 NODE_NAME_CASE(TLSDESC)
35042 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35043 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35044 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35045 NODE_NAME_CASE(EH_RETURN)
35046 NODE_NAME_CASE(TC_RETURN)
35047 NODE_NAME_CASE(FNSTCW16m)
35048 NODE_NAME_CASE(FLDCW16m)
35049 NODE_NAME_CASE(FNSTENVm)
35050 NODE_NAME_CASE(FLDENVm)
35051 NODE_NAME_CASE(LCMPXCHG_DAG)
35052 NODE_NAME_CASE(LCMPXCHG8_DAG)
35053 NODE_NAME_CASE(LCMPXCHG16_DAG)
35054 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35055 NODE_NAME_CASE(LADD)
35056 NODE_NAME_CASE(LSUB)
35057 NODE_NAME_CASE(LOR)
35058 NODE_NAME_CASE(LXOR)
35059 NODE_NAME_CASE(LAND)
35060 NODE_NAME_CASE(LBTS)
35061 NODE_NAME_CASE(LBTC)
35062 NODE_NAME_CASE(LBTR)
35063 NODE_NAME_CASE(LBTS_RM)
35064 NODE_NAME_CASE(LBTC_RM)
35065 NODE_NAME_CASE(LBTR_RM)
35066 NODE_NAME_CASE(AADD)
35067 NODE_NAME_CASE(AOR)
35068 NODE_NAME_CASE(AXOR)
35069 NODE_NAME_CASE(AAND)
35070 NODE_NAME_CASE(VZEXT_MOVL)
35071 NODE_NAME_CASE(VZEXT_LOAD)
35072 NODE_NAME_CASE(VEXTRACT_STORE)
35073 NODE_NAME_CASE(VTRUNC)
35074 NODE_NAME_CASE(VTRUNCS)
35075 NODE_NAME_CASE(VTRUNCUS)
35076 NODE_NAME_CASE(VMTRUNC)
35077 NODE_NAME_CASE(VMTRUNCS)
35078 NODE_NAME_CASE(VMTRUNCUS)
35079 NODE_NAME_CASE(VTRUNCSTORES)
35080 NODE_NAME_CASE(VTRUNCSTOREUS)
35081 NODE_NAME_CASE(VMTRUNCSTORES)
35082 NODE_NAME_CASE(VMTRUNCSTOREUS)
35083 NODE_NAME_CASE(VFPEXT)
35084 NODE_NAME_CASE(STRICT_VFPEXT)
35085 NODE_NAME_CASE(VFPEXT_SAE)
35086 NODE_NAME_CASE(VFPEXTS)
35087 NODE_NAME_CASE(VFPEXTS_SAE)
35088 NODE_NAME_CASE(VFPROUND)
35089 NODE_NAME_CASE(VFPROUND2)
35090 NODE_NAME_CASE(VFPROUND2_RND)
35091 NODE_NAME_CASE(STRICT_VFPROUND)
35092 NODE_NAME_CASE(VMFPROUND)
35093 NODE_NAME_CASE(VFPROUND_RND)
35094 NODE_NAME_CASE(VFPROUNDS)
35095 NODE_NAME_CASE(VFPROUNDS_RND)
35096 NODE_NAME_CASE(VSHLDQ)
35097 NODE_NAME_CASE(VSRLDQ)
35098 NODE_NAME_CASE(VSHL)
35099 NODE_NAME_CASE(VSRL)
35100 NODE_NAME_CASE(VSRA)
35101 NODE_NAME_CASE(VSHLI)
35102 NODE_NAME_CASE(VSRLI)
35103 NODE_NAME_CASE(VSRAI)
35104 NODE_NAME_CASE(VSHLV)
35105 NODE_NAME_CASE(VSRLV)
35106 NODE_NAME_CASE(VSRAV)
35107 NODE_NAME_CASE(VROTLI)
35108 NODE_NAME_CASE(VROTRI)
35109 NODE_NAME_CASE(VPPERM)
35110 NODE_NAME_CASE(CMPP)
35111 NODE_NAME_CASE(STRICT_CMPP)
35112 NODE_NAME_CASE(PCMPEQ)
35113 NODE_NAME_CASE(PCMPGT)
35114 NODE_NAME_CASE(PHMINPOS)
35115 NODE_NAME_CASE(ADD)
35116 NODE_NAME_CASE(SUB)
35117 NODE_NAME_CASE(ADC)
35118 NODE_NAME_CASE(SBB)
35119 NODE_NAME_CASE(SMUL)
35120 NODE_NAME_CASE(UMUL)
35121 NODE_NAME_CASE(OR)
35122 NODE_NAME_CASE(XOR)
35123 NODE_NAME_CASE(AND)
35124 NODE_NAME_CASE(BEXTR)
35125  NODE_NAME_CASE(BEXTRI)
35126  NODE_NAME_CASE(BZHI)
35127 NODE_NAME_CASE(PDEP)
35128 NODE_NAME_CASE(PEXT)
35129 NODE_NAME_CASE(MUL_IMM)
35130 NODE_NAME_CASE(MOVMSK)
35131 NODE_NAME_CASE(PTEST)
35132 NODE_NAME_CASE(TESTP)
35133 NODE_NAME_CASE(KORTEST)
35134 NODE_NAME_CASE(KTEST)
35135 NODE_NAME_CASE(KADD)
35136 NODE_NAME_CASE(KSHIFTL)
35137 NODE_NAME_CASE(KSHIFTR)
35138 NODE_NAME_CASE(PACKSS)
35139 NODE_NAME_CASE(PACKUS)
35140 NODE_NAME_CASE(PALIGNR)
35141 NODE_NAME_CASE(VALIGN)
35142 NODE_NAME_CASE(VSHLD)
35143 NODE_NAME_CASE(VSHRD)
35144 NODE_NAME_CASE(PSHUFD)
35145 NODE_NAME_CASE(PSHUFHW)
35146 NODE_NAME_CASE(PSHUFLW)
35147 NODE_NAME_CASE(SHUFP)
35148 NODE_NAME_CASE(SHUF128)
35149 NODE_NAME_CASE(MOVLHPS)
35150 NODE_NAME_CASE(MOVHLPS)
35151 NODE_NAME_CASE(MOVDDUP)
35152 NODE_NAME_CASE(MOVSHDUP)
35153 NODE_NAME_CASE(MOVSLDUP)
35154 NODE_NAME_CASE(MOVSD)
35155 NODE_NAME_CASE(MOVSS)
35156 NODE_NAME_CASE(MOVSH)
35157 NODE_NAME_CASE(UNPCKL)
35158 NODE_NAME_CASE(UNPCKH)
35159 NODE_NAME_CASE(VBROADCAST)
35160 NODE_NAME_CASE(VBROADCAST_LOAD)
35161 NODE_NAME_CASE(VBROADCASTM)
35162 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35163 NODE_NAME_CASE(VPERMILPV)
35164 NODE_NAME_CASE(VPERMILPI)
35165 NODE_NAME_CASE(VPERM2X128)
35166 NODE_NAME_CASE(VPERMV)
35167 NODE_NAME_CASE(VPERMV3)
35168 NODE_NAME_CASE(VPERMI)
35169 NODE_NAME_CASE(VPTERNLOG)
35170 NODE_NAME_CASE(FP_TO_SINT_SAT)
35171 NODE_NAME_CASE(FP_TO_UINT_SAT)
35172 NODE_NAME_CASE(VFIXUPIMM)
35173 NODE_NAME_CASE(VFIXUPIMM_SAE)
35174 NODE_NAME_CASE(VFIXUPIMMS)
35175 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35176 NODE_NAME_CASE(VRANGE)
35177 NODE_NAME_CASE(VRANGE_SAE)
35178 NODE_NAME_CASE(VRANGES)
35179 NODE_NAME_CASE(VRANGES_SAE)
35180 NODE_NAME_CASE(PMULUDQ)
35181 NODE_NAME_CASE(PMULDQ)
35182 NODE_NAME_CASE(PSADBW)
35183 NODE_NAME_CASE(DBPSADBW)
35184 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35185 NODE_NAME_CASE(VAARG_64)
35186 NODE_NAME_CASE(VAARG_X32)
35187 NODE_NAME_CASE(DYN_ALLOCA)
35188 NODE_NAME_CASE(MFENCE)
35189 NODE_NAME_CASE(SEG_ALLOCA)
35190 NODE_NAME_CASE(PROBED_ALLOCA)
35191  NODE_NAME_CASE(RDRAND)
35192  NODE_NAME_CASE(RDSEED)
35193  NODE_NAME_CASE(RDPKRU)
35194 NODE_NAME_CASE(WRPKRU)
35195 NODE_NAME_CASE(VPMADDUBSW)
35196 NODE_NAME_CASE(VPMADDWD)
35197 NODE_NAME_CASE(VPSHA)
35198 NODE_NAME_CASE(VPSHL)
35199 NODE_NAME_CASE(VPCOM)
35200 NODE_NAME_CASE(VPCOMU)
35201 NODE_NAME_CASE(VPERMIL2)
35202  NODE_NAME_CASE(FMSUB)
35203  NODE_NAME_CASE(STRICT_FMSUB)
35204  NODE_NAME_CASE(FNMADD)
35205  NODE_NAME_CASE(STRICT_FNMADD)
35206  NODE_NAME_CASE(FNMSUB)
35207  NODE_NAME_CASE(STRICT_FNMSUB)
35208 NODE_NAME_CASE(FMADDSUB)
35209 NODE_NAME_CASE(FMSUBADD)
35210 NODE_NAME_CASE(FMADD_RND)
35211 NODE_NAME_CASE(FNMADD_RND)
35212 NODE_NAME_CASE(FMSUB_RND)
35213 NODE_NAME_CASE(FNMSUB_RND)
35214 NODE_NAME_CASE(FMADDSUB_RND)
35215 NODE_NAME_CASE(FMSUBADD_RND)
35216 NODE_NAME_CASE(VFMADDC)
35217 NODE_NAME_CASE(VFMADDC_RND)
35218 NODE_NAME_CASE(VFCMADDC)
35219 NODE_NAME_CASE(VFCMADDC_RND)
35220 NODE_NAME_CASE(VFMULC)
35221 NODE_NAME_CASE(VFMULC_RND)
35222 NODE_NAME_CASE(VFCMULC)
35223 NODE_NAME_CASE(VFCMULC_RND)
35224 NODE_NAME_CASE(VFMULCSH)
35225 NODE_NAME_CASE(VFMULCSH_RND)
35226 NODE_NAME_CASE(VFCMULCSH)
35227 NODE_NAME_CASE(VFCMULCSH_RND)
35228 NODE_NAME_CASE(VFMADDCSH)
35229 NODE_NAME_CASE(VFMADDCSH_RND)
35230 NODE_NAME_CASE(VFCMADDCSH)
35231 NODE_NAME_CASE(VFCMADDCSH_RND)
35232 NODE_NAME_CASE(VPMADD52H)
35233 NODE_NAME_CASE(VPMADD52L)
35234 NODE_NAME_CASE(VRNDSCALE)
35235 NODE_NAME_CASE(STRICT_VRNDSCALE)
35236 NODE_NAME_CASE(VRNDSCALE_SAE)
35237 NODE_NAME_CASE(VRNDSCALES)
35238 NODE_NAME_CASE(VRNDSCALES_SAE)
35239 NODE_NAME_CASE(VREDUCE)
35240 NODE_NAME_CASE(VREDUCE_SAE)
35241 NODE_NAME_CASE(VREDUCES)
35242 NODE_NAME_CASE(VREDUCES_SAE)
35243 NODE_NAME_CASE(VGETMANT)
35244 NODE_NAME_CASE(VGETMANT_SAE)
35245 NODE_NAME_CASE(VGETMANTS)
35246 NODE_NAME_CASE(VGETMANTS_SAE)
35247 NODE_NAME_CASE(PCMPESTR)
35248  NODE_NAME_CASE(PCMPISTR)
35249  NODE_NAME_CASE(XTEST)
35250  NODE_NAME_CASE(COMPRESS)
35251  NODE_NAME_CASE(EXPAND)
35252  NODE_NAME_CASE(SELECTS)
35253 NODE_NAME_CASE(ADDSUB)
35254 NODE_NAME_CASE(RCP14)
35255 NODE_NAME_CASE(RCP14S)
35256 NODE_NAME_CASE(RSQRT14)
35257 NODE_NAME_CASE(RSQRT14S)
35258 NODE_NAME_CASE(FADD_RND)
35259 NODE_NAME_CASE(FADDS)
35260 NODE_NAME_CASE(FADDS_RND)
35261 NODE_NAME_CASE(FSUB_RND)
35262 NODE_NAME_CASE(FSUBS)
35263 NODE_NAME_CASE(FSUBS_RND)
35264 NODE_NAME_CASE(FMUL_RND)
35265 NODE_NAME_CASE(FMULS)
35266 NODE_NAME_CASE(FMULS_RND)
35267 NODE_NAME_CASE(FDIV_RND)
35268 NODE_NAME_CASE(FDIVS)
35269 NODE_NAME_CASE(FDIVS_RND)
35270 NODE_NAME_CASE(FSQRT_RND)
35271 NODE_NAME_CASE(FSQRTS)
35272 NODE_NAME_CASE(FSQRTS_RND)
35273 NODE_NAME_CASE(FGETEXP)
35274 NODE_NAME_CASE(FGETEXP_SAE)
35275 NODE_NAME_CASE(FGETEXPS)
35276 NODE_NAME_CASE(FGETEXPS_SAE)
35277 NODE_NAME_CASE(SCALEF)
35278 NODE_NAME_CASE(SCALEF_RND)
35279 NODE_NAME_CASE(SCALEFS)
35280 NODE_NAME_CASE(SCALEFS_RND)
35281 NODE_NAME_CASE(MULHRS)
35282 NODE_NAME_CASE(SINT_TO_FP_RND)
35283 NODE_NAME_CASE(UINT_TO_FP_RND)
35284 NODE_NAME_CASE(CVTTP2SI)
35285 NODE_NAME_CASE(CVTTP2UI)
35286 NODE_NAME_CASE(STRICT_CVTTP2SI)
35287 NODE_NAME_CASE(STRICT_CVTTP2UI)
35288 NODE_NAME_CASE(MCVTTP2SI)
35289 NODE_NAME_CASE(MCVTTP2UI)
35290 NODE_NAME_CASE(CVTTP2SI_SAE)
35291 NODE_NAME_CASE(CVTTP2UI_SAE)
35292 NODE_NAME_CASE(CVTTS2SI)
35293 NODE_NAME_CASE(CVTTS2UI)
35294 NODE_NAME_CASE(CVTTS2SI_SAE)
35295 NODE_NAME_CASE(CVTTS2UI_SAE)
35296 NODE_NAME_CASE(CVTSI2P)
35297 NODE_NAME_CASE(CVTUI2P)
35298 NODE_NAME_CASE(STRICT_CVTSI2P)
35299 NODE_NAME_CASE(STRICT_CVTUI2P)
35300 NODE_NAME_CASE(MCVTSI2P)
35301 NODE_NAME_CASE(MCVTUI2P)
35302 NODE_NAME_CASE(VFPCLASS)
35303 NODE_NAME_CASE(VFPCLASSS)
35304 NODE_NAME_CASE(MULTISHIFT)
35305 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35306 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35307 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35308 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35309 NODE_NAME_CASE(CVTPS2PH)
35310 NODE_NAME_CASE(STRICT_CVTPS2PH)
35311 NODE_NAME_CASE(CVTPS2PH_SAE)
35312 NODE_NAME_CASE(MCVTPS2PH)
35313 NODE_NAME_CASE(MCVTPS2PH_SAE)
35314 NODE_NAME_CASE(CVTPH2PS)
35315 NODE_NAME_CASE(STRICT_CVTPH2PS)
35316 NODE_NAME_CASE(CVTPH2PS_SAE)
35317 NODE_NAME_CASE(CVTP2SI)
35318 NODE_NAME_CASE(CVTP2UI)
35319 NODE_NAME_CASE(MCVTP2SI)
35320 NODE_NAME_CASE(MCVTP2UI)
35321 NODE_NAME_CASE(CVTP2SI_RND)
35322 NODE_NAME_CASE(CVTP2UI_RND)
35323 NODE_NAME_CASE(CVTS2SI)
35324 NODE_NAME_CASE(CVTS2UI)
35325 NODE_NAME_CASE(CVTS2SI_RND)
35326 NODE_NAME_CASE(CVTS2UI_RND)
35327 NODE_NAME_CASE(CVTNEPS2BF16)
35328 NODE_NAME_CASE(MCVTNEPS2BF16)
35329 NODE_NAME_CASE(DPBF16PS)
35330 NODE_NAME_CASE(DPFP16PS)
35331 NODE_NAME_CASE(MPSADBW)
35332 NODE_NAME_CASE(LWPINS)
35333 NODE_NAME_CASE(MGATHER)
35334 NODE_NAME_CASE(MSCATTER)
35335 NODE_NAME_CASE(VPDPBUSD)
35336 NODE_NAME_CASE(VPDPBUSDS)
35337 NODE_NAME_CASE(VPDPWSSD)
35338 NODE_NAME_CASE(VPDPWSSDS)
35339 NODE_NAME_CASE(VPSHUFBITQMB)
35340 NODE_NAME_CASE(GF2P8MULB)
35341 NODE_NAME_CASE(GF2P8AFFINEQB)
35342 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35343 NODE_NAME_CASE(NT_CALL)
35344 NODE_NAME_CASE(NT_BRIND)
35345 NODE_NAME_CASE(UMWAIT)
35346 NODE_NAME_CASE(TPAUSE)
35347 NODE_NAME_CASE(ENQCMD)
35348 NODE_NAME_CASE(ENQCMDS)
35349 NODE_NAME_CASE(VP2INTERSECT)
35350 NODE_NAME_CASE(VPDPBSUD)
35351 NODE_NAME_CASE(VPDPBSUDS)
35352 NODE_NAME_CASE(VPDPBUUD)
35353 NODE_NAME_CASE(VPDPBUUDS)
35354 NODE_NAME_CASE(VPDPBSSD)
35355 NODE_NAME_CASE(VPDPBSSDS)
35356 NODE_NAME_CASE(VPDPWSUD)
35357 NODE_NAME_CASE(VPDPWSUDS)
35358 NODE_NAME_CASE(VPDPWUSD)
35359 NODE_NAME_CASE(VPDPWUSDS)
35360 NODE_NAME_CASE(VPDPWUUD)
35361 NODE_NAME_CASE(VPDPWUUDS)
35362 NODE_NAME_CASE(VMINMAX)
35363 NODE_NAME_CASE(VMINMAX_SAE)
35364 NODE_NAME_CASE(VMINMAXS)
35365 NODE_NAME_CASE(VMINMAXS_SAE)
35366 NODE_NAME_CASE(CVTP2IBS)
35367 NODE_NAME_CASE(CVTP2IUBS)
35368 NODE_NAME_CASE(CVTP2IBS_RND)
35369 NODE_NAME_CASE(CVTP2IUBS_RND)
35370 NODE_NAME_CASE(CVTTP2IBS)
35371 NODE_NAME_CASE(CVTTP2IUBS)
35372 NODE_NAME_CASE(CVTTP2IBS_SAE)
35373 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35374 NODE_NAME_CASE(VCVT2PH2BF8)
35375 NODE_NAME_CASE(VCVT2PH2BF8S)
35376 NODE_NAME_CASE(VCVT2PH2HF8)
35377 NODE_NAME_CASE(VCVT2PH2HF8S)
35378 NODE_NAME_CASE(VCVTBIASPH2BF8)
35379 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35380 NODE_NAME_CASE(VCVTBIASPH2HF8)
35381 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35382 NODE_NAME_CASE(VCVTPH2BF8)
35383 NODE_NAME_CASE(VCVTPH2BF8S)
35384 NODE_NAME_CASE(VCVTPH2HF8)
35385 NODE_NAME_CASE(VCVTPH2HF8S)
35386 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35387 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35388 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35389 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35390 NODE_NAME_CASE(VMCVTPH2BF8)
35391 NODE_NAME_CASE(VMCVTPH2BF8S)
35392 NODE_NAME_CASE(VMCVTPH2HF8)
35393 NODE_NAME_CASE(VMCVTPH2HF8S)
35394 NODE_NAME_CASE(VCVTHF82PH)
35395 NODE_NAME_CASE(AESENC128KL)
35396 NODE_NAME_CASE(AESDEC128KL)
35397 NODE_NAME_CASE(AESENC256KL)
35398 NODE_NAME_CASE(AESDEC256KL)
35399 NODE_NAME_CASE(AESENCWIDE128KL)
35400 NODE_NAME_CASE(AESDECWIDE128KL)
35401 NODE_NAME_CASE(AESENCWIDE256KL)
35402 NODE_NAME_CASE(AESDECWIDE256KL)
35403 NODE_NAME_CASE(CMPCCXADD)
35404 NODE_NAME_CASE(TESTUI)
35405 NODE_NAME_CASE(FP80_ADD)
35406 NODE_NAME_CASE(STRICT_FP80_ADD)
35407 NODE_NAME_CASE(CCMP)
35408 NODE_NAME_CASE(CTEST)
35409 NODE_NAME_CASE(CLOAD)
35410 NODE_NAME_CASE(CSTORE)
35411 NODE_NAME_CASE(CVTTS2SIS)
35412 NODE_NAME_CASE(CVTTS2UIS)
35413 NODE_NAME_CASE(CVTTS2SIS_SAE)
35414 NODE_NAME_CASE(CVTTS2UIS_SAE)
35415 NODE_NAME_CASE(CVTTP2SIS)
35416 NODE_NAME_CASE(MCVTTP2SIS)
35417 NODE_NAME_CASE(CVTTP2UIS_SAE)
35418 NODE_NAME_CASE(CVTTP2SIS_SAE)
35419 NODE_NAME_CASE(CVTTP2UIS)
35420 NODE_NAME_CASE(MCVTTP2UIS)
35421 NODE_NAME_CASE(POP_FROM_X87_REG)
35422 }
35423 return nullptr;
35424#undef NODE_NAME_CASE
35425}
35426
35427/// Return true if the addressing mode represented by AM is legal for this
35428/// target, for a load/store of the specified type.
35429 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35430                                               const AddrMode &AM, Type *Ty,
35431 unsigned AS,
35432 Instruction *I) const {
35433  // X86 supports extremely general addressing modes.
35434  CodeModel::Model M = getTargetMachine().getCodeModel();
35435
35436 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35437 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35438 return false;
35439
35440 if (AM.BaseGV) {
35441 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35442
35443 // If a reference to this global requires an extra load, we can't fold it.
35444 if (isGlobalStubReference(GVFlags))
35445 return false;
35446
35447 // If BaseGV requires a register for the PIC base, we cannot also have a
35448 // BaseReg specified.
35449 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35450 return false;
35451
35452 // If lower 4G is not available, then we must use rip-relative addressing.
35453 if ((M != CodeModel::Small || isPositionIndependent()) &&
35454 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35455 return false;
35456 }
35457
35458 switch (AM.Scale) {
35459 case 0:
35460 case 1:
35461 case 2:
35462 case 4:
35463 case 8:
35464 // These scales always work.
35465 break;
35466 case 3:
35467 case 5:
35468 case 9:
35469 // These scales are formed with basereg+scalereg. Only accept if there is
35470 // no basereg yet.
35471 if (AM.HasBaseReg)
35472 return false;
35473 break;
35474 default: // Other stuff never works.
35475 return false;
35476 }
35477
35478 return true;
35479}
35480
35481bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35482 switch (Opcode) {
35483 // These are non-commutative binops.
35484 // TODO: Add more X86ISD opcodes once we have test coverage.
35485 case X86ISD::ANDNP:
35486 case X86ISD::PCMPGT:
35487 case X86ISD::FMAX:
35488 case X86ISD::FMIN:
35489 case X86ISD::FANDN:
35490 case X86ISD::VPSHA:
35491 case X86ISD::VPSHL:
35492 case X86ISD::VSHLV:
35493 case X86ISD::VSRLV:
35494 case X86ISD::VSRAV:
35495 return true;
35496 }
35497
35498 return TargetLoweringBase::isBinOp(Opcode);
35499}
35500
35501bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35502 switch (Opcode) {
35503 // TODO: Add more X86ISD opcodes once we have test coverage.
35504 case X86ISD::PCMPEQ:
35505 case X86ISD::PMULDQ:
35506 case X86ISD::PMULUDQ:
35507 case X86ISD::FMAXC:
35508 case X86ISD::FMINC:
35509 case X86ISD::FAND:
35510 case X86ISD::FOR:
35511 case X86ISD::FXOR:
35512 return true;
35513 }
35514
35515  return TargetLoweringBase::isCommutativeBinOp(Opcode);
35516 }
35517
35518 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35519   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35520 return false;
35521 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35522 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35523 return NumBits1 > NumBits2;
35524}
35525
35526 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35527   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35528 return false;
35529
35530 if (!isTypeLegal(EVT::getEVT(Ty1)))
35531 return false;
35532
35533 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35534
35535 // Assuming the caller doesn't have a zeroext or signext return parameter,
35536 // truncation all the way down to i1 is valid.
35537 return true;
35538}
35539
35540 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35541   return isInt<32>(Imm);
35542}
35543
35544 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35545   // Can also use sub to handle negated immediates.
35546 return isInt<32>(Imm);
35547}
35548
35549 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35550   return isInt<32>(Imm);
35551}
35552
35553 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35554   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35555 return false;
35556 unsigned NumBits1 = VT1.getSizeInBits();
35557 unsigned NumBits2 = VT2.getSizeInBits();
35558 return NumBits1 > NumBits2;
35559}
35560
35561 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35562   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35563 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35564}
35565
35566 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35567   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35568 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35569}
35570
35571 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35572   EVT VT1 = Val.getValueType();
35573 if (isZExtFree(VT1, VT2))
35574 return true;
35575
35576 if (Val.getOpcode() != ISD::LOAD)
35577 return false;
35578
35579 if (!VT1.isSimple() || !VT1.isInteger() ||
35580 !VT2.isSimple() || !VT2.isInteger())
35581 return false;
35582
35583 switch (VT1.getSimpleVT().SimpleTy) {
35584 default: break;
35585 case MVT::i8:
35586 case MVT::i16:
35587 case MVT::i32:
35588 // X86 has 8, 16, and 32-bit zero-extending loads.
35589 return true;
35590 }
35591
35592 return false;
35593}
35594
35595 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35596   if (!Subtarget.is64Bit())
35597 return false;
35598 return TargetLowering::shouldConvertPhiType(From, To);
35599}
35600
35601 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35602   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35603 return false;
35604
35605 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35606
35607 // There is no extending load for vXi1.
35608 if (SrcVT.getScalarType() == MVT::i1)
35609 return false;
35610
35611 return true;
35612}
35613
35614 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35615                                                    EVT VT) const {
35616 if (Subtarget.useSoftFloat())
35617 return false;
35618
35619 if (!Subtarget.hasAnyFMA())
35620 return false;
35621
35622 VT = VT.getScalarType();
35623
35624 if (!VT.isSimple())
35625 return false;
35626
35627 switch (VT.getSimpleVT().SimpleTy) {
35628 case MVT::f16:
35629 return Subtarget.hasFP16();
35630 case MVT::f32:
35631 case MVT::f64:
35632 return true;
35633 default:
35634 break;
35635 }
35636
35637 return false;
35638}
35639
35640 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35641                                               EVT DestVT) const {
35642 // i16 instructions are longer (0x66 prefix) and potentially slower.
35643 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35644}
35645
35646 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35647     unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35648 SDValue Y) const {
35649 if (SelectOpcode == ISD::SELECT) {
35650 if (VT.isVector())
35651 return false;
35652 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35653 return false;
35654 using namespace llvm::SDPatternMatch;
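    // The patterns below recognize the BMI bit-manipulation idioms:
    // BLSI is x & -x, BLSR is x & (x - 1), and BLSMSK is x ^ (x - 1).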
35655 // BLSI
35656     if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35657                                   sd_match(X, m_Neg(m_Specific(Y)))))
35658 return true;
35659 // BLSR
35660     if (BinOpcode == ISD::AND &&
35661         (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35662          sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35663 return true;
35664 // BLSMSK
35665     if (BinOpcode == ISD::XOR &&
35666         (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35667          sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35668 return true;
35669
35670 return false;
35671 }
35672 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35673 // benefit. The transform may also be profitable for scalar code.
35674 if (!Subtarget.hasAVX512())
35675 return false;
35676 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35677 return false;
35678 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35679 return false;
35680
35681 return true;
35682}
35683
35684/// Targets can use this to indicate that they only support *some*
35685/// VECTOR_SHUFFLE operations, those with specific masks.
35686/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35687/// are assumed to be legal.
35688 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35689   if (!VT.isSimple())
35690 return false;
35691
35692 // Not for i1 vectors
35693 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35694 return false;
35695
35696 // Very little shuffling can be done for 64-bit vectors right now.
35697 if (VT.getSimpleVT().getSizeInBits() == 64)
35698 return false;
35699
35700 // We only care that the types being shuffled are legal. The lowering can
35701 // handle any possible shuffle mask that results.
35702 return isTypeLegal(VT.getSimpleVT());
35703}
35704
35705 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35706                                                EVT VT) const {
35707 // Don't convert an 'and' into a shuffle that we don't directly support.
35708 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35709 if (!Subtarget.hasAVX2())
35710 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35711 return false;
35712
35713 // Just delegate to the generic legality, clear masks aren't special.
35714 return isShuffleMaskLegal(Mask, VT);
35715}
35716
35717 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35718   // If the subtarget is using thunks, we need to not generate jump tables.
35719 if (Subtarget.useIndirectThunkBranches())
35720 return false;
35721
35722   // Otherwise, fallback on the generic logic.
35723   return TargetLowering::areJTsAllowed(Fn);
35724}
35725
35726 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35727                                                        EVT ConditionVT) const {
35728 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35729 // zero-extensions.
35730 if (ConditionVT.getSizeInBits() < 32)
35731 return MVT::i32;
35732   return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35733                                                              ConditionVT);
35734}
35735
35736//===----------------------------------------------------------------------===//
35737// X86 Scheduler Hooks
35738//===----------------------------------------------------------------------===//
35739
35740/// Utility function to emit xbegin specifying the start of an RTM region.
35741 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35742                                      const TargetInstrInfo *TII) {
35743 const MIMetadata MIMD(MI);
35744
35745 const BasicBlock *BB = MBB->getBasicBlock();
35746 MachineFunction::iterator I = ++MBB->getIterator();
35747
35748 // For the v = xbegin(), we generate
35749 //
35750 // thisMBB:
35751 // xbegin sinkMBB
35752 //
35753 // mainMBB:
35754 // s0 = -1
35755 //
35756 // fallBB:
35757 // eax = # XABORT_DEF
35758 // s1 = eax
35759 //
35760 // sinkMBB:
35761 // v = phi(s0/mainBB, s1/fallBB)
35762
35763 MachineBasicBlock *thisMBB = MBB;
35764 MachineFunction *MF = MBB->getParent();
35765 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35766 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35767 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35768 MF->insert(I, mainMBB);
35769 MF->insert(I, fallMBB);
35770 MF->insert(I, sinkMBB);
35771
35772 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35773 mainMBB->addLiveIn(X86::EFLAGS);
35774 fallMBB->addLiveIn(X86::EFLAGS);
35775 sinkMBB->addLiveIn(X86::EFLAGS);
35776 }
35777
35778 // Transfer the remainder of BB and its successor edges to sinkMBB.
35779 sinkMBB->splice(sinkMBB->begin(), MBB,
35780                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35781   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35782
35783   MachineRegisterInfo &MRI = MF->getRegInfo();
35784   Register DstReg = MI.getOperand(0).getReg();
35785 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35786 Register mainDstReg = MRI.createVirtualRegister(RC);
35787 Register fallDstReg = MRI.createVirtualRegister(RC);
35788
35789 // thisMBB:
35790 // xbegin fallMBB
35791 // # fallthrough to mainMBB
35792 // # abortion to fallMBB
35793 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35794 thisMBB->addSuccessor(mainMBB);
35795 thisMBB->addSuccessor(fallMBB);
35796
35797 // mainMBB:
35798 // mainDstReg := -1
35799 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35800 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35801 mainMBB->addSuccessor(sinkMBB);
35802
35803 // fallMBB:
35804 // ; pseudo instruction to model hardware's definition from XABORT
35805 // EAX := XABORT_DEF
35806 // fallDstReg := EAX
35807 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35808 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35809 .addReg(X86::EAX);
35810 fallMBB->addSuccessor(sinkMBB);
35811
35812 // sinkMBB:
35813 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35814 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35815 .addReg(mainDstReg).addMBB(mainMBB)
35816 .addReg(fallDstReg).addMBB(fallMBB);
35817
35818 MI.eraseFromParent();
35819 return sinkMBB;
35820}
35821
35822 MachineBasicBlock *
35823 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35824 MachineBasicBlock *MBB) const {
35825 // Emit va_arg instruction on X86-64.
35826
35827 // Operands to this pseudo-instruction:
35828 // 0 ) Output : destination address (reg)
35829 // 1-5) Input : va_list address (addr, i64mem)
35830 // 6 ) ArgSize : Size (in bytes) of vararg type
35831 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35832 // 8 ) Align : Alignment of type
35833 // 9 ) EFLAGS (implicit-def)
35834
35835 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35836 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35837
35838 Register DestReg = MI.getOperand(0).getReg();
35839 MachineOperand &Base = MI.getOperand(1);
35840 MachineOperand &Scale = MI.getOperand(2);
35841 MachineOperand &Index = MI.getOperand(3);
35842 MachineOperand &Disp = MI.getOperand(4);
35843 MachineOperand &Segment = MI.getOperand(5);
35844 unsigned ArgSize = MI.getOperand(6).getImm();
35845 unsigned ArgMode = MI.getOperand(7).getImm();
35846 Align Alignment = Align(MI.getOperand(8).getImm());
35847
35848 MachineFunction *MF = MBB->getParent();
35849
35850 // Memory Reference
35851 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35852
35853 MachineMemOperand *OldMMO = MI.memoperands().front();
35854
35855 // Clone the MMO into two separate MMOs for loading and storing
35856 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35857 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35858 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35859 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35860
35861 // Machine Information
35862 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35863 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35864   const TargetRegisterClass *AddrRegClass =
35865       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35866 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35867 const MIMetadata MIMD(MI);
35868
35869 // struct va_list {
35870 // i32 gp_offset
35871 // i32 fp_offset
35872 // i64 overflow_area (address)
35873 // i64 reg_save_area (address)
35874 // }
35875 // sizeof(va_list) = 24
35876 // alignment(va_list) = 8
35877
35878 unsigned TotalNumIntRegs = 6;
35879 unsigned TotalNumXMMRegs = 8;
35880 bool UseGPOffset = (ArgMode == 1);
35881 bool UseFPOffset = (ArgMode == 2);
35882 unsigned MaxOffset = TotalNumIntRegs * 8 +
35883 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
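  // With the SysV x86-64 layout this is 6 * 8 = 48 bytes of integer registers,
  // plus 8 * 16 = 128 bytes of XMM registers when fp_offset is in use.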
35884
35885 /* Align ArgSize to a multiple of 8 */
35886 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35887 bool NeedsAlign = (Alignment > 8);
35888
35889 MachineBasicBlock *thisMBB = MBB;
35890 MachineBasicBlock *overflowMBB;
35891 MachineBasicBlock *offsetMBB;
35892 MachineBasicBlock *endMBB;
35893
35894 Register OffsetDestReg; // Argument address computed by offsetMBB
35895 Register OverflowDestReg; // Argument address computed by overflowMBB
35896 Register OffsetReg;
35897
35898 if (!UseGPOffset && !UseFPOffset) {
35899 // If we only pull from the overflow region, we don't create a branch.
35900 // We don't need to alter control flow.
35901 OffsetDestReg = Register(); // unused
35902 OverflowDestReg = DestReg;
35903
35904 offsetMBB = nullptr;
35905 overflowMBB = thisMBB;
35906 endMBB = thisMBB;
35907 } else {
35908 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35909 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35910 // If not, pull from overflow_area. (branch to overflowMBB)
35911 //
35912 // thisMBB
35913 // | .
35914 // | .
35915 // offsetMBB overflowMBB
35916 // | .
35917 // | .
35918 // endMBB
35919
35920 // Registers for the PHI in endMBB
35921 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35922 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35923
35924 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35925 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35926 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35927 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35928
35929 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35930
35931 // Insert the new basic blocks
35932 MF->insert(MBBIter, offsetMBB);
35933 MF->insert(MBBIter, overflowMBB);
35934 MF->insert(MBBIter, endMBB);
35935
35936 // Transfer the remainder of MBB and its successor edges to endMBB.
35937 endMBB->splice(endMBB->begin(), thisMBB,
35938 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35939 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35940
35941 // Make offsetMBB and overflowMBB successors of thisMBB
35942 thisMBB->addSuccessor(offsetMBB);
35943 thisMBB->addSuccessor(overflowMBB);
35944
35945 // endMBB is a successor of both offsetMBB and overflowMBB
35946 offsetMBB->addSuccessor(endMBB);
35947 overflowMBB->addSuccessor(endMBB);
35948
35949 // Load the offset value into a register
35950 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35951 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35952 .add(Base)
35953 .add(Scale)
35954 .add(Index)
35955 .addDisp(Disp, UseFPOffset ? 4 : 0)
35956 .add(Segment)
35957 .setMemRefs(LoadOnlyMMO);
35958
35959 // Check if there is enough room left to pull this argument.
35960 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35961 .addReg(OffsetReg)
35962 .addImm(MaxOffset + 8 - ArgSizeA8);
35963
35964 // Branch to "overflowMBB" if offset >= max
35965 // Fall through to "offsetMBB" otherwise
35966 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35967 .addMBB(overflowMBB).addImm(X86::COND_AE);
35968 }
35969
35970 // In offsetMBB, emit code to use the reg_save_area.
35971 if (offsetMBB) {
35972 assert(OffsetReg != 0);
35973
35974 // Read the reg_save_area address.
35975 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35976 BuildMI(
35977 offsetMBB, MIMD,
35978 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35979 RegSaveReg)
35980 .add(Base)
35981 .add(Scale)
35982 .add(Index)
35983 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35984 .add(Segment)
35985 .setMemRefs(LoadOnlyMMO);
35986
35987 if (Subtarget.isTarget64BitLP64()) {
35988 // Zero-extend the offset
35989 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35990 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35991 .addImm(0)
35992 .addReg(OffsetReg)
35993 .addImm(X86::sub_32bit);
35994
35995 // Add the offset to the reg_save_area to get the final address.
35996 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35997 .addReg(OffsetReg64)
35998 .addReg(RegSaveReg);
35999 } else {
36000 // Add the offset to the reg_save_area to get the final address.
36001 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36002 .addReg(OffsetReg)
36003 .addReg(RegSaveReg);
36004 }
36005
36006 // Compute the offset for the next argument
36007 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36008 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36009 .addReg(OffsetReg)
36010 .addImm(UseFPOffset ? 16 : 8);
36011
36012 // Store it back into the va_list.
36013 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36014 .add(Base)
36015 .add(Scale)
36016 .add(Index)
36017 .addDisp(Disp, UseFPOffset ? 4 : 0)
36018 .add(Segment)
36019 .addReg(NextOffsetReg)
36020 .setMemRefs(StoreOnlyMMO);
36021
36022 // Jump to endMBB
36023 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36024 .addMBB(endMBB);
36025 }
36026
36027 //
36028 // Emit code to use overflow area
36029 //
36030
36031 // Load the overflow_area address into a register.
36032 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36033 BuildMI(overflowMBB, MIMD,
36034 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36035 OverflowAddrReg)
36036 .add(Base)
36037 .add(Scale)
36038 .add(Index)
36039 .addDisp(Disp, 8)
36040 .add(Segment)
36041 .setMemRefs(LoadOnlyMMO);
36042
36043 // If we need to align it, do so. Otherwise, just copy the address
36044 // to OverflowDestReg.
36045 if (NeedsAlign) {
36046 // Align the overflow address
36047 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36048
36049 // aligned_addr = (addr + (align-1)) & ~(align-1)
36050 BuildMI(
36051 overflowMBB, MIMD,
36052 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36053 TmpReg)
36054 .addReg(OverflowAddrReg)
36055 .addImm(Alignment.value() - 1);
36056
36057 BuildMI(
36058 overflowMBB, MIMD,
36059 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36060 OverflowDestReg)
36061 .addReg(TmpReg)
36062 .addImm(~(uint64_t)(Alignment.value() - 1));
36063 } else {
36064 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36065 .addReg(OverflowAddrReg);
36066 }
36067
36068 // Compute the next overflow address after this argument.
36069 // (the overflow address should be kept 8-byte aligned)
36070 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36071 BuildMI(
36072 overflowMBB, MIMD,
36073 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36074 NextAddrReg)
36075 .addReg(OverflowDestReg)
36076 .addImm(ArgSizeA8);
36077
36078 // Store the new overflow address.
36079 BuildMI(overflowMBB, MIMD,
36080 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36081 .add(Base)
36082 .add(Scale)
36083 .add(Index)
36084 .addDisp(Disp, 8)
36085 .add(Segment)
36086 .addReg(NextAddrReg)
36087 .setMemRefs(StoreOnlyMMO);
36088
36089 // If we branched, emit the PHI to the front of endMBB.
36090 if (offsetMBB) {
36091 BuildMI(*endMBB, endMBB->begin(), MIMD,
36092 TII->get(X86::PHI), DestReg)
36093 .addReg(OffsetDestReg).addMBB(offsetMBB)
36094 .addReg(OverflowDestReg).addMBB(overflowMBB);
36095 }
36096
36097 // Erase the pseudo instruction
36098 MI.eraseFromParent();
36099
36100 return endMBB;
36101}
36102
36103// The EFLAGS operand of SelectItr might be missing a kill marker
36104// because there were multiple uses of EFLAGS, and ISel didn't know
36105// which to mark. Figure out whether SelectItr should have had a
36106// kill marker, and set it if it should. Returns the correct kill
36107// marker value.
36108 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36109 MachineBasicBlock* BB,
36110 const TargetRegisterInfo* TRI) {
36111 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36112 return false;
36113
36114 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36115 // out. SelectMI should have a kill flag on EFLAGS.
36116 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36117 return true;
36118}
36119
36120// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36121// together with other CMOV pseudo-opcodes into a single basic-block with
36122// conditional jump around it.
36123 static bool isCMOVPseudo(MachineInstr &MI) {
36124 switch (MI.getOpcode()) {
36125 case X86::CMOV_FR16:
36126 case X86::CMOV_FR16X:
36127 case X86::CMOV_FR32:
36128 case X86::CMOV_FR32X:
36129 case X86::CMOV_FR64:
36130 case X86::CMOV_FR64X:
36131 case X86::CMOV_GR8:
36132 case X86::CMOV_GR16:
36133 case X86::CMOV_GR32:
36134 case X86::CMOV_RFP32:
36135 case X86::CMOV_RFP64:
36136 case X86::CMOV_RFP80:
36137 case X86::CMOV_VR64:
36138 case X86::CMOV_VR128:
36139 case X86::CMOV_VR128X:
36140 case X86::CMOV_VR256:
36141 case X86::CMOV_VR256X:
36142 case X86::CMOV_VR512:
36143 case X86::CMOV_VK1:
36144 case X86::CMOV_VK2:
36145 case X86::CMOV_VK4:
36146 case X86::CMOV_VK8:
36147 case X86::CMOV_VK16:
36148 case X86::CMOV_VK32:
36149 case X86::CMOV_VK64:
36150 return true;
36151
36152 default:
36153 return false;
36154 }
36155}
36156
36157// Helper function, which inserts PHI functions into SinkMBB:
36158// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36159 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36160 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
36161 // the last PHI inserted.
36162 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36163 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36164 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36165 MachineBasicBlock *SinkMBB) {
36166 MachineFunction *MF = TrueMBB->getParent();
36167 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36168 const MIMetadata MIMD(*MIItBegin);
36169
36170 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36171 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36172
36173 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36174
36175 // As we are creating the PHIs, we have to be careful if there is more than
36176 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36177 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36178 // That also means that PHI construction must work forward from earlier to
36179 // later, and that the code must maintain a mapping from each earlier PHI's
36180 // destination register to the registers that went into that PHI.
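  // A rough illustration (hypothetical virtual registers): given
  //   %t2 = CMOV %t1, %f1, cc
  //   %t3 = CMOV %t2, %f2, cc
  // the table records %t2 -> (%t1, %f1), so the second PHI is emitted as
  //   %t3 = PHI [ %t1, FalseMBB ], [ %f2, TrueMBB ]
  // rather than referencing %t2, which is only defined by a PHI in SinkMBB.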
36181 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36182 MachineInstrBuilder MIB;
36183
36184 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36185 Register DestReg = MIIt->getOperand(0).getReg();
36186 Register Op1Reg = MIIt->getOperand(1).getReg();
36187 Register Op2Reg = MIIt->getOperand(2).getReg();
36188
36189 // If this CMOV we are generating is the opposite condition from
36190 // the jump we generated, then we have to swap the operands for the
36191 // PHI that is going to be generated.
36192 if (MIIt->getOperand(3).getImm() == OppCC)
36193 std::swap(Op1Reg, Op2Reg);
36194
36195 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36196 Op1Reg = It->second.first;
36197
36198 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36199 Op2Reg = It->second.second;
36200
36201 MIB =
36202 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36203 .addReg(Op1Reg)
36204 .addMBB(FalseMBB)
36205 .addReg(Op2Reg)
36206 .addMBB(TrueMBB);
36207
36208 // Add this PHI to the rewrite table.
36209 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36210 }
36211
36212 return MIB;
36213}
36214
36215 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36216 MachineBasicBlock *
36217 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36218 MachineInstr &SecondCascadedCMOV,
36219 MachineBasicBlock *ThisMBB) const {
36220 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36221 const MIMetadata MIMD(FirstCMOV);
36222
36223 // We lower cascaded CMOVs such as
36224 //
36225 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36226 //
36227 // to two successive branches.
36228 //
36229 // Without this, we would add a PHI between the two jumps, which ends up
36230 // creating a few copies all around. For instance, for
36231 //
36232 // (sitofp (zext (fcmp une)))
36233 //
36234 // we would generate:
36235 //
36236 // ucomiss %xmm1, %xmm0
36237 // movss <1.0f>, %xmm0
36238 // movaps %xmm0, %xmm1
36239 // jne .LBB5_2
36240 // xorps %xmm1, %xmm1
36241 // .LBB5_2:
36242 // jp .LBB5_4
36243 // movaps %xmm1, %xmm0
36244 // .LBB5_4:
36245 // retq
36246 //
36247 // because this custom-inserter would have generated:
36248 //
36249 // A
36250 // | \
36251 // | B
36252 // | /
36253 // C
36254 // | \
36255 // | D
36256 // | /
36257 // E
36258 //
36259 // A: X = ...; Y = ...
36260 // B: empty
36261 // C: Z = PHI [X, A], [Y, B]
36262 // D: empty
36263 // E: PHI [X, C], [Z, D]
36264 //
36265 // If we lower both CMOVs in a single step, we can instead generate:
36266 //
36267 // A
36268 // | \
36269 // | C
36270 // | /|
36271 // |/ |
36272 // | |
36273 // | D
36274 // | /
36275 // E
36276 //
36277 // A: X = ...; Y = ...
36278 // D: empty
36279 // E: PHI [X, A], [X, C], [Y, D]
36280 //
36281 // Which, in our sitofp/fcmp example, gives us something like:
36282 //
36283 // ucomiss %xmm1, %xmm0
36284 // movss <1.0f>, %xmm0
36285 // jne .LBB5_4
36286 // jp .LBB5_4
36287 // xorps %xmm0, %xmm0
36288 // .LBB5_4:
36289 // retq
36290 //
36291
36292 // We lower cascaded CMOV into two successive branches to the same block.
36293 // EFLAGS is used by both, so mark it as live in the second.
36294 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36295 MachineFunction *F = ThisMBB->getParent();
36296 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36297 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36298 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36299
36300 MachineFunction::iterator It = ++ThisMBB->getIterator();
36301 F->insert(It, FirstInsertedMBB);
36302 F->insert(It, SecondInsertedMBB);
36303 F->insert(It, SinkMBB);
36304
36305 // For a cascaded CMOV, we lower it to two successive branches to
36306 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36307 // the FirstInsertedMBB.
36308 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36309
36310 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36311 // live into the sink and copy blocks.
36312 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36313 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36314 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36315 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36316 SinkMBB->addLiveIn(X86::EFLAGS);
36317 }
36318
36319 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36320 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36321 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36322 ThisMBB->end());
36323 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36324
36325 // Fallthrough block for ThisMBB.
36326 ThisMBB->addSuccessor(FirstInsertedMBB);
36327 // The true block target of the first branch is always SinkMBB.
36328 ThisMBB->addSuccessor(SinkMBB);
36329 // Fallthrough block for FirstInsertedMBB.
36330 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36331 // The true block for the branch of FirstInsertedMBB.
36332 FirstInsertedMBB->addSuccessor(SinkMBB);
36333 // This is fallthrough.
36334 SecondInsertedMBB->addSuccessor(SinkMBB);
36335
36336 // Create the conditional branch instructions.
36337 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36338 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36339
36340 X86::CondCode SecondCC =
36341 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36342 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36343 .addMBB(SinkMBB)
36344 .addImm(SecondCC);
36345
36346 // SinkMBB:
36347 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36348 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36349 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36350 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36351 MachineInstrBuilder MIB =
36352 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36353 .addReg(Op1Reg)
36354 .addMBB(SecondInsertedMBB)
36355 .addReg(Op2Reg)
36356 .addMBB(ThisMBB);
36357
36358 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36359 // (the True operand of the SELECT_CC/CMOV nodes).
36360 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36361
36362 // Now remove the CMOVs.
36363 FirstCMOV.eraseFromParent();
36364 SecondCascadedCMOV.eraseFromParent();
36365
36366 return SinkMBB;
36367}
36368
36369 MachineBasicBlock *
36370 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36371 MachineBasicBlock *ThisMBB) const {
36372 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36373 const MIMetadata MIMD(MI);
36374
36375 // To "insert" a SELECT_CC instruction, we actually have to insert the
36376 // diamond control-flow pattern. The incoming instruction knows the
36377 // destination vreg to set, the condition code register to branch on, the
36378 // true/false values to select between and a branch opcode to use.
36379
36380 // ThisMBB:
36381 // ...
36382 // TrueVal = ...
36383 // cmpTY ccX, r1, r2
36384 // bCC copy1MBB
36385 // fallthrough --> FalseMBB
36386
36387 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36388 // as described above, by inserting a BB, and then making a PHI at the join
36389 // point to select the true and false operands of the CMOV in the PHI.
36390 //
36391 // The code also handles two different cases of multiple CMOV opcodes
36392 // in a row.
36393 //
36394 // Case 1:
36395 // In this case, there are multiple CMOVs in a row, all of which are based on
36396 // the same condition setting (or the exact opposite condition setting).
36397 // In this case we can lower all the CMOVs using a single inserted BB, and
36398 // then make a number of PHIs at the join point to model the CMOVs. The only
36399 // trickiness here is that in a case like:
36400 //
36401 // t2 = CMOV cond1 t1, f1
36402 // t3 = CMOV cond1 t2, f2
36403 //
36404 // when rewriting this into PHIs, we have to perform some renaming on the
36405 // temps since you cannot have a PHI operand refer to a PHI result earlier
36406 // in the same block. The "simple" but wrong lowering would be:
36407 //
36408 // t2 = PHI t1(BB1), f1(BB2)
36409 // t3 = PHI t2(BB1), f2(BB2)
36410 //
36411 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36412 // renaming is to note that on the path through BB1, t2 is really just a
36413 // copy of t1, and do that renaming, properly generating:
36414 //
36415 // t2 = PHI t1(BB1), f1(BB2)
36416 // t3 = PHI t1(BB1), f2(BB2)
36417 //
36418 // Case 2:
36419 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36420 // function - EmitLoweredCascadedSelect.
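  // For example (illustrative only), a pair of pseudos such as
  //   %s = CMOV_GR32 %f, %t, cc1
  //   %r = CMOV_GR32 killed %s, %t, cc2
  // matches the cascaded shape tested below: same opcode, the same second
  // value operand (T), and the first CMOV's result used (and killed) as the
  // second CMOV's first value operand (F).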
36421
36422 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36423 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36424 MachineInstr *LastCMOV = &MI;
36425 auto NextMIIt = next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
36426
36427 // Check for case 1, where there are multiple CMOVs with the same condition
36428 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36429 // number of jumps the most.
36430
36431 if (isCMOVPseudo(MI)) {
36432 // See if we have a string of CMOVS with the same condition. Skip over
36433 // intervening debug insts.
36434 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36435 (NextMIIt->getOperand(3).getImm() == CC ||
36436 NextMIIt->getOperand(3).getImm() == OppCC)) {
36437 LastCMOV = &*NextMIIt;
36438 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36439 }
36440 }
36441
36442 // Now check for case 2, but only if we didn't already find case 1, as
36443 // indicated by LastCMOV still pointing at MI.
36444 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36445 NextMIIt->getOpcode() == MI.getOpcode() &&
36446 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36447 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36448 NextMIIt->getOperand(1).isKill()) {
36449 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36450 }
36451
36452 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36453 MachineFunction *F = ThisMBB->getParent();
36454 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36455 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36456
36457 MachineFunction::iterator It = ++ThisMBB->getIterator();
36458 F->insert(It, FalseMBB);
36459 F->insert(It, SinkMBB);
36460
36461 // Set the call frame size on entry to the new basic blocks.
36462 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36463 FalseMBB->setCallFrameSize(CallFrameSize);
36464 SinkMBB->setCallFrameSize(CallFrameSize);
36465
36466 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36467 // live into the sink and copy blocks.
36468 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36469 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36470 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36471 FalseMBB->addLiveIn(X86::EFLAGS);
36472 SinkMBB->addLiveIn(X86::EFLAGS);
36473 }
36474
36475 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36476 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36477 MachineBasicBlock::iterator(LastCMOV));
36478 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36479 if (MI.isDebugInstr())
36480 SinkMBB->push_back(MI.removeFromParent());
36481
36482 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36483 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36484 std::next(MachineBasicBlock::iterator(LastCMOV)),
36485 ThisMBB->end());
36486 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36487
36488 // Fallthrough block for ThisMBB.
36489 ThisMBB->addSuccessor(FalseMBB);
36490 // The true block target of the first (or only) branch is always SinkMBB.
36491 ThisMBB->addSuccessor(SinkMBB);
36492 // Fallthrough block for FalseMBB.
36493 FalseMBB->addSuccessor(SinkMBB);
36494
36495 // Create the conditional branch instruction.
36496 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36497
36498 // SinkMBB:
36499 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36500 // ...
36501 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36502 MachineBasicBlock::iterator MIItEnd =
36503 std::next(MachineBasicBlock::iterator(LastCMOV));
36504 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36505
36506 // Now remove the CMOV(s).
36507 ThisMBB->erase(MIItBegin, MIItEnd);
36508
36509 return SinkMBB;
36510}
36511
36512static unsigned getSUBriOpcode(bool IsLP64) {
36513 if (IsLP64)
36514 return X86::SUB64ri32;
36515 else
36516 return X86::SUB32ri;
36517}
36518
36519 MachineBasicBlock *
36520 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36521 MachineBasicBlock *MBB) const {
36522 MachineFunction *MF = MBB->getParent();
36523 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36524 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36525 const MIMetadata MIMD(MI);
36526 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36527
36528 const unsigned ProbeSize = getStackProbeSize(*MF);
36529
36530 MachineRegisterInfo &MRI = MF->getRegInfo();
36531 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36532 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36533 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36534
36535 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36536 MF->insert(MBBIter, testMBB);
36537 MF->insert(MBBIter, blockMBB);
36538 MF->insert(MBBIter, tailMBB);
36539
36540 Register sizeVReg = MI.getOperand(1).getReg();
36541
36542 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36543
36544 Register TmpStackPtr = MRI.createVirtualRegister(
36545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36546 Register FinalStackPtr = MRI.createVirtualRegister(
36547 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36548
36549 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36550 .addReg(physSPReg);
36551 {
36552 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36553 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36554 .addReg(TmpStackPtr)
36555 .addReg(sizeVReg);
36556 }
36557
36558 // test rsp size
36559
36560 BuildMI(testMBB, MIMD,
36561 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36562 .addReg(FinalStackPtr)
36563 .addReg(physSPReg);
36564
36565 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36566 .addMBB(tailMBB)
36567 .addImm(X86::COND_GE);
36568 testMBB->addSuccessor(blockMBB);
36569 testMBB->addSuccessor(tailMBB);
36570
36571 // Touch the block, then extend it. This is the opposite order from a static
36572 // probe, where we allocate and then touch; it avoids having to probe the
36573 // tail of the static alloca. Possible scenarios are:
36574 //
36575 // + ---- <- ------------ <- ------------- <- ------------ +
36576 // | |
36577 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36578 // | |
36579 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36580 //
36581 // The property we want to enforce is to never have more than [page alloc] between two probes.
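  // Rough example (assuming the default 4096-byte probe size): a dynamic
  // allocation of 10000 bytes becomes a loop that touches the current stack
  // page, drops the stack pointer by 4096, and re-tests against FinalStackPtr,
  // so at most one page is ever allocated between two consecutive probes.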
36582
36583 const unsigned XORMIOpc =
36584 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36585 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36586 .addImm(0);
36587
36588 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36589 physSPReg)
36590 .addReg(physSPReg)
36591 .addImm(ProbeSize);
36592
36593 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36594 blockMBB->addSuccessor(testMBB);
36595
36596 // Replace original instruction by the expected stack ptr
36597 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36598 MI.getOperand(0).getReg())
36599 .addReg(FinalStackPtr);
36600
36601 tailMBB->splice(tailMBB->end(), MBB,
36602 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36603 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36604 MBB->addSuccessor(testMBB);
36605
36606 // Delete the original pseudo instruction.
36607 MI.eraseFromParent();
36608
36609 // And we're done.
36610 return tailMBB;
36611}
36612
36613 MachineBasicBlock *
36614 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36615 MachineBasicBlock *BB) const {
36616 MachineFunction *MF = BB->getParent();
36617 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36618 const MIMetadata MIMD(MI);
36619 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36620
36621 assert(MF->shouldSplitStack());
36622
36623 const bool Is64Bit = Subtarget.is64Bit();
36624 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36625
36626 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36627 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36628
36629 // BB:
36630 // ... [Till the alloca]
36631 // If stacklet is not large enough, jump to mallocMBB
36632 //
36633 // bumpMBB:
36634 // Allocate by subtracting from RSP
36635 // Jump to continueMBB
36636 //
36637 // mallocMBB:
36638 // Allocate by call to runtime
36639 //
36640 // continueMBB:
36641 // ...
36642 // [rest of original BB]
36643 //
36644
36645 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36646 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36647 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36648
36649 MachineRegisterInfo &MRI = MF->getRegInfo();
36650 const TargetRegisterClass *AddrRegClass =
36651 getRegClassFor(getPointerTy(MF->getDataLayout()));
36652
36653 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36654 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36655 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36656 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36657 sizeVReg = MI.getOperand(1).getReg(),
36658 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36659
36660 MachineFunction::iterator MBBIter = ++BB->getIterator();
36661
36662 MF->insert(MBBIter, bumpMBB);
36663 MF->insert(MBBIter, mallocMBB);
36664 MF->insert(MBBIter, continueMBB);
36665
36666 continueMBB->splice(continueMBB->begin(), BB,
36667 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36668 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36669
36670 // Add code to the main basic block to check if the stack limit has been hit,
36671 // and if so, jump to mallocMBB otherwise to bumpMBB.
36672 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36673 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36674 .addReg(tmpSPVReg).addReg(sizeVReg);
36675 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36676 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36677 .addReg(SPLimitVReg);
36678 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36679
36680 // bumpMBB simply decreases the stack pointer, since we know the current
36681 // stacklet has enough space.
36682 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36683 .addReg(SPLimitVReg);
36684 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36685 .addReg(SPLimitVReg);
36686 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36687
36688 // Calls into a routine in libgcc to allocate more space from the heap.
36689 const uint32_t *RegMask =
36690 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36691 if (IsLP64) {
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36693 .addReg(sizeVReg);
36694 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36695 .addExternalSymbol("__morestack_allocate_stack_space")
36696 .addRegMask(RegMask)
36697 .addReg(X86::RDI, RegState::Implicit)
36698 .addReg(X86::RAX, RegState::ImplicitDefine);
36699 } else if (Is64Bit) {
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36701 .addReg(sizeVReg);
36702 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36703 .addExternalSymbol("__morestack_allocate_stack_space")
36704 .addRegMask(RegMask)
36705 .addReg(X86::EDI, RegState::Implicit)
36706 .addReg(X86::EAX, RegState::ImplicitDefine);
36707 } else {
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36709 .addImm(12);
36710 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36711 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36712 .addExternalSymbol("__morestack_allocate_stack_space")
36713 .addRegMask(RegMask)
36714 .addReg(X86::EAX, RegState::ImplicitDefine);
36715 }
36716
36717 if (!Is64Bit)
36718 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36719 .addImm(16);
36720
36721 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36722 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36723 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36724
36725 // Set up the CFG correctly.
36726 BB->addSuccessor(bumpMBB);
36727 BB->addSuccessor(mallocMBB);
36728 mallocMBB->addSuccessor(continueMBB);
36729 bumpMBB->addSuccessor(continueMBB);
36730
36731 // Take care of the PHI nodes.
36732 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36733 MI.getOperand(0).getReg())
36734 .addReg(mallocPtrVReg)
36735 .addMBB(mallocMBB)
36736 .addReg(bumpSPPtrVReg)
36737 .addMBB(bumpMBB);
36738
36739 // Delete the original pseudo instruction.
36740 MI.eraseFromParent();
36741
36742 // And we're done.
36743 return continueMBB;
36744}
36745
36746 MachineBasicBlock *
36747 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36748 MachineBasicBlock *BB) const {
36749 MachineFunction *MF = BB->getParent();
36750 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36751 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36752 const MIMetadata MIMD(MI);
36753
36756 "SEH does not use catchret!");
36757
36758 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36759 if (!Subtarget.is32Bit())
36760 return BB;
36761
36762 // C++ EH creates a new target block to hold the restore code, and wires up
36763 // the new block to the return destination with a normal JMP_4.
36764 MachineBasicBlock *RestoreMBB =
36765 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36766 assert(BB->succ_size() == 1);
36767 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36768 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36769 BB->addSuccessor(RestoreMBB);
36770 MI.getOperand(0).setMBB(RestoreMBB);
36771
36772 // Marking this as an EH pad but not a funclet entry block causes PEI to
36773 // restore stack pointers in the block.
36774 RestoreMBB->setIsEHPad(true);
36775
36776 auto RestoreMBBI = RestoreMBB->begin();
36777 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36778 return BB;
36779}
36780
36781 MachineBasicBlock *
36782 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36783 MachineBasicBlock *BB) const {
36784 // This is pretty easy. We're taking the value that we received from
36785 // our load from the relocation, sticking it in either RDI (x86-64)
36786 // or EAX and doing an indirect call. The return value will then
36787 // be in the normal return register.
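  // Illustrative 64-bit expansion (assumed, following the Darwin TLV calling
  // convention; "_x" is a placeholder symbol):
  //   movq  _x@TLVP(%rip), %rdi
  //   callq *(%rdi)            ; address of the TLS variable returned in %rax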
36788 MachineFunction *F = BB->getParent();
36789 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36790 const MIMetadata MIMD(MI);
36791
36792 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36793 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36794
36795 // Get a register mask for the lowered call.
36796 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36797 // proper register mask.
36798 const uint32_t *RegMask =
36799 Subtarget.is64Bit() ?
36800 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36801 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36802 if (Subtarget.is64Bit()) {
36803 MachineInstrBuilder MIB =
36804 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36805 .addReg(X86::RIP)
36806 .addImm(0)
36807 .addReg(0)
36808 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36809 MI.getOperand(3).getTargetFlags())
36810 .addReg(0);
36811 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36812 addDirectMem(MIB, X86::RDI);
36813 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36814 } else if (!isPositionIndependent()) {
36815 MachineInstrBuilder MIB =
36816 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36817 .addReg(0)
36818 .addImm(0)
36819 .addReg(0)
36820 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36821 MI.getOperand(3).getTargetFlags())
36822 .addReg(0);
36823 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36824 addDirectMem(MIB, X86::EAX);
36825 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36826 } else {
36827 MachineInstrBuilder MIB =
36828 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36829 .addReg(TII->getGlobalBaseReg(F))
36830 .addImm(0)
36831 .addReg(0)
36832 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36833 MI.getOperand(3).getTargetFlags())
36834 .addReg(0);
36835 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36836 addDirectMem(MIB, X86::EAX);
36837 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36838 }
36839
36840 MI.eraseFromParent(); // The pseudo instruction is gone now.
36841 return BB;
36842}
36843
36844static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36845 switch (RPOpc) {
36846 case X86::INDIRECT_THUNK_CALL32:
36847 return X86::CALLpcrel32;
36848 case X86::INDIRECT_THUNK_CALL64:
36849 return X86::CALL64pcrel32;
36850 case X86::INDIRECT_THUNK_TCRETURN32:
36851 return X86::TCRETURNdi;
36852 case X86::INDIRECT_THUNK_TCRETURN64:
36853 return X86::TCRETURNdi64;
36854 }
36855 llvm_unreachable("not indirect thunk opcode");
36856}
36857
36858static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36859 Register Reg) {
36860 if (Subtarget.useRetpolineExternalThunk()) {
36861 // When using an external thunk for retpolines, we pick names that match the
36862 // names GCC happens to use as well. This helps simplify the implementation
36863 // of the thunks for kernels where they have no easy ability to create
36864 // aliases and are doing non-trivial configuration of the thunk's body. For
36865 // example, the Linux kernel will do boot-time hot patching of the thunk
36866 // bodies and cannot easily export aliases of these to loaded modules.
36867 //
36868 // Note that at any point in the future, we may need to change the semantics
36869 // of how we implement retpolines and at that time will likely change the
36870 // name of the called thunk. Essentially, there is no hard guarantee that
36871 // LLVM will generate calls to specific thunks, we merely make a best-effort
36872 // attempt to help out kernels and other systems where duplicating the
36873 // thunks is costly.
36874 switch (Reg.id()) {
36875 case X86::EAX:
36876 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36877 return "__x86_indirect_thunk_eax";
36878 case X86::ECX:
36879 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36880 return "__x86_indirect_thunk_ecx";
36881 case X86::EDX:
36882 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36883 return "__x86_indirect_thunk_edx";
36884 case X86::EDI:
36885 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36886 return "__x86_indirect_thunk_edi";
36887 case X86::R11:
36888 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36889 return "__x86_indirect_thunk_r11";
36890 }
36891 llvm_unreachable("unexpected reg for external indirect thunk");
36892 }
36893
36894 if (Subtarget.useRetpolineIndirectCalls() ||
36895 Subtarget.useRetpolineIndirectBranches()) {
36896 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36897 switch (Reg.id()) {
36898 case X86::EAX:
36899 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36900 return "__llvm_retpoline_eax";
36901 case X86::ECX:
36902 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36903 return "__llvm_retpoline_ecx";
36904 case X86::EDX:
36905 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36906 return "__llvm_retpoline_edx";
36907 case X86::EDI:
36908 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36909 return "__llvm_retpoline_edi";
36910 case X86::R11:
36911 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36912 return "__llvm_retpoline_r11";
36913 }
36914 llvm_unreachable("unexpected reg for retpoline");
36915 }
36916
36917 if (Subtarget.useLVIControlFlowIntegrity()) {
36918 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36919 return "__llvm_lvi_thunk_r11";
36920 }
36921 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36922}
36923
36924 MachineBasicBlock *
36925 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36926 MachineBasicBlock *BB) const {
36927 // Copy the virtual register into the R11 physical register and
36928 // call the retpoline thunk.
36929 const MIMetadata MIMD(MI);
36930 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36931 Register CalleeVReg = MI.getOperand(0).getReg();
36932 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36933
36934 // Find an available scratch register to hold the callee. On 64-bit, we can
36935 // just use R11, but we scan for uses anyway to ensure we don't generate
36936 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36937 // already a register use operand to the call to hold the callee. If none
36938 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36939 // register and ESI is the base pointer to realigned stack frames with VLAs.
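  // For instance (illustrative only), a 64-bit retpoline call through %vreg
  // is rewritten below into roughly:
  //   COPY R11 <- %vreg
  //   CALL64pcrel32 __llvm_retpoline_r11, implicit killed R11
  // with __x86_indirect_thunk_r11 substituted when an external
  // (GCC-compatible) thunk is requested.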
36940 SmallVector<Register, 3> AvailableRegs;
36941 if (Subtarget.is64Bit())
36942 AvailableRegs.push_back(X86::R11);
36943 else
36944 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36945
36946 // Zero out any registers that are already used.
36947 for (const auto &MO : MI.operands()) {
36948 if (MO.isReg() && MO.isUse())
36949 llvm::replace(AvailableRegs, MO.getReg(), Register());
36950 }
36951
36952 // Choose the first remaining non-zero available register.
36953 Register AvailableReg;
36954 for (Register MaybeReg : AvailableRegs) {
36955 if (MaybeReg) {
36956 AvailableReg = MaybeReg;
36957 break;
36958 }
36959 }
36960 if (!AvailableReg)
36961 report_fatal_error("calling convention incompatible with retpoline, no "
36962 "available registers");
36963
36964 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36965
36966 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36967 .addReg(CalleeVReg);
36968 MI.getOperand(0).ChangeToES(Symbol);
36969 MI.setDesc(TII->get(Opc));
36970 MachineInstrBuilder(*BB->getParent(), &MI)
36971 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36972 return BB;
36973}
36974
36975/// SetJmp implies future control flow change upon calling the corresponding
36976/// LongJmp.
36977/// Instead of using the 'return' instruction, the long jump fixes the stack and
36978/// performs an indirect branch. To do so it uses the registers that were stored
36979/// in the jump buffer (when calling SetJmp).
36980/// In case the shadow stack is enabled we need to fix it as well, because some
36981/// return addresses will be skipped.
36982/// The function will save the SSP for future fixing in the function
36983/// emitLongJmpShadowStackFix.
36984/// \sa emitLongJmpShadowStackFix
36985/// \param [in] MI The temporary Machine Instruction for the builtin.
36986/// \param [in] MBB The Machine Basic Block that will be modified.
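/// A minimal sketch of the emitted sequence on a 64-bit target (illustrative;
/// register names are placeholders):
///   xorq   %r, %r        ; zero a scratch register
///   rdsspq %r            ; read the current shadow stack pointer
///   movq   %r, 24(buf)   ; store it in slot 3 of the jump buffer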
36987void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36988 MachineBasicBlock *MBB) const {
36989 const MIMetadata MIMD(MI);
36990 MachineFunction *MF = MBB->getParent();
36991 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36992 MachineRegisterInfo &MRI = MF->getRegInfo();
36993 MachineInstrBuilder MIB;
36994
36995 // Memory Reference.
36996 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36997
36998 // Initialize a register with zero.
36999 MVT PVT = getPointerTy(MF->getDataLayout());
37000 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37001 Register ZReg = MRI.createVirtualRegister(PtrRC);
37002 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37003 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37004 .addDef(ZReg)
37005 .addReg(ZReg, RegState::Undef)
37006 .addReg(ZReg, RegState::Undef);
37007
37008 // Read the current SSP Register value to the zeroed register.
37009 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37010 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37011 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37012
37013 // Write the SSP register value to offset 3 in input memory buffer.
37014 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37015 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37016 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37017 const unsigned MemOpndSlot = 1;
37018 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37019 if (i == X86::AddrDisp)
37020 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37021 else
37022 MIB.add(MI.getOperand(MemOpndSlot + i));
37023 }
37024 MIB.addReg(SSPCopyReg);
37025 MIB.setMemRefs(MMOs);
37026}
37027
37028 MachineBasicBlock *
37029 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37030 MachineBasicBlock *MBB) const {
37031 const MIMetadata MIMD(MI);
37032 MachineFunction *MF = MBB->getParent();
37033 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37034 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37035 MachineRegisterInfo &MRI = MF->getRegInfo();
37036
37037 const BasicBlock *BB = MBB->getBasicBlock();
37038 MachineFunction::iterator I = ++MBB->getIterator();
37039
37040 // Memory Reference
37041 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37042
37043 unsigned MemOpndSlot = 0;
37044
37045 unsigned CurOp = 0;
37046
37047 Register DstReg = MI.getOperand(CurOp++).getReg();
37048 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37049 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37050 (void)TRI;
37051 Register mainDstReg = MRI.createVirtualRegister(RC);
37052 Register restoreDstReg = MRI.createVirtualRegister(RC);
37053
37054 MemOpndSlot = CurOp;
37055
37056 MVT PVT = getPointerTy(MF->getDataLayout());
37057 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37058 "Invalid Pointer Size!");
37059
37060 // For v = setjmp(buf), we generate
37061 //
37062 // thisMBB:
37063 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37064 // SjLjSetup restoreMBB
37065 //
37066 // mainMBB:
37067 // v_main = 0
37068 //
37069 // sinkMBB:
37070 // v = phi(main, restore)
37071 //
37072 // restoreMBB:
37073 // if base pointer being used, load it from frame
37074 // v_restore = 1
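  // Illustrative jmp_buf layout assumed by this lowering (word-sized slots,
  // not spelled out in the original source): slot 0 holds the frame pointer,
  // slot 1 the resume IP (the address of restoreMBB), slot 2 the stack
  // pointer, and slot 3 the shadow stack pointer when CET is enabled.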
37075
37076 MachineBasicBlock *thisMBB = MBB;
37077 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37078 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37079 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37080 MF->insert(I, mainMBB);
37081 MF->insert(I, sinkMBB);
37082 MF->push_back(restoreMBB);
37083 restoreMBB->setMachineBlockAddressTaken();
37084
37085 MachineInstrBuilder MIB;
37086
37087 // Transfer the remainder of BB and its successor edges to sinkMBB.
37088 sinkMBB->splice(sinkMBB->begin(), MBB,
37089 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37090 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37091
37092 // thisMBB:
37093 unsigned PtrStoreOpc = 0;
37094 Register LabelReg;
37095 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37096 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37097 !isPositionIndependent();
37098
37099 // Prepare IP either in reg or imm.
37100 if (!UseImmLabel) {
37101 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37102 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37103 LabelReg = MRI.createVirtualRegister(PtrRC);
37104 if (Subtarget.is64Bit()) {
37105 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37106 .addReg(X86::RIP)
37107 .addImm(0)
37108 .addReg(0)
37109 .addMBB(restoreMBB)
37110 .addReg(0);
37111 } else {
37112 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37113 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37114 .addReg(XII->getGlobalBaseReg(MF))
37115 .addImm(0)
37116 .addReg(0)
37117 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37118 .addReg(0);
37119 }
37120 } else
37121 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37122 // Store IP
37123 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37124 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37125 if (i == X86::AddrDisp)
37126 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37127 else
37128 MIB.add(MI.getOperand(MemOpndSlot + i));
37129 }
37130 if (!UseImmLabel)
37131 MIB.addReg(LabelReg);
37132 else
37133 MIB.addMBB(restoreMBB);
37134 MIB.setMemRefs(MMOs);
37135
37136 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37137 emitSetJmpShadowStackFix(MI, thisMBB);
37138 }
37139
37140 // Setup
37141 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37142 .addMBB(restoreMBB);
37143
37144 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37145 MIB.addRegMask(RegInfo->getNoPreservedMask());
37146 thisMBB->addSuccessor(mainMBB);
37147 thisMBB->addSuccessor(restoreMBB);
37148
37149 // mainMBB:
37150 // EAX = 0
37151 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37152 mainMBB->addSuccessor(sinkMBB);
37153
37154 // sinkMBB:
37155 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37156 .addReg(mainDstReg)
37157 .addMBB(mainMBB)
37158 .addReg(restoreDstReg)
37159 .addMBB(restoreMBB);
37160
37161 // restoreMBB:
37162 if (RegInfo->hasBasePointer(*MF)) {
37163 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37164 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37165 X86FI->setRestoreBasePointer(MF);
37166 Register FramePtr = RegInfo->getFrameRegister(*MF);
37167 Register BasePtr = RegInfo->getBaseRegister();
37168 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37169 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37170 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37171 .setMIFlag(MachineInstr::FrameSetup);
37172 }
37173 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37174 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37175 restoreMBB->addSuccessor(sinkMBB);
37176
37177 MI.eraseFromParent();
37178 return sinkMBB;
37179}
37180
37181/// Fix the shadow stack using the previously saved SSP pointer.
37182/// \sa emitSetJmpShadowStackFix
37183/// \param [in] MI The temporary Machine Instruction for the builtin.
37184/// \param [in] MBB The Machine Basic Block that will be modified.
37185/// \return The sink MBB that will perform the future indirect branch.
37186 MachineBasicBlock *
37187 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37188 MachineBasicBlock *MBB) const {
37189 const MIMetadata MIMD(MI);
37190 MachineFunction *MF = MBB->getParent();
37191 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37192 MachineRegisterInfo &MRI = MF->getRegInfo();
37193
37194 // Memory Reference
37195 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37196
37197 MVT PVT = getPointerTy(MF->getDataLayout());
37198 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37199
37200 // checkSspMBB:
37201 // xor vreg1, vreg1
37202 // rdssp vreg1
37203 // test vreg1, vreg1
37204 // je sinkMBB # Jump if Shadow Stack is not supported
37205 // fallMBB:
37206 // mov buf+24/12(%rip), vreg2
37207 // sub vreg1, vreg2
37208 // jbe sinkMBB # No need to fix the Shadow Stack
37209 // fixShadowMBB:
37210 // shr 3/2, vreg2
37211 // incssp vreg2 # fix the SSP according to the lower 8 bits
37212 // shr 8, vreg2
37213 // je sinkMBB
37214 // fixShadowLoopPrepareMBB:
37215 // shl vreg2
37216 // mov 128, vreg3
37217 // fixShadowLoopMBB:
37218 // incssp vreg3
37219 // dec vreg2
37220 // jne fixShadowLoopMBB # Iterate until you finish fixing
37221 // # the Shadow Stack
37222 // sinkMBB:
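  // Worked example (hypothetical numbers): if the saved SSP sits 0x2468
  // entries above the current one, the first incssp consumes the low 8 bits
  // (0x68 entries), the shifted remainder 0x24 selects the loop, and the loop
  // runs 0x24 * 2 = 0x48 iterations of "incssp 128" to cover the remaining
  // 0x2400 entries.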
37223
37224 MachineFunction::iterator I = ++MBB->getIterator();
37225 const BasicBlock *BB = MBB->getBasicBlock();
37226
37227 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37229 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37230 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37231 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37232 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37233 MF->insert(I, checkSspMBB);
37234 MF->insert(I, fallMBB);
37235 MF->insert(I, fixShadowMBB);
37236 MF->insert(I, fixShadowLoopPrepareMBB);
37237 MF->insert(I, fixShadowLoopMBB);
37238 MF->insert(I, sinkMBB);
37239
37240 // Transfer the remainder of BB and its successor edges to sinkMBB.
37241 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37242 MBB->end());
37243 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37244
37245 MBB->addSuccessor(checkSspMBB);
37246
37247 // Initialize a register with zero.
37248 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37249 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37250
37251 if (PVT == MVT::i64) {
37252 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37253 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37254 .addImm(0)
37255 .addReg(ZReg)
37256 .addImm(X86::sub_32bit);
37257 ZReg = TmpZReg;
37258 }
37259
37260 // Read the current SSP Register value to the zeroed register.
37261 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37262 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37263 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37264
37265 // Check whether the result of the SSP register is zero and jump directly
37266 // to the sink.
37267 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37268 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37269 .addReg(SSPCopyReg)
37270 .addReg(SSPCopyReg);
37271 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37272 .addMBB(sinkMBB)
37273 .addImm(X86::COND_E);
37274 checkSspMBB->addSuccessor(sinkMBB);
37275 checkSspMBB->addSuccessor(fallMBB);
37276
37277 // Reload the previously saved SSP register value.
37278 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37279 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37280 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37281 MachineInstrBuilder MIB =
37282 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37283 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37284 const MachineOperand &MO = MI.getOperand(i);
37285 if (i == X86::AddrDisp)
37286 MIB.addDisp(MO, SPPOffset);
37287 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37288 // preserve kill flags.
37289 MIB.addReg(MO.getReg());
37290 else
37291 MIB.add(MO);
37292 }
37293 MIB.setMemRefs(MMOs);
37294
37295 // Subtract the current SSP from the previous SSP.
37296 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37297 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37298 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37299 .addReg(PrevSSPReg)
37300 .addReg(SSPCopyReg);
37301
37302 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37303 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37304 .addMBB(sinkMBB)
37305 .addImm(X86::COND_BE);
37306 fallMBB->addSuccessor(sinkMBB);
37307 fallMBB->addSuccessor(fixShadowMBB);
37308
37309 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37310 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37311 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37312 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37313 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37314 .addReg(SspSubReg)
37315 .addImm(Offset);
37316
37317 // Increase the SSP, using only the lower 8 bits of the delta.
37318 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37319 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37320
37321 // Reset the lower 8 bits.
37322 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37323 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37324 .addReg(SspFirstShrReg)
37325 .addImm(8);
37326
37327 // Jump if the result of the shift is zero.
37328 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37329 .addMBB(sinkMBB)
37330 .addImm(X86::COND_E);
37331 fixShadowMBB->addSuccessor(sinkMBB);
37332 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37333
37334 // Do a single shift left.
37335 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37336 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37337 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37338 .addReg(SspSecondShrReg)
37339 .addImm(1);
37340
37341 // Save the value 128 to a register (will be used next with incssp).
37342 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37343 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37344 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37345 .addImm(128);
37346 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37347
37348 // Since incssp only looks at the lower 8 bits, we might need to do several
37349 // iterations of incssp until we finish fixing the shadow stack.
37350 Register DecReg = MRI.createVirtualRegister(PtrRC);
37351 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37352 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37353 .addReg(SspAfterShlReg)
37354 .addMBB(fixShadowLoopPrepareMBB)
37355 .addReg(DecReg)
37356 .addMBB(fixShadowLoopMBB);
37357
37358 // Every iteration we increase the SSP by 128.
37359 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37360
37361 // Every iteration we decrement the counter by 1.
37362 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37363 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37364
37365 // Jump if the counter is not zero yet.
37366 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37367 .addMBB(fixShadowLoopMBB)
37368 .addImm(X86::COND_NE);
37369 fixShadowLoopMBB->addSuccessor(sinkMBB);
37370 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37371
37372 return sinkMBB;
37373}
37374
37375 MachineBasicBlock *
37376 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37377 MachineBasicBlock *MBB) const {
37378 const MIMetadata MIMD(MI);
37379 MachineFunction *MF = MBB->getParent();
37380 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37381 MachineRegisterInfo &MRI = MF->getRegInfo();
37382
37383 // Memory Reference
37384 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37385
37386 MVT PVT = getPointerTy(MF->getDataLayout());
37387 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37388 "Invalid Pointer Size!");
37389
37390 const TargetRegisterClass *RC =
37391 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37392 Register Tmp = MRI.createVirtualRegister(RC);
37393 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37394 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37395 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37396 Register SP = RegInfo->getStackRegister();
37397
37398 MachineInstrBuilder MIB;
37399
37400 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37401 const int64_t SPOffset = 2 * PVT.getStoreSize();
37402
37403 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37404 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37405
37406 MachineBasicBlock *thisMBB = MBB;
37407
37408 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37409 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37410 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37411 }
37412
37413 // Reload FP
37414 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37415 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37416 const MachineOperand &MO = MI.getOperand(i);
37417 if (MO.isReg()) // Don't add the whole operand, we don't want to
37418 // preserve kill flags.
37419 MIB.addReg(MO.getReg());
37420 else
37421 MIB.add(MO);
37422 }
37423 MIB.setMemRefs(MMOs);
37425
37426 // Reload IP
37427 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37428 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37429 const MachineOperand &MO = MI.getOperand(i);
37430 if (i == X86::AddrDisp)
37431 MIB.addDisp(MO, LabelOffset);
37432 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37433 // preserve kill flags.
37434 MIB.addReg(MO.getReg());
37435 else
37436 MIB.add(MO);
37437 }
37438 MIB.setMemRefs(MMOs);
37439
37440 // Reload SP
37441 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37442 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37443 if (i == X86::AddrDisp)
37444 MIB.addDisp(MI.getOperand(i), SPOffset);
37445 else
37446 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37447 // the last instruction of the expansion.
37448 }
37449 MIB.setMemRefs(MMOs);
37451
37452 // Jump
37453 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37454
37455 MI.eraseFromParent();
37456 return thisMBB;
37457}
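// For reference, on a 64-bit target the expansion above is roughly:
//   movq   (buf), %rbp      ; reload FP
//   movq  8(buf), %tmp      ; reload IP  (LabelOffset = 1 * 8)
//   movq 16(buf), %rsp      ; reload SP  (SPOffset   = 2 * 8)
//   jmpq  *%tmp
// where 'buf' stands for the address operands taken from the LongJmp pseudo
// and %tmp for the virtual register Tmp.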
37458
37459void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37460 MachineBasicBlock *MBB,
37461 MachineBasicBlock *DispatchBB,
37462 int FI) const {
37463 const MIMetadata MIMD(MI);
37464 MachineFunction *MF = MBB->getParent();
37465 MachineRegisterInfo *MRI = &MF->getRegInfo();
37466 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37467
37468 MVT PVT = getPointerTy(MF->getDataLayout());
37469 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37470
37471 unsigned Op = 0;
37472 Register VR;
37473
37474 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37475 !isPositionIndependent();
37476
37477 if (UseImmLabel) {
37478 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37479 } else {
37480 const TargetRegisterClass *TRC =
37481 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37482 VR = MRI->createVirtualRegister(TRC);
37483 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37484
37485 if (Subtarget.is64Bit())
37486 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37487 .addReg(X86::RIP)
37488 .addImm(1)
37489 .addReg(0)
37490 .addMBB(DispatchBB)
37491 .addReg(0);
37492 else
37493 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37494 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37495 .addImm(1)
37496 .addReg(0)
37497 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37498 .addReg(0);
37499 }
37500
37501 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37502 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37503 if (UseImmLabel)
37504 MIB.addMBB(DispatchBB);
37505 else
37506 MIB.addReg(VR);
37507}
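// The net effect of SetupEntryBlockForSjLj is a single store into the
// function-context frame object at a fixed offset (56 on 64-bit, 36 on
// 32-bit): either an immediate block address (when UseImmLabel is set) or a
// register holding a LEA of DispatchBB.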
37508
37509MachineBasicBlock *
37510X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37511 MachineBasicBlock *BB) const {
37512 const MIMetadata MIMD(MI);
37513 MachineFunction *MF = BB->getParent();
37514 MachineRegisterInfo *MRI = &MF->getRegInfo();
37515 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37516 int FI = MF->getFrameInfo().getFunctionContextIndex();
37517
37518 // Get a mapping of the call site numbers to all of the landing pads they're
37519 // associated with.
37520 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37521 unsigned MaxCSNum = 0;
37522 for (auto &MBB : *MF) {
37523 if (!MBB.isEHPad())
37524 continue;
37525
37526 MCSymbol *Sym = nullptr;
37527 for (const auto &MI : MBB) {
37528 if (MI.isDebugInstr())
37529 continue;
37530
37531 assert(MI.isEHLabel() && "expected EH_LABEL");
37532 Sym = MI.getOperand(0).getMCSymbol();
37533 break;
37534 }
37535
37536 if (!MF->hasCallSiteLandingPad(Sym))
37537 continue;
37538
37539 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37540 CallSiteNumToLPad[CSI].push_back(&MBB);
37541 MaxCSNum = std::max(MaxCSNum, CSI);
37542 }
37543 }
37544
37545 // Get an ordered list of the machine basic blocks for the jump table.
37546 std::vector<MachineBasicBlock *> LPadList;
37547 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37548 LPadList.reserve(CallSiteNumToLPad.size());
37549
37550 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37551 for (auto &LP : CallSiteNumToLPad[CSI]) {
37552 LPadList.push_back(LP);
37553 InvokeBBs.insert_range(LP->predecessors());
37554 }
37555 }
37556
37557 assert(!LPadList.empty() &&
37558 "No landing pad destinations for the dispatch jump table!");
37559
37560 // Create the MBBs for the dispatch code.
37561
37562 // Shove the dispatch's address into the return slot in the function context.
37563 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37564 DispatchBB->setIsEHPad(true);
37565
37566 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37567 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37568 DispatchBB->addSuccessor(TrapBB);
37569
37570 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37571 DispatchBB->addSuccessor(DispContBB);
37572
37573 // Insert MBBs.
37574 MF->push_back(DispatchBB);
37575 MF->push_back(DispContBB);
37576 MF->push_back(TrapBB);
37577
37578 // Insert code into the entry block that creates and registers the function
37579 // context.
37580 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37581
37582 // Create the jump table and associated information
37583 unsigned JTE = getJumpTableEncoding();
37584 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37585 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37586
37587 const X86RegisterInfo &RI = TII->getRegisterInfo();
37588 // Add a register mask with no preserved registers. This results in all
37589 // registers being marked as clobbered.
37590 if (RI.hasBasePointer(*MF)) {
37591 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37592 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37593 MFI->setRestoreBasePointer(MF);
37594
37595 Register FP = RI.getFrameRegister(*MF);
37596 Register BP = RI.getBaseRegister();
37597 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37598 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37599 MFI->getRestoreBasePointerOffset())
37600 .addRegMask(RI.getNoPreservedMask());
37601 } else {
37602 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37603 .addRegMask(RI.getNoPreservedMask());
37604 }
37605
37606 // IReg is used as an index in a memory operand and therefore can't be SP
37607 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37608 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37609 Subtarget.is64Bit() ? 8 : 4);
37610 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37611 .addReg(IReg)
37612 .addImm(LPadList.size());
37613 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37614 .addMBB(TrapBB)
37615 .addImm(X86::COND_AE);
37616
37617 if (Subtarget.is64Bit()) {
37618 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37619 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37620
37621 // leaq .LJTI0_0(%rip), BReg
37622 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37623 .addReg(X86::RIP)
37624 .addImm(1)
37625 .addReg(0)
37626 .addJumpTableIndex(MJTI)
37627 .addReg(0);
37628 // movzx IReg64, IReg
37629 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37630 .addImm(0)
37631 .addReg(IReg)
37632 .addImm(X86::sub_32bit);
37633
37634 switch (JTE) {
37635 case MachineJumpTableInfo::EK_BlockAddress:
37636 // jmpq *(BReg,IReg64,8)
37637 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37638 .addReg(BReg)
37639 .addImm(8)
37640 .addReg(IReg64)
37641 .addImm(0)
37642 .addReg(0);
37643 break;
37644 case MachineJumpTableInfo::EK_LabelDifference32: {
37645 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37646 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37647 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37648
37649 // movl (BReg,IReg64,4), OReg
37650 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37651 .addReg(BReg)
37652 .addImm(4)
37653 .addReg(IReg64)
37654 .addImm(0)
37655 .addReg(0);
37656 // movsx OReg64, OReg
37657 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37658 .addReg(OReg);
37659 // addq BReg, OReg64, TReg
37660 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37661 .addReg(OReg64)
37662 .addReg(BReg);
37663 // jmpq *TReg
37664 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37665 break;
37666 }
37667 default:
37668 llvm_unreachable("Unexpected jump table encoding");
37669 }
37670 } else {
37671 // jmpl *.LJTI0_0(,IReg,4)
37672 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37673 .addReg(0)
37674 .addImm(4)
37675 .addReg(IReg)
37676 .addJumpTableIndex(MJTI)
37677 .addReg(0);
37678 }
37679
37680 // Add the jump table entries as successors to the MBB.
37681 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37682 for (auto &LP : LPadList)
37683 if (SeenMBBs.insert(LP).second)
37684 DispContBB->addSuccessor(LP);
37685
37686 // N.B. the order the invoke BBs are processed in doesn't matter here.
37687 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37688 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37689 for (MachineBasicBlock *MBB : InvokeBBs) {
37690 // Remove the landing pad successor from the invoke block and replace it
37691 // with the new dispatch block.
37692 // Keep a copy of Successors since it's modified inside the loop.
37693 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37694 MBB->succ_rend());
37695 // FIXME: Avoid quadratic complexity.
37696 for (auto *MBBS : Successors) {
37697 if (MBBS->isEHPad()) {
37698 MBB->removeSuccessor(MBBS);
37699 MBBLPads.push_back(MBBS);
37700 }
37701 }
37702
37703 MBB->addSuccessor(DispatchBB);
37704
37705 // Find the invoke call and mark all of the callee-saved registers as
37706 // 'implicit defined' so that they're spilled. This prevents code from
37707 // moving instructions to before the EH block, where they will never be
37708 // executed.
37709 for (auto &II : reverse(*MBB)) {
37710 if (!II.isCall())
37711 continue;
37712
37713 DenseSet<Register> DefRegs;
37714 for (auto &MOp : II.operands())
37715 if (MOp.isReg())
37716 DefRegs.insert(MOp.getReg());
37717
37718 MachineInstrBuilder MIB(*MF, &II);
37719 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37720 Register Reg = SavedRegs[RegIdx];
37721 if (!DefRegs.contains(Reg))
37722 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37723 }
37724
37725 break;
37726 }
37727 }
37728
37729 // Mark all former landing pads as non-landing pads. The dispatch is the only
37730 // landing pad now.
37731 for (auto &LP : MBBLPads)
37732 LP->setIsEHPad(false);
37733
37734 // The instruction is gone now.
37735 MI.eraseFromParent();
37736 return BB;
37737}
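// To recap the dispatch code built above: DispatchBB bounds-checks the
// call-site index loaded from the function context (trapping via TrapBB when
// it is out of range), and DispContBB performs an indirect branch through the
// jump table to the selected landing pad; every former landing pad is rewired
// to be a plain successor of DispContBB.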
37738
37739MachineBasicBlock *
37740X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37741 MachineBasicBlock *BB) const {
37742 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37743 // calls may require proper stack alignment.
37744 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37745 const MIMetadata MIMD(MI);
37746 MachineFunction &MF = *BB->getParent();
37747
37748 // Emit CALLSEQ_START right before the instruction.
37749 MF.getFrameInfo().setAdjustsStack(true);
37750 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37751 MachineInstrBuilder CallseqStart =
37752 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37753 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37754
37755 // Emit CALLSEQ_END right after the instruction.
37756 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37757 MachineInstrBuilder CallseqEnd =
37758 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37759 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37760
37761 return BB;
37762}
37763
37764MachineBasicBlock *
37765X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37766 MachineBasicBlock *BB) const {
37767 MachineFunction *MF = BB->getParent();
37768 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37769 const MIMetadata MIMD(MI);
37770
37771 auto TMMImmToTMMReg = [](unsigned Imm) {
37772 assert (Imm < 8 && "Illegal tmm index");
37773 return X86::TMM0 + Imm;
37774 };
37775 auto TMMImmToTMMPair = [](unsigned Imm) {
37776 assert(Imm < 8 && "Illegal tmm pair index.");
37777 return X86::TMM0_TMM1 + Imm / 2;
37778 };
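  // For example, an immediate of 3 selects X86::TMM3 via TMMImmToTMMReg, and
  // either 2 or 3 selects the X86::TMM2_TMM3 pair via TMMImmToTMMPair (both
  // mappings rely on the TMM register enums being laid out consecutively).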
37779 switch (MI.getOpcode()) {
37780 default:
37781 llvm_unreachable("Unexpected instr type to insert");
37782 case X86::INDIRECT_THUNK_CALL32:
37783 case X86::INDIRECT_THUNK_CALL64:
37784 case X86::INDIRECT_THUNK_TCRETURN32:
37785 case X86::INDIRECT_THUNK_TCRETURN64:
37786 return EmitLoweredIndirectThunk(MI, BB);
37787 case X86::CATCHRET:
37788 return EmitLoweredCatchRet(MI, BB);
37789 case X86::SEG_ALLOCA_32:
37790 case X86::SEG_ALLOCA_64:
37791 return EmitLoweredSegAlloca(MI, BB);
37792 case X86::PROBED_ALLOCA_32:
37793 case X86::PROBED_ALLOCA_64:
37794 return EmitLoweredProbedAlloca(MI, BB);
37795 case X86::TLSCall_32:
37796 case X86::TLSCall_64:
37797 return EmitLoweredTLSCall(MI, BB);
37798 case X86::CMOV_FR16:
37799 case X86::CMOV_FR16X:
37800 case X86::CMOV_FR32:
37801 case X86::CMOV_FR32X:
37802 case X86::CMOV_FR64:
37803 case X86::CMOV_FR64X:
37804 case X86::CMOV_GR8:
37805 case X86::CMOV_GR16:
37806 case X86::CMOV_GR32:
37807 case X86::CMOV_RFP32:
37808 case X86::CMOV_RFP64:
37809 case X86::CMOV_RFP80:
37810 case X86::CMOV_VR64:
37811 case X86::CMOV_VR128:
37812 case X86::CMOV_VR128X:
37813 case X86::CMOV_VR256:
37814 case X86::CMOV_VR256X:
37815 case X86::CMOV_VR512:
37816 case X86::CMOV_VK1:
37817 case X86::CMOV_VK2:
37818 case X86::CMOV_VK4:
37819 case X86::CMOV_VK8:
37820 case X86::CMOV_VK16:
37821 case X86::CMOV_VK32:
37822 case X86::CMOV_VK64:
37823 return EmitLoweredSelect(MI, BB);
37824
37825 case X86::FP80_ADDr:
37826 case X86::FP80_ADDm32: {
37827 // Change the floating point control register to use double extended
37828 // precision when performing the addition.
37829 int OrigCWFrameIdx =
37830 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37831 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37832 OrigCWFrameIdx);
37833
37834 // Load the old value of the control word...
37835 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37836 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37837 OrigCWFrameIdx);
37838
37839 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37840 // precision.
37841 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37842 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37843 .addReg(OldCW, RegState::Kill)
37844 .addImm(0x300);
37845
37846 // Extract to 16 bits.
37847 Register NewCW16 =
37848 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37849 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37850 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37851
37852 // Prepare memory for FLDCW.
37853 int NewCWFrameIdx =
37854 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37855 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37856 NewCWFrameIdx)
37857 .addReg(NewCW16, RegState::Kill);
37858
37859 // Reload the modified control word now...
37860 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37861 NewCWFrameIdx);
37862
37863 // Do the addition.
37864 if (MI.getOpcode() == X86::FP80_ADDr) {
37865 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37866 .add(MI.getOperand(0))
37867 .add(MI.getOperand(1))
37868 .add(MI.getOperand(2));
37869 } else {
37870 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37871 .add(MI.getOperand(0))
37872 .add(MI.getOperand(1))
37873 .add(MI.getOperand(2))
37874 .add(MI.getOperand(3))
37875 .add(MI.getOperand(4))
37876 .add(MI.getOperand(5))
37877 .add(MI.getOperand(6));
37878 }
37879
37880 // Reload the original control word now.
37881 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37882 OrigCWFrameIdx);
37883
37884 MI.eraseFromParent(); // The pseudo instruction is gone now.
37885 return BB;
37886 }
37887
37888 case X86::FP32_TO_INT16_IN_MEM:
37889 case X86::FP32_TO_INT32_IN_MEM:
37890 case X86::FP32_TO_INT64_IN_MEM:
37891 case X86::FP64_TO_INT16_IN_MEM:
37892 case X86::FP64_TO_INT32_IN_MEM:
37893 case X86::FP64_TO_INT64_IN_MEM:
37894 case X86::FP80_TO_INT16_IN_MEM:
37895 case X86::FP80_TO_INT32_IN_MEM:
37896 case X86::FP80_TO_INT64_IN_MEM: {
37897 // Change the floating point control register to use "round towards zero"
37898 // mode when truncating to an integer value.
37899 int OrigCWFrameIdx =
37900 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37901 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37902 OrigCWFrameIdx);
37903
37904 // Load the old value of the control word...
37905 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37906 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37907 OrigCWFrameIdx);
37908
37909 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37910 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37911 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37912 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37913
37914 // Extract to 16 bits.
37915 Register NewCW16 =
37916 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37917 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37918 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37919
37920 // Prepare memory for FLDCW.
37921 int NewCWFrameIdx =
37922 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37923 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37924 NewCWFrameIdx)
37925 .addReg(NewCW16, RegState::Kill);
37926
37927 // Reload the modified control word now...
37928 addFrameReference(BuildMI(*BB, MI, MIMD,
37929 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37930
37931 // Get the X86 opcode to use.
37932 unsigned Opc;
37933 switch (MI.getOpcode()) {
37934 // clang-format off
37935 default: llvm_unreachable("illegal opcode!");
37936 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37937 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37938 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37939 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37940 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37941 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37942 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37943 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37944 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37945 // clang-format on
37946 }
37947
37948 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37949 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37950 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37951
37952 // Reload the original control word now.
37953 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37954 OrigCWFrameIdx);
37955
37956 MI.eraseFromParent(); // The pseudo instruction is gone now.
37957 return BB;
37958 }
37959
37960 // xbegin
37961 case X86::XBEGIN:
37962 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37963
37964 case X86::VAARG_64:
37965 case X86::VAARG_X32:
37966 return EmitVAARGWithCustomInserter(MI, BB);
37967
37968 case X86::EH_SjLj_SetJmp32:
37969 case X86::EH_SjLj_SetJmp64:
37970 return emitEHSjLjSetJmp(MI, BB);
37971
37972 case X86::EH_SjLj_LongJmp32:
37973 case X86::EH_SjLj_LongJmp64:
37974 return emitEHSjLjLongJmp(MI, BB);
37975
37976 case X86::Int_eh_sjlj_setup_dispatch:
37977 return EmitSjLjDispatchBlock(MI, BB);
37978
37979 case TargetOpcode::STATEPOINT:
37980 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37981 // this point in the process. We diverge later.
37982 return emitPatchPoint(MI, BB);
37983
37984 case TargetOpcode::STACKMAP:
37985 case TargetOpcode::PATCHPOINT:
37986 return emitPatchPoint(MI, BB);
37987
37988 case TargetOpcode::PATCHABLE_EVENT_CALL:
37989 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37990 return emitPatchableEventCall(MI, BB);
37991
37992 case X86::LCMPXCHG8B: {
37993 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37994 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
37995 // requires a memory operand. If the current architecture is i686 and the
37996 // current function needs a base pointer - which is ESI for i686 - the
37997 // register allocator would not be able to allocate registers for an
37998 // address of the form X(%reg, %reg, Y): there would never be enough
37999 // unreserved registers during regalloc (without the base pointer the only
38000 // remaining option would be X(%edi, %esi, Y)).
38001 // We give the register allocator a hand by precomputing the address in
38002 // a new vreg using LEA.
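  // In other words, a sketch of the rewrite performed below:
  //   cmpxchg8b X(%reg, %reg, Y)
  // becomes
  //   leal      X(%reg, %reg, Y), %vreg
  //   cmpxchg8b (%vreg)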
38003
38004 // If it is not i686 or there is no base pointer - nothing to do here.
38005 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38006 return BB;
38007
38008 // Even though this code does not necessarily need the base pointer to
38009 // be ESI, we check for that. The reason: if this assert fails, some
38010 // changes have happened in the compiler's base pointer handling, which
38011 // most probably have to be addressed somehow here.
38012 assert(TRI->getBaseRegister() == X86::ESI &&
38013 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38014 "base pointer in mind");
38015
38016 MachineRegisterInfo &MRI = MF->getRegInfo();
38017 MVT SPTy = getPointerTy(MF->getDataLayout());
38018 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38019 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38020
38021 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38022 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38023 // does not use index register.
38024 if (AM.IndexReg == X86::NoRegister)
38025 return BB;
38026
38027 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38028 // four operand definitions that are E[ABCD] registers. We skip them and
38029 // then insert the LEA.
38030 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38031 while (RMBBI != BB->rend() &&
38032 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38033 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38034 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38035 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38036 ++RMBBI;
38037 }
38038 MachineBasicBlock::iterator MBBI(RMBBI.base());
38039 addFullAddress(
38040 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38041
38042 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38043
38044 return BB;
38045 }
38046 case X86::LCMPXCHG16B_NO_RBX: {
38047 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38048 Register BasePtr = TRI->getBaseRegister();
38049 if (TRI->hasBasePointer(*MF) &&
38050 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38051 if (!BB->isLiveIn(BasePtr))
38052 BB->addLiveIn(BasePtr);
38053 // Save RBX into a virtual register.
38054 Register SaveRBX =
38055 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38056 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38057 .addReg(X86::RBX);
38058 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38059 MachineInstrBuilder MIB =
38060 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38061 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38062 MIB.add(MI.getOperand(Idx));
38063 MIB.add(MI.getOperand(X86::AddrNumOperands));
38064 MIB.addReg(SaveRBX);
38065 } else {
38066 // Simple case, just copy the virtual register to RBX.
38067 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38068 .add(MI.getOperand(X86::AddrNumOperands));
38069 MachineInstrBuilder MIB =
38070 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38071 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38072 MIB.add(MI.getOperand(Idx));
38073 }
38074 MI.eraseFromParent();
38075 return BB;
38076 }
38077 case X86::MWAITX: {
38078 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38079 Register BasePtr = TRI->getBaseRegister();
38080 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38081 // If there is no need to save the base pointer, we generate MWAITXrrr;
38082 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38083 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38085 .addReg(MI.getOperand(0).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38087 .addReg(MI.getOperand(1).getReg());
38088 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38089 .addReg(MI.getOperand(2).getReg());
38090 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38091 MI.eraseFromParent();
38092 } else {
38093 if (!BB->isLiveIn(BasePtr)) {
38094 BB->addLiveIn(BasePtr);
38095 }
38096 // Parameters can be copied into ECX and EAX but not EBX yet.
38097 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38098 .addReg(MI.getOperand(0).getReg());
38099 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38100 .addReg(MI.getOperand(1).getReg());
38101 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38102 // Save RBX into a virtual register.
38103 Register SaveRBX =
38104 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38105 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38106 .addReg(X86::RBX);
38107 // Generate mwaitx pseudo.
38108 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38109 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38110 .addDef(Dst) // Destination tied in with SaveRBX.
38111 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38112 .addUse(SaveRBX); // Save of base pointer.
38113 MI.eraseFromParent();
38114 }
38115 return BB;
38116 }
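  // Note on the MWAITX lowering above: MWAITX takes one of its operands in
  // EBX, so when EBX/RBX also serves as the base pointer it cannot simply be
  // clobbered; the MWAITX_SAVE_RBX pseudo receives both the desired EBX value
  // and the saved base pointer (presumably restoring RBX when the pseudo is
  // expanded later).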
38117 case TargetOpcode::PREALLOCATED_SETUP: {
38118 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38119 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38120 MFI->setHasPreallocatedCall(true);
38121 int64_t PreallocatedId = MI.getOperand(0).getImm();
38122 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38123 assert(StackAdjustment != 0 && "0 stack adjustment");
38124 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38125 << StackAdjustment << "\n");
38126 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38127 .addReg(X86::ESP)
38128 .addImm(StackAdjustment);
38129 MI.eraseFromParent();
38130 return BB;
38131 }
38132 case TargetOpcode::PREALLOCATED_ARG: {
38133 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38134 int64_t PreallocatedId = MI.getOperand(1).getImm();
38135 int64_t ArgIdx = MI.getOperand(2).getImm();
38136 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38137 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38138 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38139 << ", arg offset " << ArgOffset << "\n");
38140 // stack pointer + offset
38141 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38142 MI.getOperand(0).getReg()),
38143 X86::ESP, false, ArgOffset);
38144 MI.eraseFromParent();
38145 return BB;
38146 }
38147 case X86::PTDPBSSD:
38148 case X86::PTDPBSUD:
38149 case X86::PTDPBUSD:
38150 case X86::PTDPBUUD:
38151 case X86::PTDPBF16PS:
38152 case X86::PTDPFP16PS:
38153 case X86::PTCMMIMFP16PS:
38154 case X86::PTCMMRLFP16PS:
38155 case X86::PTDPBF8PS:
38156 case X86::PTDPBHF8PS:
38157 case X86::PTDPHBF8PS:
38158 case X86::PTDPHF8PS:
38159 case X86::PTTDPBF16PS:
38160 case X86::PTTDPFP16PS:
38161 case X86::PTTCMMIMFP16PS:
38162 case X86::PTTCMMRLFP16PS:
38163 case X86::PTCONJTCMMIMFP16PS:
38164 case X86::PTMMULTF32PS:
38165 case X86::PTTMMULTF32PS: {
38166 unsigned Opc;
38167 switch (MI.getOpcode()) {
38168 default: llvm_unreachable("illegal opcode!");
38169 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38170 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38171 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38172 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38173 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38174 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38175 case X86::PTCMMIMFP16PS:
38176 Opc = X86::TCMMIMFP16PS;
38177 break;
38178 case X86::PTCMMRLFP16PS:
38179 Opc = X86::TCMMRLFP16PS;
38180 break;
38181 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38182 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38183 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38184 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38185 case X86::PTTDPBF16PS:
38186 Opc = X86::TTDPBF16PS;
38187 break;
38188 case X86::PTTDPFP16PS:
38189 Opc = X86::TTDPFP16PS;
38190 break;
38191 case X86::PTTCMMIMFP16PS:
38192 Opc = X86::TTCMMIMFP16PS;
38193 break;
38194 case X86::PTTCMMRLFP16PS:
38195 Opc = X86::TTCMMRLFP16PS;
38196 break;
38197 case X86::PTCONJTCMMIMFP16PS:
38198 Opc = X86::TCONJTCMMIMFP16PS;
38199 break;
38200 case X86::PTMMULTF32PS:
38201 Opc = X86::TMMULTF32PS;
38202 break;
38203 case X86::PTTMMULTF32PS:
38204 Opc = X86::TTMMULTF32PS;
38205 break;
38206 }
38207
38208 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38209 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38210 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38211 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38212 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38213
38214 MI.eraseFromParent(); // The pseudo is gone now.
38215 return BB;
38216 }
38217 case X86::PTILEZERO: {
38218 unsigned Imm = MI.getOperand(0).getImm();
38219 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38220 MI.eraseFromParent(); // The pseudo is gone now.
38221 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38222 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38223 return BB;
38224 }
38225 case X86::PTILEZEROV: {
38226 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38227 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38228 return BB;
38229 }
38230 case X86::PTILELOADDRS:
38231 case X86::PTILELOADDRST1:
38232 case X86::PTILELOADD:
38233 case X86::PTILELOADDT1:
38234 case X86::PTILESTORED: {
38235 unsigned Opc;
38236 switch (MI.getOpcode()) {
38237 default: llvm_unreachable("illegal opcode!");
38238#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38239 case X86::PTILELOADD:
38240 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38241 break;
38242 case X86::PTILELOADDT1:
38243 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38244 break;
38245 case X86::PTILESTORED:
38246 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38247 break;
38248 case X86::PTILELOADDRS:
38249 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38250 break;
38251 case X86::PTILELOADDRST1:
38252 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38253 break;
38254 }
38255#undef GET_EGPR_IF_ENABLED
38256
38257 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38258 unsigned CurOp = 0;
38259 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38260 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38261 RegState::Define);
38262
38263 MIB.add(MI.getOperand(CurOp++)); // base
38264 MIB.add(MI.getOperand(CurOp++)); // scale
38265 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38266 MIB.add(MI.getOperand(CurOp++)); // displacement
38267 MIB.add(MI.getOperand(CurOp++)); // segment
38268
38269 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38270 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38271 RegState::Undef);
38272
38273 MI.eraseFromParent(); // The pseudo is gone now.
38274 return BB;
38275 }
38276 case X86::PT2RPNTLVWZ0:
38277 case X86::PT2RPNTLVWZ0T1:
38278 case X86::PT2RPNTLVWZ1:
38279 case X86::PT2RPNTLVWZ1T1:
38280 case X86::PT2RPNTLVWZ0RS:
38281 case X86::PT2RPNTLVWZ0RST1:
38282 case X86::PT2RPNTLVWZ1RS:
38283 case X86::PT2RPNTLVWZ1RST1: {
38284 const DebugLoc &DL = MI.getDebugLoc();
38285 unsigned Opc;
38286#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38287 switch (MI.getOpcode()) {
38288 default:
38289 llvm_unreachable("Unexpected instruction!");
38290 case X86::PT2RPNTLVWZ0:
38291 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38292 break;
38293 case X86::PT2RPNTLVWZ0T1:
38294 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38295 break;
38296 case X86::PT2RPNTLVWZ1:
38297 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38298 break;
38299 case X86::PT2RPNTLVWZ1T1:
38300 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38301 break;
38302 case X86::PT2RPNTLVWZ0RS:
38303 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38304 break;
38305 case X86::PT2RPNTLVWZ0RST1:
38306 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38307 break;
38308 case X86::PT2RPNTLVWZ1RS:
38309 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38310 break;
38311 case X86::PT2RPNTLVWZ1RST1:
38312 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38313 break;
38314 }
38315#undef GET_EGPR_IF_ENABLED
38316 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38317 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38318
38319 MIB.add(MI.getOperand(1)); // base
38320 MIB.add(MI.getOperand(2)); // scale
38321 MIB.add(MI.getOperand(3)); // index
38322 MIB.add(MI.getOperand(4)); // displacement
38323 MIB.add(MI.getOperand(5)); // segment
38324 MI.eraseFromParent(); // The pseudo is gone now.
38325 return BB;
38326 }
38327 case X86::PTTRANSPOSED:
38328 case X86::PTCONJTFP16: {
38329 const DebugLoc &DL = MI.getDebugLoc();
38330 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38331 : X86::TCONJTFP16;
38332
38333 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38334 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38335 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38336
38337 MI.eraseFromParent(); // The pseudo is gone now.
38338 return BB;
38339 }
38340 case X86::PTCVTROWPS2BF16Hrri:
38341 case X86::PTCVTROWPS2BF16Lrri:
38342 case X86::PTCVTROWPS2PHHrri:
38343 case X86::PTCVTROWPS2PHLrri:
38344 case X86::PTCVTROWD2PSrri:
38345 case X86::PTILEMOVROWrri: {
38346 const DebugLoc &DL = MI.getDebugLoc();
38347 unsigned Opc;
38348 switch (MI.getOpcode()) {
38349 default:
38350 llvm_unreachable("Unexpected instruction!");
38351 case X86::PTCVTROWD2PSrri:
38352 Opc = X86::TCVTROWD2PSrri;
38353 break;
38354 case X86::PTCVTROWPS2BF16Hrri:
38355 Opc = X86::TCVTROWPS2BF16Hrri;
38356 break;
38357 case X86::PTCVTROWPS2PHHrri:
38358 Opc = X86::TCVTROWPS2PHHrri;
38359 break;
38360 case X86::PTCVTROWPS2BF16Lrri:
38361 Opc = X86::TCVTROWPS2BF16Lrri;
38362 break;
38363 case X86::PTCVTROWPS2PHLrri:
38364 Opc = X86::TCVTROWPS2PHLrri;
38365 break;
38366 case X86::PTILEMOVROWrri:
38367 Opc = X86::TILEMOVROWrri;
38368 break;
38369 }
38370 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38371 MIB.add(MI.getOperand(0));
38372 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38373 MIB.addImm(MI.getOperand(2).getImm());
38374
38375 MI.eraseFromParent(); // The pseudo is gone now.
38376 return BB;
38377 }
38378 case X86::PTCVTROWPS2BF16Hrre:
38379 case X86::PTCVTROWPS2BF16Lrre:
38380 case X86::PTCVTROWPS2PHHrre:
38381 case X86::PTCVTROWPS2PHLrre:
38382 case X86::PTCVTROWD2PSrre:
38383 case X86::PTILEMOVROWrre: {
38384 const DebugLoc &DL = MI.getDebugLoc();
38385 unsigned Opc;
38386 switch (MI.getOpcode()) {
38387 default:
38388 llvm_unreachable("Unexpected instruction!");
38389 case X86::PTCVTROWD2PSrre:
38390 Opc = X86::TCVTROWD2PSrre;
38391 break;
38392 case X86::PTCVTROWPS2BF16Hrre:
38393 Opc = X86::TCVTROWPS2BF16Hrre;
38394 break;
38395 case X86::PTCVTROWPS2BF16Lrre:
38396 Opc = X86::TCVTROWPS2BF16Lrre;
38397 break;
38398 case X86::PTCVTROWPS2PHHrre:
38399 Opc = X86::TCVTROWPS2PHHrre;
38400 break;
38401 case X86::PTCVTROWPS2PHLrre:
38402 Opc = X86::TCVTROWPS2PHLrre;
38403 break;
38404 case X86::PTILEMOVROWrre:
38405 Opc = X86::TILEMOVROWrre;
38406 break;
38407 }
38408 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38409 MIB.add(MI.getOperand(0));
38410 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38411 MIB.add(MI.getOperand(2));
38412
38413 MI.eraseFromParent(); // The pseudo is gone now.
38414 return BB;
38415 }
38416 }
38417}
38418
38419//===----------------------------------------------------------------------===//
38420// X86 Optimization Hooks
38421//===----------------------------------------------------------------------===//
38422
38423bool
38424X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38425 const APInt &DemandedBits,
38426 const APInt &DemandedElts,
38427 TargetLoweringOpt &TLO) const {
38428 EVT VT = Op.getValueType();
38429 unsigned Opcode = Op.getOpcode();
38430 unsigned EltSize = VT.getScalarSizeInBits();
38431
38432 if (VT.isVector()) {
38433 // If the constant is only all signbits in the active bits, then we should
38434 // extend it to the entire constant to allow it to act as a boolean constant
38435 // vector.
38436 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38437 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38438 return false;
38439 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38440 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38441 continue;
38442 const APInt &Val = V.getConstantOperandAPInt(i);
38443 if (Val.getBitWidth() > Val.getNumSignBits() &&
38444 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38445 return true;
38446 }
38447 return false;
38448 };
38449 // For vectors - if we have a constant, then try to sign extend.
38450 // TODO: Handle AND cases.
38451 unsigned ActiveBits = DemandedBits.getActiveBits();
38452 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38453 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38454 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38455 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38456 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38457 VT.getVectorNumElements());
38458 SDValue NewC =
38459 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38460 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38461 SDValue NewOp =
38462 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38463 return TLO.CombineTo(Op, NewOp);
38464 }
38465 return false;
38466 }
38467
38468 // Only optimize Ands to prevent shrinking a constant that could be
38469 // matched by movzx.
38470 if (Opcode != ISD::AND)
38471 return false;
38472
38473 // Make sure the RHS really is a constant.
38474 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38475 if (!C)
38476 return false;
38477
38478 const APInt &Mask = C->getAPIntValue();
38479
38480 // Clear all non-demanded bits initially.
38481 APInt ShrunkMask = Mask & DemandedBits;
38482
38483 // Find the width of the shrunk mask.
38484 unsigned Width = ShrunkMask.getActiveBits();
38485
38486 // If the mask is all 0s there's nothing to do here.
38487 if (Width == 0)
38488 return false;
38489
38490 // Find the next power of 2 width, rounding up to a byte.
38491 Width = llvm::bit_ceil(std::max(Width, 8U));
38492 // Truncate the width to size to handle illegal types.
38493 Width = std::min(Width, EltSize);
38494
38495 // Calculate a possible zero extend mask for this constant.
38496 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38497
38498 // If we aren't changing the mask, just return true to keep it and prevent
38499 // the caller from optimizing.
38500 if (ZeroExtendMask == Mask)
38501 return true;
38502
38503 // Make sure the new mask can be represented by a combination of mask bits
38504 // and non-demanded bits.
38505 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38506 return false;
38507
38508 // Replace the constant with the zero extend mask.
38509 SDLoc DL(Op);
38510 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38511 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38512 return TLO.CombineTo(Op, NewOp);
38513}
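// Worked example for the scalar path above: with an AND mask of 0xF0 and only
// bits 4-7 demanded, ShrunkMask is 0xF0, Width rounds up to 8, and
// ZeroExtendMask becomes 0xFF; since the extra low bits are not demanded, the
// constant is widened to 0xFF, which the selector can match as a movzx
// instead of materializing the original mask.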
38514
38515static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38516 KnownBits &Known,
38517 const APInt &DemandedElts,
38518 const SelectionDAG &DAG, unsigned Depth) {
38519 KnownBits Known2;
38520 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38521 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38522 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38523 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38524 Known = KnownBits::abdu(Known, Known2).zext(16);
38525 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38526 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38527 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38528 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38529 Known = Known.zext(64);
38530}
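// PSADBW sums the absolute differences of eight unsigned bytes into each
// 64-bit lane. The helper above models this by computing the known bits of a
// single |a - b| term (abdu, zero-extended to 16 bits) and then doubling it
// three times, mirroring the (((D0+D1)+(D2+D3))+((D4+D5)+(D6+D7))) reduction,
// before zero-extending to the 64-bit result width.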
38531
38532static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38533 KnownBits &Known,
38534 const APInt &DemandedElts,
38535 const SelectionDAG &DAG,
38536 unsigned Depth) {
38537 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38538
38539 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38540 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38541 APInt DemandedLoElts =
38542 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38543 APInt DemandedHiElts =
38544 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38545 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38546 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38547 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38548 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38549 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38550 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38551 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38552}
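// PMADDWD forms each i32 result as lo*lo' + hi*hi' of adjacent sign-extended
// i16 elements; the helper above therefore splits the demanded source
// elements into even (lo) and odd (hi) halves, multiplies their known bits in
// 32 bits, and adds the two products without NSW/NUW assumptions.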
38553
38554static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38555 KnownBits &Known,
38556 const APInt &DemandedElts,
38557 const SelectionDAG &DAG,
38558 unsigned Depth) {
38559 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38560
38561 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38562 // pairs.
38563 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38564 APInt DemandedLoElts =
38565 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38566 APInt DemandedHiElts =
38567 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38568 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38569 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38570 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38571 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38572 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38573 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38574 Known = KnownBits::sadd_sat(Lo, Hi);
38575}
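// PMADDUBSW multiplies unsigned bytes from the first operand with signed
// bytes from the second and combines adjacent pairs with signed saturation,
// which is why the helper above zero-extends the LHS, sign-extends the RHS,
// and merges the two 16-bit products with sadd_sat.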
38576
38577static KnownBits computeKnownBitsForHorizontalOperation(
38578 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38579 const SelectionDAG &DAG,
38580 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38581 KnownBitsFunc) {
38582 APInt DemandedEltsLHS, DemandedEltsRHS;
38583 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38584 DemandedElts, DemandedEltsLHS,
38585 DemandedEltsRHS);
38586
38587 const auto ComputeForSingleOpFunc =
38588 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38589 return KnownBitsFunc(
38590 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38591 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38592 };
38593
38594 if (DemandedEltsRHS.isZero())
38595 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38596 if (DemandedEltsLHS.isZero())
38597 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38598
38599 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38600 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38601}
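// Horizontal add/sub ops interleave results from both inputs, so the helper
// above splits the demanded elements per input, queries each input at both
// positions of each adjacent pair (DemandedEltsOp and DemandedEltsOp << 1),
// and intersects the two results unless only one input is demanded.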
38602
38603void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38604 KnownBits &Known,
38605 const APInt &DemandedElts,
38606 const SelectionDAG &DAG,
38607 unsigned Depth) const {
38608 unsigned BitWidth = Known.getBitWidth();
38609 unsigned NumElts = DemandedElts.getBitWidth();
38610 unsigned Opc = Op.getOpcode();
38611 EVT VT = Op.getValueType();
38612 assert((Opc >= ISD::BUILTIN_OP_END ||
38613 Opc == ISD::INTRINSIC_WO_CHAIN ||
38614 Opc == ISD::INTRINSIC_W_CHAIN ||
38615 Opc == ISD::INTRINSIC_VOID) &&
38616 "Should use MaskedValueIsZero if you don't know whether Op"
38617 " is a target node!");
38618
38619 Known.resetAll();
38620 switch (Opc) {
38621 default: break;
38622 case X86ISD::MUL_IMM: {
38623 KnownBits Known2;
38624 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38625 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38626 Known = KnownBits::mul(Known, Known2);
38627 break;
38628 }
38629 case X86ISD::BSF: {
38631
38632 KnownBits Known2;
38633 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38634 if (Known2.isNonZero()) {
38635 // If we have a known 1, its position is our upper bound.
38636 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38637 unsigned LowBits = llvm::bit_width(PossibleTZ);
38638 Known.Zero.setBitsFrom(LowBits);
38639 } else if (!Op.getOperand(0).isUndef()) {
38640 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38641 Known = Known.intersectWith(Known2);
38642 }
38643 break;
38644 }
38645 case X86ISD::BSR: {
38646 // TODO: Bound with input known bits?
38648
38649 if (!Op.getOperand(0).isUndef() &&
38650 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38651 KnownBits Known2;
38652 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38653 Known = Known.intersectWith(Known2);
38654 }
38655 break;
38656 }
38657 case X86ISD::SETCC:
38658 Known.Zero.setBitsFrom(1);
38659 break;
38660 case X86ISD::MOVMSK: {
38661 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38662 Known.Zero.setBitsFrom(NumLoBits);
38663 break;
38664 }
38665 case X86ISD::PEXTRB:
38666 case X86ISD::PEXTRW: {
38667 SDValue Src = Op.getOperand(0);
38668 EVT SrcVT = Src.getValueType();
38669 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38670 Op.getConstantOperandVal(1));
38671 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38672 Known = Known.anyextOrTrunc(BitWidth);
38673 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38674 break;
38675 }
38676 case X86ISD::VSRAI:
38677 case X86ISD::VSHLI:
38678 case X86ISD::VSRLI: {
38679 unsigned ShAmt = Op.getConstantOperandVal(1);
38680 if (ShAmt >= VT.getScalarSizeInBits()) {
38681 // Out of range logical bit shifts are guaranteed to be zero.
38682 // Out of range arithmetic bit shifts splat the sign bit.
38683 if (Opc != X86ISD::VSRAI) {
38684 Known.setAllZero();
38685 break;
38686 }
38687
38688 ShAmt = VT.getScalarSizeInBits() - 1;
38689 }
38690
38691 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38692 if (Opc == X86ISD::VSHLI) {
38693 Known <<= ShAmt;
38694 // Low bits are known zero.
38695 Known.Zero.setLowBits(ShAmt);
38696 } else if (Opc == X86ISD::VSRLI) {
38697 Known >>= ShAmt;
38698 // High bits are known zero.
38699 Known.Zero.setHighBits(ShAmt);
38700 } else {
38701 Known.Zero.ashrInPlace(ShAmt);
38702 Known.One.ashrInPlace(ShAmt);
38703 }
38704 break;
38705 }
38706 case X86ISD::PACKUS: {
38707 // PACKUS is just a truncation if the upper half is zero.
38708 APInt DemandedLHS, DemandedRHS;
38709 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38710
38711 Known.One = APInt::getAllOnes(BitWidth * 2);
38712 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38713
38714 KnownBits Known2;
38715 if (!!DemandedLHS) {
38716 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38717 Known = Known.intersectWith(Known2);
38718 }
38719 if (!!DemandedRHS) {
38720 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38721 Known = Known.intersectWith(Known2);
38722 }
38723
38724 if (Known.countMinLeadingZeros() < BitWidth)
38725 Known.resetAll();
38726 Known = Known.trunc(BitWidth);
38727 break;
38728 }
38729 case X86ISD::PSHUFB: {
38730 SDValue Src = Op.getOperand(0);
38731 SDValue Idx = Op.getOperand(1);
38732
38733 // If the index vector is never negative (MSB is zero), then all elements
38734 // come from the source vector. This is useful for cases where
38735 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38736 // below will handle the more common constant shuffle mask case.
38737 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38738 if (KnownIdx.isNonNegative())
38739 Known = DAG.computeKnownBits(Src, Depth + 1);
38740 break;
38741 }
38742 case X86ISD::VBROADCAST: {
38743 SDValue Src = Op.getOperand(0);
38744 if (!Src.getSimpleValueType().isVector()) {
38745 Known = DAG.computeKnownBits(Src, Depth + 1);
38746 return;
38747 }
38748 break;
38749 }
38750 case X86ISD::AND: {
38751 if (Op.getResNo() == 0) {
38752 KnownBits Known2;
38753 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38754 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38755 Known &= Known2;
38756 }
38757 break;
38758 }
38759 case X86ISD::ANDNP: {
38760 KnownBits Known2;
38761 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38762 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38763
38764 // ANDNP = (~X & Y);
38765 Known.One &= Known2.Zero;
38766 Known.Zero |= Known2.One;
38767 break;
38768 }
38769 case X86ISD::FOR: {
38770 KnownBits Known2;
38771 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38772 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38773
38774 Known |= Known2;
38775 break;
38776 }
38777 case X86ISD::PSADBW: {
38778 SDValue LHS = Op.getOperand(0);
38779 SDValue RHS = Op.getOperand(1);
38780 assert(VT.getScalarType() == MVT::i64 &&
38781 LHS.getValueType() == RHS.getValueType() &&
38782 LHS.getValueType().getScalarType() == MVT::i8 &&
38783 "Unexpected PSADBW types");
38784 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38785 break;
38786 }
38787 case X86ISD::PCMPGT:
38788 case X86ISD::PCMPEQ: {
38789 KnownBits KnownLhs =
38790 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38791 KnownBits KnownRhs =
38792 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38793 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38794 ? KnownBits::eq(KnownLhs, KnownRhs)
38795 : KnownBits::sgt(KnownLhs, KnownRhs);
38796 if (Res) {
38797 if (*Res)
38798 Known.setAllOnes();
38799 else
38800 Known.setAllZero();
38801 }
38802 break;
38803 }
38804 case X86ISD::VPMADDWD: {
38805 SDValue LHS = Op.getOperand(0);
38806 SDValue RHS = Op.getOperand(1);
38807 assert(VT.getVectorElementType() == MVT::i32 &&
38808 LHS.getValueType() == RHS.getValueType() &&
38809 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38810 "Unexpected PMADDWD types");
38811 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38812 break;
38813 }
38814 case X86ISD::VPMADDUBSW: {
38815 SDValue LHS = Op.getOperand(0);
38816 SDValue RHS = Op.getOperand(1);
38817 assert(VT.getVectorElementType() == MVT::i16 &&
38818 LHS.getValueType() == RHS.getValueType() &&
38819 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38820 "Unexpected PMADDUBSW types");
38821 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38822 break;
38823 }
38824 case X86ISD::PMULUDQ: {
38825 KnownBits Known2;
38826 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38827 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38828
38829 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38830 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38831 Known = KnownBits::mul(Known, Known2);
38832 break;
38833 }
38834 case X86ISD::CMOV: {
38835 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38836 // If we don't know any bits, early out.
38837 if (Known.isUnknown())
38838 break;
38839 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38840
38841 // Only known if known in both the LHS and RHS.
38842 Known = Known.intersectWith(Known2);
38843 break;
38844 }
38845 case X86ISD::BEXTR:
38846 case X86ISD::BEXTRI: {
38847 SDValue Op0 = Op.getOperand(0);
38848 SDValue Op1 = Op.getOperand(1);
38849
38850 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38851 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38852 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38853
38854 // If the length is 0, the result is 0.
38855 if (Length == 0) {
38856 Known.setAllZero();
38857 break;
38858 }
38859
38860 if ((Shift + Length) <= BitWidth) {
38861 Known = DAG.computeKnownBits(Op0, Depth + 1);
38862 Known = Known.extractBits(Length, Shift);
38863 Known = Known.zextOrTrunc(BitWidth);
38864 }
38865 }
38866 break;
38867 }
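  // For instance, a BEXTR control operand of 0x0804 encodes Shift = 4 and
  // Length = 8, so the known bits of bits [4, 11] of the source are shifted
  // down and zero-extended to the full result width.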
38868 case X86ISD::PDEP: {
38869 KnownBits Known2;
38870 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38871 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38872 // Zeros are retained from the mask operand. But not ones.
38873 Known.One.clearAllBits();
38874 // The result will have at least as many trailing zeros as the non-mask
38875 // operand since bits can only map to the same or higher bit position.
38876 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38877 break;
38878 }
38879 case X86ISD::PEXT: {
38880 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38881 // The result has as many leading zeros as the number of zeroes in the mask.
38882 unsigned Count = Known.Zero.popcount();
38883 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38884 Known.One.clearAllBits();
38885 break;
38886 }
38887 case X86ISD::VTRUNC:
38888 case X86ISD::VTRUNCS:
38889 case X86ISD::VTRUNCUS:
38890 case X86ISD::CVTSI2P:
38891 case X86ISD::CVTUI2P:
38892 case X86ISD::CVTP2SI:
38893 case X86ISD::CVTP2UI:
38894 case X86ISD::MCVTP2SI:
38895 case X86ISD::MCVTP2UI:
38896 case X86ISD::CVTTP2SI:
38897 case X86ISD::CVTTP2UI:
38898 case X86ISD::MCVTTP2SI:
38899 case X86ISD::MCVTTP2UI:
38900 case X86ISD::MCVTSI2P:
38901 case X86ISD::MCVTUI2P:
38902 case X86ISD::VFPROUND:
38903 case X86ISD::VMFPROUND:
38904 case X86ISD::CVTPS2PH:
38905 case X86ISD::MCVTPS2PH:
38906 case X86ISD::MCVTTP2SIS:
38907 case X86ISD::MCVTTP2UIS: {
38908 // Truncations/Conversions - upper elements are known zero.
38909 EVT SrcVT = Op.getOperand(0).getValueType();
38910 if (SrcVT.isVector()) {
38911 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38912 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38913 Known.setAllZero();
38914 }
38915 break;
38916 }
38917 case X86ISD::STRICT_CVTTP2SI:
38918 case X86ISD::STRICT_CVTTP2UI:
38919 case X86ISD::STRICT_CVTSI2P:
38920 case X86ISD::STRICT_CVTUI2P:
38921 case X86ISD::STRICT_VFPROUND:
38922 case X86ISD::STRICT_CVTPS2PH: {
38923 // Strict Conversions - upper elements are known zero.
38924 EVT SrcVT = Op.getOperand(1).getValueType();
38925 if (SrcVT.isVector()) {
38926 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38927 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38928 Known.setAllZero();
38929 }
38930 break;
38931 }
38932 case X86ISD::MOVQ2DQ: {
38933 // Move from MMX to XMM. Upper half of XMM should be 0.
38934 if (DemandedElts.countr_zero() >= (NumElts / 2))
38935 Known.setAllZero();
38936 break;
38937 }
38938 case X86ISD::VBROADCAST_LOAD: {
38939 APInt UndefElts;
38940 SmallVector<APInt, 16> EltBits;
38941 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38942 /*AllowWholeUndefs*/ false,
38943 /*AllowPartialUndefs*/ false)) {
38944 Known.Zero.setAllBits();
38945 Known.One.setAllBits();
38946 for (unsigned I = 0; I != NumElts; ++I) {
38947 if (!DemandedElts[I])
38948 continue;
38949 if (UndefElts[I]) {
38950 Known.resetAll();
38951 break;
38952 }
38953 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38954 Known = Known.intersectWith(Known2);
38955 }
38956 return;
38957 }
38958 break;
38959 }
38960 case X86ISD::HADD:
38961 case X86ISD::HSUB: {
38962 Known = computeKnownBitsForHorizontalOperation(
38963 Op, DemandedElts, Depth, DAG,
38964 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38965 return KnownBits::computeForAddSub(
38966 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38967 KnownLHS, KnownRHS);
38968 });
38969 break;
38970 }
38971 case ISD::INTRINSIC_WO_CHAIN: {
38972 switch (Op->getConstantOperandVal(0)) {
38973 case Intrinsic::x86_sse2_pmadd_wd:
38974 case Intrinsic::x86_avx2_pmadd_wd:
38975 case Intrinsic::x86_avx512_pmaddw_d_512: {
38976 SDValue LHS = Op.getOperand(1);
38977 SDValue RHS = Op.getOperand(2);
38978 assert(VT.getScalarType() == MVT::i32 &&
38979 LHS.getValueType() == RHS.getValueType() &&
38980 LHS.getValueType().getScalarType() == MVT::i16 &&
38981 "Unexpected PMADDWD types");
38982 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38983 break;
38984 }
38985 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38986 case Intrinsic::x86_avx2_pmadd_ub_sw:
38987 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38988 SDValue LHS = Op.getOperand(1);
38989 SDValue RHS = Op.getOperand(2);
38990 assert(VT.getScalarType() == MVT::i16 &&
38991 LHS.getValueType() == RHS.getValueType() &&
38992 LHS.getValueType().getScalarType() == MVT::i8 &&
38993 "Unexpected PMADDUBSW types");
38994 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38995 break;
38996 }
38997 case Intrinsic::x86_sse2_psad_bw:
38998 case Intrinsic::x86_avx2_psad_bw:
38999 case Intrinsic::x86_avx512_psad_bw_512: {
39000 SDValue LHS = Op.getOperand(1);
39001 SDValue RHS = Op.getOperand(2);
39002 assert(VT.getScalarType() == MVT::i64 &&
39003 LHS.getValueType() == RHS.getValueType() &&
39004 LHS.getValueType().getScalarType() == MVT::i8 &&
39005 "Unexpected PSADBW types");
39006 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39007 break;
39008 }
39009 }
39010 break;
39011 }
39012 case X86ISD::VPMADD52L:
39013 case X86ISD::VPMADD52H: {
39014 assert(Op.getValueType().isVector() &&
39015 Op.getValueType().getScalarType() == MVT::i64 &&
39016 "Unexpected VPMADD52 type");
39017 KnownBits K0 =
39018 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39019 KnownBits K1 =
39020 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39021 KnownBits KAcc =
39022 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39023 K0 = K0.trunc(52);
39024 K1 = K1.trunc(52);
39025 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39026 ? KnownBits::mul(K0, K1)
39027 : KnownBits::mulhu(K0, K1);
39028 KnownMul = KnownMul.zext(64);
39029 Known = KnownBits::add(KAcc, KnownMul);
39030 return;
39031 }
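// Illustrative note (not part of the upstream source): VPMADD52L/H form the
// 104-bit product of the low 52 bits of each multiplicand and add its low
// (L) or high (H) 52 bits to the 64-bit accumulator. For example, if both
// multiplicands are known to be < 2^20, the product is < 2^40, so VPMADD52H
// adds a value whose bits [103:52] are all zero and the result keeps the
// accumulator's known bits.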
39032 }
39033
39034 // Handle target shuffles.
39035 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39036 if (isTargetShuffle(Opc)) {
39037 SmallVector<int, 64> Mask;
39038 SmallVector<SDValue, 2> Ops;
39039 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39040 unsigned NumOps = Ops.size();
39041 unsigned NumElts = VT.getVectorNumElements();
39042 if (Mask.size() == NumElts) {
39043 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39044 Known.Zero.setAllBits(); Known.One.setAllBits();
39045 for (unsigned i = 0; i != NumElts; ++i) {
39046 if (!DemandedElts[i])
39047 continue;
39048 int M = Mask[i];
39049 if (M == SM_SentinelUndef) {
39050 // For UNDEF elements, we don't know anything about the common state
39051 // of the shuffle result.
39052 Known.resetAll();
39053 break;
39054 }
39055 if (M == SM_SentinelZero) {
39056 Known.One.clearAllBits();
39057 continue;
39058 }
39059 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39060 "Shuffle index out of range");
39061
39062 unsigned OpIdx = (unsigned)M / NumElts;
39063 unsigned EltIdx = (unsigned)M % NumElts;
39064 if (Ops[OpIdx].getValueType() != VT) {
39065 // TODO - handle target shuffle ops with different value types.
39066 Known.resetAll();
39067 break;
39068 }
39069 DemandedOps[OpIdx].setBit(EltIdx);
39070 }
39071 // Known bits are the values that are shared by every demanded element.
39072 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39073 if (!DemandedOps[i])
39074 continue;
39075 KnownBits Known2 =
39076 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39077 Known = Known.intersectWith(Known2);
39078 }
39079 }
39080 }
39081 }
39082}
39083
39084 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39085 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39086 unsigned Depth) const {
39087 EVT VT = Op.getValueType();
39088 unsigned VTBits = VT.getScalarSizeInBits();
39089 unsigned Opcode = Op.getOpcode();
39090 switch (Opcode) {
39091 case X86ISD::SETCC_CARRY:
39092 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39093 return VTBits;
39094
39095 case X86ISD::VTRUNC: {
39096 SDValue Src = Op.getOperand(0);
39097 MVT SrcVT = Src.getSimpleValueType();
39098 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39099 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39100 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39101 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39102 if (Tmp > (NumSrcBits - VTBits))
39103 return Tmp - (NumSrcBits - VTBits);
39104 return 1;
39105 }
39106
39107 case X86ISD::PACKSS: {
39108 // PACKSS is just a truncation if the sign bits extend to the packed size.
39109 APInt DemandedLHS, DemandedRHS;
39110 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39111 DemandedRHS);
39112
39113 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39114 // patterns often used to compact vXi64 allsignbit patterns.
39115 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39116 SDValue BC = peekThroughBitcasts(V);
39117 if (BC.getOpcode() == X86ISD::PACKSS &&
39118 BC.getScalarValueSizeInBits() == 16 &&
39119 V.getScalarValueSizeInBits() == 32) {
39120 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39121 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39122 if (BC0.getScalarValueSizeInBits() == 64 &&
39123 BC1.getScalarValueSizeInBits() == 64 &&
39124 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39125 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39126 return 32;
39127 }
39128 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39129 };
39130
39131 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39132 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39133 if (!!DemandedLHS)
39134 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39135 if (!!DemandedRHS)
39136 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39137 unsigned Tmp = std::min(Tmp0, Tmp1);
39138 if (Tmp > (SrcBits - VTBits))
39139 return Tmp - (SrcBits - VTBits);
39140 return 1;
39141 }
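// Worked example (illustrative, not from the upstream source): PACKSS from
// vXi32 to vXi16 with sources that each have at least 20 sign bits: Tmp = 20
// exceeds SrcBits - VTBits = 32 - 16, so the packed result keeps
// 20 - 16 = 4 sign bits.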
39142
39143 case X86ISD::VBROADCAST: {
39144 SDValue Src = Op.getOperand(0);
39145 if (!Src.getSimpleValueType().isVector())
39146 return DAG.ComputeNumSignBits(Src, Depth + 1);
39147 break;
39148 }
39149
39150 case X86ISD::VSHLI: {
39151 SDValue Src = Op.getOperand(0);
39152 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39153 if (ShiftVal.uge(VTBits))
39154 return VTBits; // Shifted all bits out --> zero.
39155 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39156 if (ShiftVal.uge(Tmp))
39157 return 1; // Shifted all sign bits out --> unknown.
39158 return Tmp - ShiftVal.getZExtValue();
39159 }
39160
39161 case X86ISD::VSRAI: {
39162 SDValue Src = Op.getOperand(0);
39163 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39164 if (ShiftVal.uge(VTBits - 1))
39165 return VTBits; // Sign splat.
39166 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39167 ShiftVal += Tmp;
39168 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39169 }
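// Illustrative examples (not part of the upstream source): a VSHLI by 3 of a
// source with 10 known sign bits leaves 10 - 3 = 7 sign bits, while a VSRAI
// by 5 of the same source yields min(10 + 5, VTBits) sign bits, since the
// arithmetic shift replicates the sign bit.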
39170
39171 case X86ISD::FSETCC:
39172 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39173 if (VT == MVT::f32 || VT == MVT::f64 ||
39174 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39175 return VTBits;
39176 break;
39177
39178 case X86ISD::PCMPGT:
39179 case X86ISD::PCMPEQ:
39180 case X86ISD::CMPP:
39181 case X86ISD::VPCOM:
39182 case X86ISD::VPCOMU:
39183 // Vector compares return zero/all-bits result values.
39184 return VTBits;
39185
39186 case X86ISD::ANDNP: {
39187 unsigned Tmp0 =
39188 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39189 if (Tmp0 == 1) return 1; // Early out.
39190 unsigned Tmp1 =
39191 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39192 return std::min(Tmp0, Tmp1);
39193 }
39194
39195 case X86ISD::CMOV: {
39196 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39197 if (Tmp0 == 1) return 1; // Early out.
39198 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39199 return std::min(Tmp0, Tmp1);
39200 }
39201 }
39202
39203 // Handle target shuffles.
39204 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39205 if (isTargetShuffle(Opcode)) {
39206 SmallVector<int, 64> Mask;
39207 SmallVector<SDValue, 2> Ops;
39208 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39209 unsigned NumOps = Ops.size();
39210 unsigned NumElts = VT.getVectorNumElements();
39211 if (Mask.size() == NumElts) {
39212 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39213 for (unsigned i = 0; i != NumElts; ++i) {
39214 if (!DemandedElts[i])
39215 continue;
39216 int M = Mask[i];
39217 if (M == SM_SentinelUndef) {
39218 // For UNDEF elements, we don't know anything about the common state
39219 // of the shuffle result.
39220 return 1;
39221 } else if (M == SM_SentinelZero) {
39222 // Zero = all sign bits.
39223 continue;
39224 }
39225 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39226 "Shuffle index out of range");
39227
39228 unsigned OpIdx = (unsigned)M / NumElts;
39229 unsigned EltIdx = (unsigned)M % NumElts;
39230 if (Ops[OpIdx].getValueType() != VT) {
39231 // TODO - handle target shuffle ops with different value types.
39232 return 1;
39233 }
39234 DemandedOps[OpIdx].setBit(EltIdx);
39235 }
39236 unsigned Tmp0 = VTBits;
39237 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39238 if (!DemandedOps[i])
39239 continue;
39240 unsigned Tmp1 =
39241 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39242 Tmp0 = std::min(Tmp0, Tmp1);
39243 }
39244 return Tmp0;
39245 }
39246 }
39247 }
39248
39249 // Fallback case.
39250 return 1;
39251}
39252
39253 static SDValue unwrapAddress(SDValue N) {
39254 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39255 return N->getOperand(0);
39256 return N;
39257}
39258
39259// Helper to look for a normal load that can be narrowed into a vzload with the
39260// specified VT and memory VT. Returns SDValue() on failure.
39261 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39262 SelectionDAG &DAG) {
39263 // Can't if the load is volatile or atomic.
39264 if (!LN->isSimple())
39265 return SDValue();
39266
39267 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39268 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39269 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39270 LN->getPointerInfo(), LN->getBaseAlign(),
39271 LN->getMemOperand()->getFlags());
39272}
39273
39274// Attempt to match a combined shuffle mask against supported unary shuffle
39275// instructions.
39276// TODO: Investigate sharing more of this with shuffle lowering.
39277static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39278 bool AllowFloatDomain, bool AllowIntDomain,
39279 SDValue V1, const SelectionDAG &DAG,
39280 const X86Subtarget &Subtarget, unsigned &Shuffle,
39281 MVT &SrcVT, MVT &DstVT) {
39282 unsigned NumMaskElts = Mask.size();
39283 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39284
39285 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39286 if (Mask[0] == 0 &&
39287 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39288 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39289 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39290 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39291 Shuffle = X86ISD::VZEXT_MOVL;
39292 if (MaskEltSize == 16)
39293 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39294 else
39295 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39296 return true;
39297 }
39298 }
39299
39300 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39301 if (AllowIntDomain &&
39302 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39303 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39304 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39305 unsigned MaxScale = 64 / MaskEltSize;
39306 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39307 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39308 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39309 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39310 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39311 continue;
39312 bool MatchAny = true;
39313 bool MatchZero = true;
39314 bool MatchSign = UseSign;
39315 unsigned NumDstElts = NumMaskElts / Scale;
39316 for (unsigned i = 0;
39317 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39318 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39319 MatchAny = MatchSign = MatchZero = false;
39320 break;
39321 }
39322 unsigned Pos = (i * Scale) + 1;
39323 unsigned Len = Scale - 1;
39324 MatchAny &= isUndefInRange(Mask, Pos, Len);
39325 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39326 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39327 }
39328 if (MatchAny || MatchSign || MatchZero) {
39329 assert((MatchSign || MatchZero) &&
39330 "Failed to match sext/zext but matched aext?");
39331 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39332 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39333 : MVT::getIntegerVT(MaskEltSize);
39334 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39335
39336 Shuffle = unsigned(
39337 MatchAny ? ISD::ANY_EXTEND
39338 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39339 if (SrcVT.getVectorNumElements() != NumDstElts)
39340 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39341
39342 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39343 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39344 return true;
39345 }
39346 }
39347 }
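// Worked example (illustrative, not from the upstream source): for a v8i16
// mask {0, Z, Z, Z, 1, Z, Z, Z} (Z = zero) the loop matches Scale = 4, i.e.
// NumDstElts = 2, producing a ZERO_EXTEND_VECTOR_INREG from v8i16 to v2i64
// (SrcVT keeps the full 128 bits, DstVT widens each kept element to i64).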
39348
39349 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
39350 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39351 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39352 isUndefOrEqual(Mask[0], 0) &&
39353 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39354 Shuffle = X86ISD::VZEXT_MOVL;
39355 if (MaskEltSize == 16)
39356 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39357 else
39358 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39359 return true;
39360 }
39361
39362 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39363 // instructions are no slower than UNPCKLPD but have the option to
39364 // fold the input operand even when it is an unaligned memory load.
39365 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39366 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39367 Shuffle = X86ISD::MOVDDUP;
39368 SrcVT = DstVT = MVT::v2f64;
39369 return true;
39370 }
39371 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39372 Shuffle = X86ISD::MOVSLDUP;
39373 SrcVT = DstVT = MVT::v4f32;
39374 return true;
39375 }
39376 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39377 Shuffle = X86ISD::MOVSHDUP;
39378 SrcVT = DstVT = MVT::v4f32;
39379 return true;
39380 }
39381 }
39382
39383 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39384 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39385 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39386 Shuffle = X86ISD::MOVDDUP;
39387 SrcVT = DstVT = MVT::v4f64;
39388 return true;
39389 }
39390 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39391 V1)) {
39392 Shuffle = X86ISD::MOVSLDUP;
39393 SrcVT = DstVT = MVT::v8f32;
39394 return true;
39395 }
39396 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39397 V1)) {
39398 Shuffle = X86ISD::MOVSHDUP;
39399 SrcVT = DstVT = MVT::v8f32;
39400 return true;
39401 }
39402 }
39403
39404 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39405 assert(Subtarget.hasAVX512() &&
39406 "AVX512 required for 512-bit vector shuffles");
39407 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39408 V1)) {
39409 Shuffle = X86ISD::MOVDDUP;
39410 SrcVT = DstVT = MVT::v8f64;
39411 return true;
39412 }
39413 if (isTargetShuffleEquivalent(
39414 MaskVT, Mask,
39415 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39416 Shuffle = X86ISD::MOVSLDUP;
39417 SrcVT = DstVT = MVT::v16f32;
39418 return true;
39419 }
39420 if (isTargetShuffleEquivalent(
39421 MaskVT, Mask,
39422 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39423 Shuffle = X86ISD::MOVSHDUP;
39424 SrcVT = DstVT = MVT::v16f32;
39425 return true;
39426 }
39427 }
39428
39429 return false;
39430}
39431
39432// Attempt to match a combined shuffle mask against supported unary immediate
39433// permute instructions.
39434// TODO: Investigate sharing more of this with shuffle lowering.
39435 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39436 const APInt &Zeroable,
39437 bool AllowFloatDomain, bool AllowIntDomain,
39438 const SelectionDAG &DAG,
39439 const X86Subtarget &Subtarget,
39440 unsigned &Shuffle, MVT &ShuffleVT,
39441 unsigned &PermuteImm) {
39442 unsigned NumMaskElts = Mask.size();
39443 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39444 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39445 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39446 bool ContainsZeros = isAnyZero(Mask);
39447
39448 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39449 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39450 // Check for lane crossing permutes.
39451 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39452 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39453 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39454 Shuffle = X86ISD::VPERMI;
39455 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39456 PermuteImm = getV4X86ShuffleImm(Mask);
39457 return true;
39458 }
39459 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39460 SmallVector<int, 4> RepeatedMask;
39461 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39462 Shuffle = X86ISD::VPERMI;
39463 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39464 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39465 return true;
39466 }
39467 }
39468 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39469 // VPERMILPD can permute with a non-repeating shuffle.
39470 Shuffle = X86ISD::VPERMILPI;
39471 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39472 PermuteImm = 0;
39473 for (int i = 0, e = Mask.size(); i != e; ++i) {
39474 int M = Mask[i];
39475 if (M == SM_SentinelUndef)
39476 continue;
39477 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39478 PermuteImm |= (M & 1) << i;
39479 }
39480 return true;
39481 }
39482 }
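// Worked example (illustrative, not from the upstream source): for the
// in-lane v4f64 mask {1, 0, 3, 2}, bit i of the VPERMILPD immediate selects
// the odd (1) or even (0) element within element i's 128-bit lane, giving
// PermuteImm = 0b0101 = 5.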
39483
39484 // We check for both a shuffle match and a shift match. Loop twice so we
39485 // can order which one we try to match first, depending on target preference.
39486 for (unsigned Order = 0; Order < 2; ++Order) {
39487 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39488 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39489 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
39490 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39491 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39492 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39493 SmallVector<int, 4> RepeatedMask;
39494 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39495 // Narrow the repeated mask to create 32-bit element permutes.
39496 SmallVector<int, 4> WordMask = RepeatedMask;
39497 if (MaskScalarSizeInBits == 64)
39498 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39499
39500 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39501 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39502 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39503 PermuteImm = getV4X86ShuffleImm(WordMask);
39504 return true;
39505 }
39506 }
39507
39508 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39509 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39510 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39511 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39512 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39513 SmallVector<int, 4> RepeatedMask;
39514 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39515 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39516 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39517
39518 // PSHUFLW: permute lower 4 elements only.
39519 if (isUndefOrInRange(LoMask, 0, 4) &&
39520 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39521 Shuffle = X86ISD::PSHUFLW;
39522 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39523 PermuteImm = getV4X86ShuffleImm(LoMask);
39524 return true;
39525 }
39526
39527 // PSHUFHW: permute upper 4 elements only.
39528 if (isUndefOrInRange(HiMask, 4, 8) &&
39529 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39530 // Offset the HiMask so that we can create the shuffle immediate.
39531 int OffsetHiMask[4];
39532 for (int i = 0; i != 4; ++i)
39533 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39534
39535 Shuffle = X86ISD::PSHUFHW;
39536 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39537 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39538 return true;
39539 }
39540 }
39541 }
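// Worked example (illustrative, not from the upstream source): for the
// repeated v8i16 mask {0,1,2,3,6,5,4,7} the low half is the identity, so
// PSHUFHW is matched with OffsetHiMask = {2,1,0,3} and
// PermuteImm = 2 | (1 << 2) | (0 << 4) | (3 << 6) = 0xC6.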
39542 } else {
39543 // Attempt to match against bit rotates.
39544 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39545 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39546 Subtarget.hasAVX512())) {
39547 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39548 Subtarget, Mask);
39549 if (0 < RotateAmt) {
39550 Shuffle = X86ISD::VROTLI;
39551 PermuteImm = (unsigned)RotateAmt;
39552 return true;
39553 }
39554 }
39555 }
39556 // Attempt to match against byte/bit shifts.
39557 if (AllowIntDomain &&
39558 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39559 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39560 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39561 int ShiftAmt =
39562 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39563 Zeroable, Subtarget);
39564 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39565 32 <= ShuffleVT.getScalarSizeInBits())) {
39566 // Byte shifts can be slower so only match them on second attempt.
39567 if (Order == 0 &&
39568 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39569 continue;
39570
39571 PermuteImm = (unsigned)ShiftAmt;
39572 return true;
39573 }
39574
39575 }
39576 }
39577
39578 return false;
39579}
39580
39581// Attempt to match a combined unary shuffle mask against supported binary
39582// shuffle instructions.
39583// TODO: Investigate sharing more of this with shuffle lowering.
39584static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39585 bool AllowFloatDomain, bool AllowIntDomain,
39586 SDValue &V1, SDValue &V2, const SDLoc &DL,
39587 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39588 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39589 bool IsUnary) {
39590 unsigned NumMaskElts = Mask.size();
39591 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39592 unsigned SizeInBits = MaskVT.getSizeInBits();
39593
39594 if (MaskVT.is128BitVector()) {
39595 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39596 AllowFloatDomain) {
39597 V2 = V1;
39598 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39599 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39600 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39601 return true;
39602 }
39603 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39604 AllowFloatDomain) {
39605 V2 = V1;
39606 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39607 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39608 return true;
39609 }
39610 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39611 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39612 std::swap(V1, V2);
39613 Shuffle = X86ISD::MOVSD;
39614 SrcVT = DstVT = MVT::v2f64;
39615 return true;
39616 }
39617 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39618 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39619 Shuffle = X86ISD::MOVSS;
39620 SrcVT = DstVT = MVT::v4f32;
39621 return true;
39622 }
39623 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39624 DAG) &&
39625 Subtarget.hasFP16()) {
39626 Shuffle = X86ISD::MOVSH;
39627 SrcVT = DstVT = MVT::v8f16;
39628 return true;
39629 }
39630 }
39631
39632 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39633 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39634 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39635 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39636 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39637 Subtarget)) {
39638 DstVT = MaskVT;
39639 return true;
39640 }
39641 }
39642 // TODO: Can we handle this inside matchShuffleWithPACK?
39643 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39644 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39645 V1.getScalarValueSizeInBits() == 64 &&
39646 V2.getScalarValueSizeInBits() == 64) {
39647 // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
39648 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39649 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39650 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39651 SrcVT = MVT::v4i32;
39652 DstVT = MVT::v8i16;
39653 Shuffle = X86ISD::PACKUS;
39654 return true;
39655 }
39656 // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
39657 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39658 SrcVT = MVT::v8i16;
39659 DstVT = MVT::v16i8;
39660 Shuffle = X86ISD::PACKUS;
39661 return true;
39662 }
39663 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39664 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39665 SrcVT = MVT::v4i32;
39666 DstVT = MVT::v8i16;
39667 Shuffle = X86ISD::PACKSS;
39668 return true;
39669 }
39670 }
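// Illustrative note (not part of the upstream source): for the {0,2,4,6}
// v4i32 mask above, PACKUSDW(V1, V2) works because every 64-bit source
// element is known to fit in its low 16 bits, so the saturating pack returns
// those values unchanged while zeroes land in the odd 16-bit positions,
// which is exactly the requested "take the low dword of every qword" shuffle.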
39671
39672 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39673 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39674 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39675 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39676 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39677 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39678 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39679 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39680 Subtarget)) {
39681 SrcVT = DstVT = MaskVT;
39682 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39683 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39684 return true;
39685 }
39686 }
39687
39688 // Attempt to match against an OR if we're performing a blend shuffle and the
39689 // non-blended source element is zero in each case.
39690 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39691 if (SizeInBits == V1.getValueSizeInBits() &&
39692 SizeInBits == V2.getValueSizeInBits() &&
39693 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39694 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39695 bool IsBlend = true;
39696 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39697 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39698 unsigned Scale1 = NumV1Elts / NumMaskElts;
39699 unsigned Scale2 = NumV2Elts / NumMaskElts;
39700 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39701 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39702 for (unsigned i = 0; i != NumMaskElts; ++i) {
39703 int M = Mask[i];
39704 if (M == SM_SentinelUndef)
39705 continue;
39706 if (M == SM_SentinelZero) {
39707 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39708 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39709 continue;
39710 }
39711 if (M == (int)i) {
39712 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39713 continue;
39714 }
39715 if (M == (int)(i + NumMaskElts)) {
39716 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39717 continue;
39718 }
39719 IsBlend = false;
39720 break;
39721 }
39722 if (IsBlend) {
39723 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39724 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39725 Shuffle = ISD::OR;
39726 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39727 return true;
39728 }
39729 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39730 // FIXME: handle mismatched sizes?
39731 // TODO: investigate if `ISD::OR` handling in
39732 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39733 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39734 unsigned NumElts = V.getValueType().getVectorNumElements();
39735 KnownBits Known(NumElts);
39736 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39737 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39738 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39739 if (PeepholeKnown.isZero())
39740 Known.Zero.setBit(EltIdx);
39741 if (PeepholeKnown.isAllOnes())
39742 Known.One.setBit(EltIdx);
39743 }
39744 return Known;
39745 };
39746
39747 KnownBits V1Known = computeKnownBitsElementWise(V1);
39748 KnownBits V2Known = computeKnownBitsElementWise(V2);
39749
39750 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39751 int M = Mask[i];
39752 if (M == SM_SentinelUndef)
39753 continue;
39754 if (M == SM_SentinelZero) {
39755 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39756 continue;
39757 }
39758 if (M == (int)i) {
39759 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39760 continue;
39761 }
39762 if (M == (int)(i + NumMaskElts)) {
39763 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39764 continue;
39765 }
39766 llvm_unreachable("will not get here.");
39767 }
39768 if (IsBlend) {
39769 Shuffle = ISD::OR;
39770 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39771 return true;
39772 }
39773 }
39774 }
39775 }
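// Worked example (illustrative, not from the upstream source): a v4i32 blend
// mask {0, 5, 2, 7} takes V1 in lanes 0,2 and V2 in lanes 1,3; if V2 is known
// zero in lanes 0,2 and V1 is known zero in lanes 1,3 (or the selected lane
// is known all-ones), OR(V1, V2) computes the same result as the blend.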
39776
39777 return false;
39778}
39779
39780 static bool matchBinaryPermuteShuffle(
39781 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39782 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39783 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39784 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39785 unsigned NumMaskElts = Mask.size();
39786 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39787
39788 // Attempt to match against VALIGND/VALIGNQ rotate.
39789 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39790 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39791 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39792 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39793 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39794 MaskVT.getSizeInBits() / EltSizeInBits);
39795 if (!isAnyZero(Mask)) {
39796 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39797 if (0 < Rotation) {
39798 Shuffle = X86ISD::VALIGN;
39799 ShuffleVT = AlignVT;
39800 PermuteImm = Rotation;
39801 return true;
39802 }
39803 }
39804 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39805 unsigned ZeroLo = Zeroable.countr_one();
39806 unsigned ZeroHi = Zeroable.countl_one();
39807 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39808 if (ZeroLo) {
39809 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39810 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39811 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39812 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39813 Shuffle = X86ISD::VALIGN;
39814 ShuffleVT = AlignVT;
39815 PermuteImm = NumMaskElts - ZeroLo;
39816 return true;
39817 }
39818 }
39819 if (ZeroHi) {
39820 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39821 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39822 ZeroHi);
39823 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39824 V2 = V1;
39825 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39826 Shuffle = X86ISD::VALIGN;
39827 ShuffleVT = AlignVT;
39828 PermuteImm = ZeroHi;
39829 return true;
39830 }
39831 }
39832 }
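// Worked example (illustrative, not from the upstream source): with 8
// elements and the low three lanes zeroable, the mask {Z,Z,Z,0,1,2,3,4}
// (Z = zero) is matched as VALIGN(V1, zero-vector) with an element rotation
// of NumMaskElts - ZeroLo = 5.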
39833
39834 // Attempt to match against PALIGNR byte rotate.
39835 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39836 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39837 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39838 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39839 if (0 < ByteRotation) {
39840 Shuffle = X86ISD::PALIGNR;
39841 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39842 PermuteImm = ByteRotation;
39843 return true;
39844 }
39845 }
39846
39847 // Attempt to combine to X86ISD::BLENDI.
39848 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39849 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39850 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39851 uint64_t BlendMask = 0;
39852 bool ForceV1Zero = false, ForceV2Zero = false;
39853 SmallVector<int, 8> TargetMask(Mask);
39854 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39855 ForceV2Zero, BlendMask)) {
39856 if (MaskVT == MVT::v16i16) {
39857 // We can only use v16i16 PBLENDW if the lanes are repeated.
39858 SmallVector<int, 8> RepeatedMask;
39859 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39860 RepeatedMask)) {
39861 assert(RepeatedMask.size() == 8 &&
39862 "Repeated mask size doesn't match!");
39863 PermuteImm = 0;
39864 for (int i = 0; i < 8; ++i)
39865 if (RepeatedMask[i] >= 8)
39866 PermuteImm |= 1 << i;
39867 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39868 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39869 Shuffle = X86ISD::BLENDI;
39870 ShuffleVT = MaskVT;
39871 return true;
39872 }
39873 } else {
39874 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39875 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39876 PermuteImm = (unsigned)BlendMask;
39877 Shuffle = X86ISD::BLENDI;
39878 ShuffleVT = MaskVT;
39879 return true;
39880 }
39881 }
39882 }
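// Worked example (illustrative, not from the upstream source): a v16i16
// blend whose 128-bit lanes repeat the mask {0,9,2,11,4,13,6,15} takes V2 in
// the odd positions, so PermuteImm = 0b10101010 = 0xAA for the PBLENDW
// immediate.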
39883
39884 // Attempt to combine to INSERTPS, but only if it has elements that need to
39885 // be set to zero.
39886 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39887 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39888 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39889 Shuffle = X86ISD::INSERTPS;
39890 ShuffleVT = MVT::v4f32;
39891 return true;
39892 }
39893
39894 // Attempt to combine to SHUFPD.
39895 if (AllowFloatDomain && EltSizeInBits == 64 &&
39896 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39897 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39898 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39899 bool ForceV1Zero = false, ForceV2Zero = false;
39900 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39901 PermuteImm, Mask, Zeroable)) {
39902 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39903 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39904 Shuffle = X86ISD::SHUFP;
39905 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39906 return true;
39907 }
39908 }
39909
39910 // Attempt to combine to SHUFPS.
39911 if (AllowFloatDomain && EltSizeInBits == 32 &&
39912 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39913 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39914 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39915 SmallVector<int, 4> RepeatedMask;
39916 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39917 // Match each half of the repeated mask to determine whether it's just
39918 // referencing one of the vectors, is zeroable, or is entirely undef.
39919 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39920 int M0 = RepeatedMask[Offset];
39921 int M1 = RepeatedMask[Offset + 1];
39922
39923 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39924 return DAG.getUNDEF(MaskVT);
39925 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39926 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39927 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39928 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39929 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39930 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39931 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39932 return V1;
39933 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39934 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39935 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39936 return V2;
39937 }
39938
39939 return SDValue();
39940 };
39941
39942 int ShufMask[4] = {-1, -1, -1, -1};
39943 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39944 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39945
39946 if (Lo && Hi) {
39947 V1 = Lo;
39948 V2 = Hi;
39949 Shuffle = X86ISD::SHUFP;
39950 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39951 PermuteImm = getV4X86ShuffleImm(ShufMask);
39952 return true;
39953 }
39954 }
39955 }
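// Worked example (illustrative, not from the upstream source): the repeated
// mask {1,3,4,6} resolves Lo = V1 with selectors {1,3} and Hi = V2 with
// selectors {0,2}, giving SHUFPS(V1, V2) with
// PermuteImm = 1 | (3 << 2) | (0 << 4) | (2 << 6) = 0x8D.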
39956
39957 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39958 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39959 MaskVT.is128BitVector() &&
39960 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39961 Shuffle = X86ISD::INSERTPS;
39962 ShuffleVT = MVT::v4f32;
39963 return true;
39964 }
39965
39966 return false;
39967}
39968
39969 static SDValue combineX86ShuffleChainWithExtract(
39970 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39971 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39972 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39973 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39974 const X86Subtarget &Subtarget);
39975
39976/// Combine an arbitrary chain of shuffles into a single instruction if
39977/// possible.
39978///
39979/// This is the leaf of the recursive combine below. When we have found some
39980/// chain of single-use x86 shuffle instructions and accumulated the combined
39981/// shuffle mask represented by them, this will try to pattern match that mask
39982/// into either a single instruction if there is a special purpose instruction
39983/// for this operation, or into a PSHUFB instruction which is a fully general
39984/// instruction but should only be used to replace chains over a certain depth.
39985 static SDValue combineX86ShuffleChain(
39986 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39987 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39988 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39989 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39990 const X86Subtarget &Subtarget) {
39991 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39992 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39993 "Unexpected number of shuffle inputs!");
39994 unsigned RootSizeInBits = RootVT.getSizeInBits();
39995 unsigned NumRootElts = RootVT.getVectorNumElements();
39996
39997 // Canonicalize shuffle input op to the requested type.
39998 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39999 if (VT.getSizeInBits() > Op.getValueSizeInBits())
40000 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
40001 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40002 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40003 return DAG.getBitcast(VT, Op);
40004 };
40005
40006 // Find the inputs that enter the chain. Note that multiple uses are OK
40007 // here; we're not going to remove the operands we find.
40008 bool UnaryShuffle = (Inputs.size() == 1);
40009 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40010 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40011 : peekThroughBitcasts(Inputs[1]));
40012
40013 MVT VT1 = V1.getSimpleValueType();
40014 MVT VT2 = V2.getSimpleValueType();
40015 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40016 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40017
40018 SDValue Res;
40019
40020 unsigned NumBaseMaskElts = BaseMask.size();
40021 if (NumBaseMaskElts == 1) {
40022 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40023 return CanonicalizeShuffleInput(RootVT, V1);
40024 }
40025
40026 bool OptForSize = DAG.shouldOptForSize();
40027 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40028 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40029 (RootVT.isFloatingPoint() && Depth >= 1) ||
40030 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40031
40032 // If we are shuffling a splat (and not introducing zeros) then we can just
40033 // use it directly. This works for smaller elements as well, since they
40034 // already repeat across each mask element.
40035 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40036 V1.getValueSizeInBits() >= RootSizeInBits &&
40037 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40038 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40039 return CanonicalizeShuffleInput(RootVT, V1);
40040 }
40041
40042 SmallVector<int, 64> Mask(BaseMask);
40043
40044 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40045 // etc. can be simplified.
40046 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40047 SmallVector<int> ScaledMask, IdentityMask;
40048 unsigned NumElts = VT1.getVectorNumElements();
40049 if (Mask.size() <= NumElts &&
40050 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40051 for (unsigned i = 0; i != NumElts; ++i)
40052 IdentityMask.push_back(i);
40053 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40054 V2))
40055 return CanonicalizeShuffleInput(RootVT, V1);
40056 }
40057 }
40058
40059 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40060 if (RootVT.is512BitVector() &&
40061 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40062 // If the upper subvectors are zeroable, then an extract+insert is more
40063 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40064 // to zero the upper subvectors.
40065 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40066 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40067 return SDValue(); // Nothing to do!
40068 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40069 "Unexpected lane shuffle");
40070 Res = CanonicalizeShuffleInput(RootVT, V1);
40071 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40072 bool UseZero = isAnyZero(Mask);
40073 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40074 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40075 }
40076
40077 // Narrow shuffle mask to v4x128.
40078 SmallVector<int, 4> ScaledMask;
40079 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40080 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40081
40082 // Try to lower to vshuf64x2/vshuf32x4.
40083 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40084 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40085 SelectionDAG &DAG) {
40086 int PermMask[4] = {-1, -1, -1, -1};
40087 // Ensure elements came from the same Op.
40088 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40089 for (int i = 0; i < 4; ++i) {
40090 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40091 if (ScaledMask[i] < 0)
40092 continue;
40093
40094 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40095 unsigned OpIndex = i / 2;
40096 if (Ops[OpIndex].isUndef())
40097 Ops[OpIndex] = Op;
40098 else if (Ops[OpIndex] != Op)
40099 return SDValue();
40100
40101 PermMask[i] = ScaledMask[i] % 4;
40102 }
40103
40104 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40105 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40106 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40107 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40108 };
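// Worked example (illustrative, not from the upstream source): the 128-bit
// lane mask {0,1,4,5} selects both lanes of V1 followed by both lanes of V2,
// so MatchSHUF128 produces SHUF128(V1, V2) with PermMask {0,1,0,1}, i.e.
// immediate 0x44.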
40109
40110 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40111 // doesn't work because our mask is for 128 bits and we don't have an MVT
40112 // to match that.
40113 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40114 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40115 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40116 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40117 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40118 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40119 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40120 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40121 ScaledMask[1] == (ScaledMask[3] % 2));
40122
40123 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40124 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40125 return SDValue(); // Nothing to do!
40126 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40127 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40128 return DAG.getBitcast(RootVT, V);
40129 }
40130 }
40131
40132 // Handle 128-bit lane shuffles of 256-bit vectors.
40133 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40134 // If the upper half is zeroable, then an extract+insert is more optimal
40135 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40136 // zero the upper half.
40137 if (isUndefOrZero(Mask[1])) {
40138 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40139 return SDValue(); // Nothing to do!
40140 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40141 Res = CanonicalizeShuffleInput(RootVT, V1);
40142 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40143 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40144 256);
40145 }
40146
40147 // If we're inserting the low subvector, an insert-subvector 'concat'
40148 // pattern is quicker than VPERM2X128.
40149 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40150 !Subtarget.hasAVX2()) {
40151 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40152 return SDValue(); // Nothing to do!
40153 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40154 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40155 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40156 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40157 }
40158
40159 // Don't lower to VPERM2X128 here if we have AVX2+; prefer to use
40160 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40161 // feature.
40162 // Prefer blends for sequential shuffles unless we are optimizing for size.
40163 if (UnaryShuffle &&
40164 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40165 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40166 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40167 return SDValue(); // Nothing to do!
40168 unsigned PermMask = 0;
40169 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40170 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40171 return DAG.getNode(
40172 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40173 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40174 }
40175
40176 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40177 return SDValue(); // Nothing to do!
40178
40179 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40180 if (!UnaryShuffle && !IsMaskedShuffle) {
40181 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40182 "Unexpected shuffle sentinel value");
40183 // Prefer blends to X86ISD::VPERM2X128.
40184 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40185 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40186 return SDValue(); // Nothing to do!
40187 unsigned PermMask = 0;
40188 PermMask |= ((Mask[0] & 3) << 0);
40189 PermMask |= ((Mask[1] & 3) << 4);
40190 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40191 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40192 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40193 CanonicalizeShuffleInput(RootVT, LHS),
40194 CanonicalizeShuffleInput(RootVT, RHS),
40195 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40196 }
40197 }
40198 }
40199
40200 // For masks that have been widened to 128-bit elements or more,
40201 // narrow back down to 64-bit elements.
40202 if (BaseMaskEltSizeInBits > 64) {
40203 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40204 int MaskScale = BaseMaskEltSizeInBits / 64;
40205 SmallVector<int, 64> ScaledMask;
40206 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40207 Mask = std::move(ScaledMask);
40208 }
40209
40210 // For masked shuffles, we're trying to match the root width for better
40211 // writemask folding, so attempt to scale the mask.
40212 // TODO - variable shuffles might need this to be widened again.
40213 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40214 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40215 int MaskScale = NumRootElts / Mask.size();
40216 SmallVector<int, 64> ScaledMask;
40217 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40218 Mask = std::move(ScaledMask);
40219 }
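// Illustrative note (not part of the upstream source): narrowShuffleMaskElts
// splits each mask element into MaskScale consecutive sub-elements, e.g.
// scaling {1, 0} by 2 yields {2, 3, 0, 1}.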
40220
40221 unsigned NumMaskElts = Mask.size();
40222 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40224
40225 // Determine the effective mask value type.
40226 FloatDomain &= (32 <= MaskEltSizeInBits);
40227 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40228 : MVT::getIntegerVT(MaskEltSizeInBits);
40229 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40230
40231 // Only allow legal mask types.
40232 if (!TLI.isTypeLegal(MaskVT))
40233 return SDValue();
40234
40235 // Attempt to match the mask against known shuffle patterns.
40236 MVT ShuffleSrcVT, ShuffleVT;
40237 unsigned Shuffle, PermuteImm;
40238
40239 // Which shuffle domains are permitted?
40240 // Permit domain crossing at higher combine depths.
40241 // TODO: Should we indicate which domain is preferred if both are allowed?
40242 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40243 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40244 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40245
40246 // Determine zeroable mask elements.
40247 APInt KnownUndef, KnownZero;
40248 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40249 APInt Zeroable = KnownUndef | KnownZero;
40250
40251 if (UnaryShuffle) {
40252 // Attempt to match against broadcast-from-vector.
40253 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40254 if ((Subtarget.hasAVX2() ||
40255 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40256 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40257 if (isUndefOrEqual(Mask, 0)) {
40258 if (V1.getValueType() == MaskVT &&
40259 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40260 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40261 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40262 return SDValue(); // Nothing to do!
40263 Res = V1.getOperand(0);
40264 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40265 return DAG.getBitcast(RootVT, Res);
40266 }
40267 if (Subtarget.hasAVX2()) {
40268 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40269 return SDValue(); // Nothing to do!
40270 Res = CanonicalizeShuffleInput(MaskVT, V1);
40271 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40272 return DAG.getBitcast(RootVT, Res);
40273 }
40274 }
40275 }
40276
40277 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40278 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40279 (!IsMaskedShuffle ||
40280 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40281 if (Depth == 0 && RootOpc == Shuffle)
40282 return SDValue(); // Nothing to do!
40283 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40284 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40285 return DAG.getBitcast(RootVT, Res);
40286 }
40287
40288 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40289 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40290 PermuteImm) &&
40291 (!IsMaskedShuffle ||
40292 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40293 if (Depth == 0 && RootOpc == Shuffle)
40294 return SDValue(); // Nothing to do!
40295 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40296 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40297 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40298 return DAG.getBitcast(RootVT, Res);
40299 }
40300 }
40301
40302 // Attempt to combine to INSERTPS, but only if the inserted element has come
40303 // from a scalar.
40304 // TODO: Handle other insertions here as well?
40305 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40306 Subtarget.hasSSE41() &&
40307 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40308 if (MaskEltSizeInBits == 32) {
40309 SDValue SrcV1 = V1, SrcV2 = V2;
40310 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40311 DAG) &&
40312 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40313 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40314 return SDValue(); // Nothing to do!
40315 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40316 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40317 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40318 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40319 return DAG.getBitcast(RootVT, Res);
40320 }
40321 }
40322 if (MaskEltSizeInBits == 64 &&
40323 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40324 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40325 V2.getScalarValueSizeInBits() <= 32) {
40326 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40327 return SDValue(); // Nothing to do!
40328 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40329 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40330 CanonicalizeShuffleInput(MVT::v4f32, V1),
40331 CanonicalizeShuffleInput(MVT::v4f32, V2),
40332 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40333 return DAG.getBitcast(RootVT, Res);
40334 }
40335 }
40336
40337 SDValue NewV1 = V1; // Save operands in case early exit happens.
40338 SDValue NewV2 = V2;
40339 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40340 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40341 ShuffleVT, UnaryShuffle) &&
40342 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40343 if (Depth == 0 && RootOpc == Shuffle)
40344 return SDValue(); // Nothing to do!
40345 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40346 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40347 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40348 return DAG.getBitcast(RootVT, Res);
40349 }
40350
40351 NewV1 = V1; // Save operands in case early exit happens.
40352 NewV2 = V2;
40353 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40354 AllowIntDomain, NewV1, NewV2, DL, DAG,
40355 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40356 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40357 if (Depth == 0 && RootOpc == Shuffle)
40358 return SDValue(); // Nothing to do!
40359 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40360 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40361 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40362 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40363 return DAG.getBitcast(RootVT, Res);
40364 }
40365
40366 // Typically from here on, we need an integer version of MaskVT.
40367 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40368 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40369
40370 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40371 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40372 uint64_t BitLen, BitIdx;
40373 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40374 Zeroable)) {
40375 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40376 return SDValue(); // Nothing to do!
40377 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40378 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40379 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40380 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40381 return DAG.getBitcast(RootVT, Res);
40382 }
40383
40384 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40385 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40386 return SDValue(); // Nothing to do!
40387 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40388 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40389 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40390 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40391 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40392 return DAG.getBitcast(RootVT, Res);
40393 }
40394 }
40395
40396 // Match shuffle against TRUNCATE patterns.
40397 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40398 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40399 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40400 Subtarget)) {
40401 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40402 ShuffleSrcVT.getVectorNumElements();
40403 unsigned Opc =
40404 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40405 if (Depth == 0 && RootOpc == Opc)
40406 return SDValue(); // Nothing to do!
40407 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40408 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40409 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40410 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40411 return DAG.getBitcast(RootVT, Res);
40412 }
40413
40414 // Do we need a more general binary truncation pattern?
40415 if (RootSizeInBits < 512 &&
40416 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40417 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40418 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40419 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40420 // Bail if this was already a truncation or PACK node.
40421 // We sometimes fail to match PACK if we demand known undef elements.
40422 if (Depth == 0 &&
40423 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40424 RootOpc == X86ISD::PACKUS))
40425 return SDValue(); // Nothing to do!
40426 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40427 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40428 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40429 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40430 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40431 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40432 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40433 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40434 return DAG.getBitcast(RootVT, Res);
40435 }
40436 }
40437
40438 // Don't try to re-form single instruction chains under any circumstances now
40439 // that we've done encoding canonicalization for them.
40440 if (Depth < 1)
40441 return SDValue();
40442
40443 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40444 return isTargetShuffleVariableMask(N->getOpcode());
40445 });
40446 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40447 return (N->getOpcode() == X86ISD::VPERMV3 ||
40448 N->getOpcode() == X86ISD::VPERMV);
40449 });
40450
40451 // Depth threshold above which we can efficiently use variable mask shuffles.
40452 int VariableCrossLaneShuffleDepth =
40453 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40454 int VariablePerLaneShuffleDepth =
40455 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40456 AllowVariableCrossLaneMask &=
40457 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40458 AllowVariablePerLaneMask &=
40459 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40460 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40461 // higher depth before combining them.
40462 int BWIVPERMV3ShuffleDepth =
40463 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40464 bool AllowBWIVPERMV3 =
40465 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40466
40467 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40468 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40469 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40470
40471 bool MaskContainsZeros = isAnyZero(Mask);
40472
40473 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40474 // If we have a single input lane-crossing shuffle then lower to VPERMV.
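// e.g. a v8f32 shuffle <7,6,5,4,3,2,1,0> crosses 128-bit lanes, so it needs
// VPERMPS (VPERMV) with a constant index vector instead of an in-lane permute.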
40475 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40476 if (Subtarget.hasAVX2() &&
40477 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40478 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40479 Res = CanonicalizeShuffleInput(MaskVT, V1);
40480 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40481 return DAG.getBitcast(RootVT, Res);
40482 }
40483 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40484 if ((Subtarget.hasAVX512() &&
40485 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40486 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40487 (Subtarget.hasBWI() &&
40488 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40489 (Subtarget.hasVBMI() &&
40490 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40491 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40492 V2 = DAG.getUNDEF(MaskVT);
40493 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40494 return DAG.getBitcast(RootVT, Res);
40495 }
40496 }
40497
40498 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40499 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40500 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40501 ((Subtarget.hasAVX512() &&
40502 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40503 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40504 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40505 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40506 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40508 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40509 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40510 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40511 for (unsigned i = 0; i != NumMaskElts; ++i)
40512 if (Mask[i] == SM_SentinelZero)
40513 Mask[i] = NumMaskElts + i;
40514 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40515 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40516 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40517 return DAG.getBitcast(RootVT, Res);
40518 }
40519
40520 // If that failed and either input is extracted then try to combine as a
40521 // shuffle with the larger type.
40522 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40523 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40524 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40525 IsMaskedShuffle, DAG, DL, Subtarget))
40526 return WideShuffle;
40527
40528 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40529 // (non-VLX will pad to 512-bit shuffles).
40530 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40531 ((Subtarget.hasAVX512() &&
40532 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40533 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40534 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40535 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40536 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40537 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40538 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40539 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40540 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40541 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40542 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40543 return DAG.getBitcast(RootVT, Res);
40544 }
40545 return SDValue();
40546 }
40547
40548 // See if we can combine a single input shuffle with zeros to a bit-mask,
40549 // which is much simpler than any shuffle.
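// e.g. a v4i32 mask <0,zero,2,zero> only keeps elements in place, so it can be
// lowered as an AND with the constant vector <-1,0,-1,0>.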
40550 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40551 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40552 TLI.isTypeLegal(MaskVT)) {
40553 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40554 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40555 APInt UndefElts(NumMaskElts, 0);
40556 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40557 for (unsigned i = 0; i != NumMaskElts; ++i) {
40558 int M = Mask[i];
40559 if (M == SM_SentinelUndef) {
40560 UndefElts.setBit(i);
40561 continue;
40562 }
40563 if (M == SM_SentinelZero)
40564 continue;
40565 EltBits[i] = AllOnes;
40566 }
40567 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40568 Res = CanonicalizeShuffleInput(MaskVT, V1);
40569 unsigned AndOpcode =
40570 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40571 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40572 return DAG.getBitcast(RootVT, Res);
40573 }
40574
40575 // If we have a single input shuffle with different shuffle patterns in the
40576 // 128-bit lanes use the variable mask to VPERMILPS.
40577 // TODO: Combine other mask types at higher depths.
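// e.g. a v8f32 mask <1,0,3,2,6,7,4,5> uses a different pattern in each 128-bit
// lane, so it needs a variable VPERMILPS index vector with each index taken
// modulo 4.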
40578 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40579 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40580 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40581 SmallVector<SDValue, 16> VPermIdx;
40582 for (int M : Mask) {
40583 SDValue Idx =
40584 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40585 VPermIdx.push_back(Idx);
40586 }
40587 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40588 Res = CanonicalizeShuffleInput(MaskVT, V1);
40589 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40590 return DAG.getBitcast(RootVT, Res);
40591 }
40592
40593 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40594 // to VPERMIL2PD/VPERMIL2PS.
40595 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40596 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40597 MaskVT == MVT::v8f32)) {
40598 // VPERMIL2 Operation.
40599 // Bits[3] - Match Bit.
40600 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40601 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40602 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40603 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40604 SmallVector<int, 8> VPerm2Idx;
40605 unsigned M2ZImm = 0;
40606 for (int M : Mask) {
40607 if (M == SM_SentinelUndef) {
40608 VPerm2Idx.push_back(-1);
40609 continue;
40610 }
40611 if (M == SM_SentinelZero) {
40612 M2ZImm = 2;
40613 VPerm2Idx.push_back(8);
40614 continue;
40615 }
40616 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40617 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40618 VPerm2Idx.push_back(Index);
40619 }
40620 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40621 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40622 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40623 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40624 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40625 return DAG.getBitcast(RootVT, Res);
40626 }
40627
40628 // If we have 3 or more shuffle instructions or a chain involving a variable
40629 // mask, we can replace them with a single PSHUFB instruction profitably.
40630 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40631 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40632 // more aggressive.
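// e.g. for a v8i16 root each mask element M expands to the byte pair
// {2*M, 2*M+1}, and a mask byte of 0x80 zeroes that destination byte.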
40633 if (UnaryShuffle && AllowVariablePerLaneMask &&
40634 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40635 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40636 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40637 SmallVector<SDValue, 16> PSHUFBMask;
40638 int NumBytes = RootVT.getSizeInBits() / 8;
40639 int Ratio = NumBytes / NumMaskElts;
40640 for (int i = 0; i < NumBytes; ++i) {
40641 int M = Mask[i / Ratio];
40642 if (M == SM_SentinelUndef) {
40643 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40644 continue;
40645 }
40646 if (M == SM_SentinelZero) {
40647 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40648 continue;
40649 }
40650 M = Ratio * M + i % Ratio;
40651 assert((M / 16) == (i / 16) && "Lane crossing detected");
40652 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40653 }
40654 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40655 Res = CanonicalizeShuffleInput(ByteVT, V1);
40656 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40657 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40658 return DAG.getBitcast(RootVT, Res);
40659 }
40660
40661 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40662 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40663 // slower than PSHUFB on targets that support both.
40664 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40665 Subtarget.hasXOP()) {
40666 // VPPERM Mask Operation
40667 // Bits[4:0] - Byte Index (0 - 31)
40668 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40669 SmallVector<SDValue, 16> VPPERMMask;
40670 int NumBytes = 16;
40671 int Ratio = NumBytes / NumMaskElts;
40672 for (int i = 0; i < NumBytes; ++i) {
40673 int M = Mask[i / Ratio];
40674 if (M == SM_SentinelUndef) {
40675 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40676 continue;
40677 }
40678 if (M == SM_SentinelZero) {
40679 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40680 continue;
40681 }
40682 M = Ratio * M + i % Ratio;
40683 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40684 }
40685 MVT ByteVT = MVT::v16i8;
40686 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40687 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40688 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40689 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40690 return DAG.getBitcast(RootVT, Res);
40691 }
40692
40693 // If that failed and either input is extracted then try to combine as a
40694 // shuffle with the larger type.
40695 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40696 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40697 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40698 DAG, DL, Subtarget))
40699 return WideShuffle;
40700
40701 // If we have a dual input shuffle then lower to VPERMV3,
40702 // (non-VLX will pad to 512-bit shuffles)
40703 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40704 ((Subtarget.hasAVX512() &&
40705 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40706 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40707 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40708 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40709 MaskVT == MVT::v16i32)) ||
40710 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40711 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40712 MaskVT == MVT::v32i16)) ||
40713 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40714 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40715 MaskVT == MVT::v64i8)))) {
40716 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40717 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40718 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40719 return DAG.getBitcast(RootVT, Res);
40720 }
40721
40722 // Failed to find any combines.
40723 return SDValue();
40724}
40725
40726// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40727// instruction if possible.
40728//
40729// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40730// type size to attempt to combine:
40731// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40732// -->
40733// extract_subvector(shuffle(x,y,m2),0)
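// e.g. if both inputs are extracted from the upper halves of 256-bit vectors,
// the shuffle can be performed at 256-bit width (with the mask offset by the
// extract indices) and the low 128 bits extracted afterwards.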
40734 static SDValue combineX86ShuffleChainWithExtract(
40735 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40736 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40737 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40738 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40739 const X86Subtarget &Subtarget) {
40740 unsigned NumMaskElts = BaseMask.size();
40741 unsigned NumInputs = Inputs.size();
40742 if (NumInputs == 0)
40743 return SDValue();
40744
40745 unsigned RootSizeInBits = RootVT.getSizeInBits();
40746 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40747 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40748
40749 // Peek through subvectors to find widest legal vector.
40750 // TODO: Handle ISD::TRUNCATE
40751 unsigned WideSizeInBits = RootSizeInBits;
40752 for (SDValue Input : Inputs) {
40753 Input = peekThroughBitcasts(Input);
40754 while (1) {
40755 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40756 Input = peekThroughBitcasts(Input.getOperand(0));
40757 continue;
40758 }
40759 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40760 Input.getOperand(0).isUndef() &&
40761 isNullConstant(Input.getOperand(2))) {
40762 Input = peekThroughBitcasts(Input.getOperand(1));
40763 continue;
40764 }
40765 break;
40766 }
40767 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40768 WideSizeInBits < Input.getValueSizeInBits())
40769 WideSizeInBits = Input.getValueSizeInBits();
40770 }
40771
40772 // Bail if we fail to find a source larger than the existing root.
40773 if (WideSizeInBits <= RootSizeInBits ||
40774 (WideSizeInBits % RootSizeInBits) != 0)
40775 return SDValue();
40776
40777 // Create new mask for larger type.
40778 SmallVector<int, 64> WideMask;
40779 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40780
40781 // Attempt to peek through inputs and adjust mask when we extract from an
40782 // upper subvector.
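// e.g. an input extracted at index 4 from a v8f32 source has 4 added to its
// mask indices once the shuffle is widened to 256 bits.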
40783 int AdjustedMasks = 0;
40784 SmallVector<SDValue, 4> WideInputs(Inputs);
40785 for (unsigned I = 0; I != NumInputs; ++I) {
40786 SDValue &Input = WideInputs[I];
40787 Input = peekThroughBitcasts(Input);
40788 while (1) {
40789 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40790 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40791 uint64_t Idx = Input.getConstantOperandVal(1);
40792 if (Idx != 0) {
40793 ++AdjustedMasks;
40794 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40795 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40796
40797 int lo = I * WideMask.size();
40798 int hi = (I + 1) * WideMask.size();
40799 for (int &M : WideMask)
40800 if (lo <= M && M < hi)
40801 M += Idx;
40802 }
40803 Input = peekThroughBitcasts(Input.getOperand(0));
40804 continue;
40805 }
40806 // TODO: Handle insertions into upper subvectors.
40807 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40808 Input.getOperand(0).isUndef() &&
40809 isNullConstant(Input.getOperand(2))) {
40810 Input = peekThroughBitcasts(Input.getOperand(1));
40811 continue;
40812 }
40813 break;
40814 }
40815 }
40816
40817 // Remove unused/repeated shuffle source ops.
40818 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40819 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40820
40821 // Bail if we're always extracting from the lowest subvectors
40822 // (combineX86ShuffleChain should match this for the current width), or if
40823 // the shuffle still references too many inputs.
40824 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40825 return SDValue();
40826
40827 // Minor canonicalization of the accumulated shuffle mask to make it easier
40828 // to match below. All this does is detect masks with sequential pairs of
40829 // elements, and shrink them to the half-width mask. It does this in a loop
40830 // so it will reduce the size of the mask to the minimal width mask which
40831 // performs an equivalent shuffle.
40832 while (WideMask.size() > 1) {
40833 SmallVector<int, 64> WidenedMask;
40834 if (!canWidenShuffleElements(WideMask, WidenedMask))
40835 break;
40836 WideMask = std::move(WidenedMask);
40837 }
40838
40839 // Canonicalization of binary shuffle masks to improve pattern matching by
40840 // commuting the inputs.
40841 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40842 ShuffleVectorSDNode::commuteMask(WideMask);
40843 std::swap(WideInputs[0], WideInputs[1]);
40844 }
40845
40846 // Increase depth for every upper subvector we've peeked through.
40847 Depth += AdjustedMasks;
40848
40849 // Attempt to combine wider chain.
40850 // TODO: Can we use a better Root?
40851 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40852 WideInputs.back().getValueSizeInBits()
40853 ? WideInputs.front()
40854 : WideInputs.back();
40855 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40856 "WideRootSize mismatch");
40857
40858 if (SDValue WideShuffle = combineX86ShuffleChain(
40859 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40860 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40861 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40862 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40863 return DAG.getBitcast(RootVT, WideShuffle);
40864 }
40865
40866 return SDValue();
40867}
40868
40869// Canonicalize the combined shuffle mask chain with horizontal ops.
40870// NOTE: This may update the Ops and Mask.
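// e.g. for v4f32, shuffle(hadd(a,b),hadd(c,d),<0,1,4,5>) selects the low
// halves of both hops and simplifies to the single node hadd(a,c).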
40871 static SDValue canonicalizeShuffleMaskWithHorizOp(
40872 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40873 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40874 const X86Subtarget &Subtarget) {
40875 if (Mask.empty() || Ops.empty())
40876 return SDValue();
40877
40878 SmallVector<SDValue> BC;
40879 for (SDValue Op : Ops)
40880 BC.push_back(peekThroughBitcasts(Op));
40881
40882 // All ops must be the same horizop + type.
40883 SDValue BC0 = BC[0];
40884 EVT VT0 = BC0.getValueType();
40885 unsigned Opcode0 = BC0.getOpcode();
40886 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40887 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40888 }))
40889 return SDValue();
40890
40891 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40892 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40893 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40894 if (!isHoriz && !isPack)
40895 return SDValue();
40896
40897 // Do all ops have a single use?
40898 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40899 return Op.hasOneUse() &&
40900 peekThroughBitcasts(Op).hasOneUse();
40901 });
40902
40903 int NumElts = VT0.getVectorNumElements();
40904 int NumLanes = VT0.getSizeInBits() / 128;
40905 int NumEltsPerLane = NumElts / NumLanes;
40906 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40907 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40908 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40909
40910 if (NumEltsPerLane >= 4 &&
40911 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40912 SmallVector<int> LaneMask, ScaledMask;
40913 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40914 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40915 // See if we can remove the shuffle by reordering the HOP chain so that
40916 // the HOP args are pre-shuffled.
40917 // TODO: Generalize to any sized/depth chain.
40918 // TODO: Add support for PACKSS/PACKUS.
40919 if (isHoriz) {
40920 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40921 auto GetHOpSrc = [&](int M) {
40922 if (M == SM_SentinelUndef)
40923 return DAG.getUNDEF(VT0);
40924 if (M == SM_SentinelZero)
40925 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40926 SDValue Src0 = BC[M / 4];
40927 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40928 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40929 return Src1.getOperand(M % 2);
40930 return SDValue();
40931 };
40932 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40933 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40934 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40935 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40936 if (M0 && M1 && M2 && M3) {
40937 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40938 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40939 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40940 }
40941 }
40942 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40943 if (Ops.size() >= 2) {
40944 SDValue LHS, RHS;
40945 auto GetHOpSrc = [&](int M, int &OutM) {
40946 // TODO: Support SM_SentinelZero
40947 if (M < 0)
40948 return M == SM_SentinelUndef;
40949 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40950 if (!LHS || LHS == Src) {
40951 LHS = Src;
40952 OutM = (M % 2);
40953 return true;
40954 }
40955 if (!RHS || RHS == Src) {
40956 RHS = Src;
40957 OutM = (M % 2) + 2;
40958 return true;
40959 }
40960 return false;
40961 };
40962 int PostMask[4] = {-1, -1, -1, -1};
40963 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40964 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40965 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40966 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40967 LHS = DAG.getBitcast(SrcVT, LHS);
40968 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40969 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40970 // Use SHUFPS for the permute so this will work on SSE2 targets;
40971 // shuffle combining and domain handling will simplify this later on.
40972 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40973 Res = DAG.getBitcast(ShuffleVT, Res);
40974 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40975 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40976 }
40977 }
40978 }
40979 }
40980
40981 if (2 < Ops.size())
40982 return SDValue();
40983
40984 SDValue BC1 = BC[BC.size() - 1];
40985 if (Mask.size() == VT0.getVectorNumElements()) {
40986 // Canonicalize binary shuffles of horizontal ops that use the
40987 // same sources to a unary shuffle.
40988 // TODO: Try to perform this fold even if the shuffle remains.
40989 if (Ops.size() == 2) {
40990 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40991 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40992 };
40993 // Commute if all BC0's ops are contained in BC1.
40994 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40995 ContainsOps(BC1, BC0.getOperand(1))) {
40996 ShuffleVectorSDNode::commuteMask(Mask);
40997 std::swap(Ops[0], Ops[1]);
40998 std::swap(BC0, BC1);
40999 }
41000
41001 // If BC1 can be represented by BC0, then convert to unary shuffle.
41002 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41003 ContainsOps(BC0, BC1.getOperand(1))) {
41004 for (int &M : Mask) {
41005 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41006 continue;
41007 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41008 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41009 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41010 M += NumHalfEltsPerLane;
41011 }
41012 }
41013 }
41014
41015 // Canonicalize unary horizontal ops to only refer to lower halves.
41016 for (int i = 0; i != NumElts; ++i) {
41017 int &M = Mask[i];
41018 if (isUndefOrZero(M))
41019 continue;
41020 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41021 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41022 M -= NumHalfEltsPerLane;
41023 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41024 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41025 M -= NumHalfEltsPerLane;
41026 }
41027 }
41028
41029 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41030 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41031 // represents the LHS/RHS inputs for the lower/upper halves.
41032 SmallVector<int, 16> TargetMask128, WideMask128;
41033 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41034 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41035 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41036 bool SingleOp = (Ops.size() == 1);
41037 if (isPack || OneUseOps ||
41038 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41039 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41040 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41041 Lo = Lo.getOperand(WideMask128[0] & 1);
41042 Hi = Hi.getOperand(WideMask128[1] & 1);
41043 if (SingleOp) {
41044 SDValue Undef = DAG.getUNDEF(SrcVT);
41045 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41046 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41047 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41048 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41049 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41050 }
41051 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41052 }
41053 }
41054
41055 // If we are post-shuffling a 256-bit hop and not requiring the upper
41056 // elements, then try to narrow to a 128-bit hop directly.
41057 SmallVector<int, 16> WideMask64;
41058 if (Ops.size() == 1 && NumLanes == 2 &&
41059 scaleShuffleElements(Mask, 4, WideMask64) &&
41060 isUndefInRange(WideMask64, 2, 2)) {
41061 int M0 = WideMask64[0];
41062 int M1 = WideMask64[1];
41063 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41064 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41065 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41066 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41067 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41068 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41069 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41070 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41071 }
41072 }
41073
41074 return SDValue();
41075}
41076
41077// Attempt to constant fold all of the constant source ops.
41078// Returns true if the entire shuffle is folded to a constant.
41079// TODO: Extend this to merge multiple constant Ops and update the mask.
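// e.g. a shuffle whose sources are all constant build vectors folds here into
// a single constant vector, with zeroable mask elements becoming zero lanes
// and undef mask elements becoming undef lanes.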
41080 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41081 ArrayRef<int> Mask,
41082 ArrayRef<const SDNode *> SrcNodes,
41083 SelectionDAG &DAG, const SDLoc &DL,
41084 const X86Subtarget &Subtarget) {
41085 unsigned SizeInBits = VT.getSizeInBits();
41086 unsigned NumMaskElts = Mask.size();
41087 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41088 unsigned NumOps = Ops.size();
41089
41090 // Extract constant bits from each source op.
41091 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41092 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41093 for (unsigned I = 0; I != NumOps; ++I)
41094 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41095 RawBitsOps[I],
41096 /*AllowWholeUndefs*/ true,
41097 /*AllowPartialUndefs*/ true))
41098 return SDValue();
41099
41100 // If we're optimizing for size, only fold if at least one of the constants is
41101 // only used once or the combined shuffle has included a variable mask
41102 // shuffle; this avoids constant pool bloat.
41103 bool IsOptimizingSize = DAG.shouldOptForSize();
41104 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41105 return isTargetShuffleVariableMask(N->getOpcode());
41106 });
41107 if (IsOptimizingSize && !HasVariableMask &&
41108 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41109 return SDValue();
41110
41111 // Shuffle the constant bits according to the mask.
41112 APInt UndefElts(NumMaskElts, 0);
41113 APInt ZeroElts(NumMaskElts, 0);
41114 APInt ConstantElts(NumMaskElts, 0);
41115 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41116 APInt::getZero(MaskSizeInBits));
41117 for (unsigned i = 0; i != NumMaskElts; ++i) {
41118 int M = Mask[i];
41119 if (M == SM_SentinelUndef) {
41120 UndefElts.setBit(i);
41121 continue;
41122 } else if (M == SM_SentinelZero) {
41123 ZeroElts.setBit(i);
41124 continue;
41125 }
41126 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41127
41128 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41129 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41130
41131 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41132 if (SrcUndefElts[SrcMaskIdx]) {
41133 UndefElts.setBit(i);
41134 continue;
41135 }
41136
41137 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41138 APInt &Bits = SrcEltBits[SrcMaskIdx];
41139 if (!Bits) {
41140 ZeroElts.setBit(i);
41141 continue;
41142 }
41143
41144 ConstantElts.setBit(i);
41145 ConstantBitData[i] = Bits;
41146 }
41147 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41148
41149 // Attempt to create a zero vector.
41150 if ((UndefElts | ZeroElts).isAllOnes())
41151 return getZeroVector(VT, Subtarget, DAG, DL);
41152
41153 // Create the constant data.
41154 MVT MaskSVT;
41155 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41156 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41157 else
41158 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41159
41160 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41161 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41162 return SDValue();
41163
41164 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41165 return DAG.getBitcast(VT, CstOp);
41166}
41167
41168namespace llvm {
41169 namespace X86 {
41170 enum {
41171 MaxShuffleCombineDepth = 8
41172 };
41173 } // namespace X86
41174} // namespace llvm
41175
41176/// Fully generic combining of x86 shuffle instructions.
41177///
41178/// This should be the last combine run over the x86 shuffle instructions. Once
41179/// they have been fully optimized, this will recursively consider all chains
41180/// of single-use shuffle instructions, build a generic model of the cumulative
41181/// shuffle operation, and check for simpler instructions which implement this
41182/// operation. We use this primarily for two purposes:
41183///
41184/// 1) Collapse generic shuffles to specialized single instructions when
41185/// equivalent. In most cases, this is just an encoding size win, but
41186/// sometimes we will collapse multiple generic shuffles into a single
41187/// special-purpose shuffle.
41188/// 2) Look for sequences of shuffle instructions with 3 or more total
41189/// instructions, and replace them with the slightly more expensive SSSE3
41190/// PSHUFB instruction if available. We do this as the last combining step
41191/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41192/// a suitable short sequence of other instructions. The PSHUFB will either
41193/// use a register or have to read from memory and so is slightly (but only
41194/// slightly) more expensive than the other shuffle instructions.
41195///
41196/// Because this is inherently a quadratic operation (for each shuffle in
41197/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41198/// This should never be an issue in practice as the shuffle lowering doesn't
41199/// produce sequences of more than 8 instructions.
41200///
41201/// FIXME: We will currently miss some cases where the redundant shuffling
41202/// would simplify under the threshold for PSHUFB formation because of
41203/// combine-ordering. To fix this, we should do the redundant instruction
41204/// combining in this recursive walk.
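/// For example, a PSHUFD with mask <2,3,0,1> applied to the result of a PSHUFD
/// with mask <1,0,3,2> accumulates to the single mask <3,2,1,0>, which then
/// lowers as one shuffle.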
41205 static SDValue combineX86ShufflesRecursively(
41206 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41207 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41208 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41209 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41210 const SDLoc &DL, const X86Subtarget &Subtarget) {
41211 assert(!RootMask.empty() &&
41212 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41213 "Illegal shuffle root mask");
41214 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41215 unsigned RootSizeInBits = RootVT.getSizeInBits();
41216 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41217
41218 // Bound the depth of our recursive combine because this is ultimately
41219 // quadratic in nature.
41220 if (Depth >= MaxDepth)
41221 return SDValue();
41222
41223 // Directly rip through bitcasts to find the underlying operand.
41224 SDValue Op = SrcOps[SrcOpIndex];
41225 Op = peekThroughBitcasts(Op);
41226
41227 EVT VT = Op.getValueType();
41228 if (!VT.isVector() || !VT.isSimple())
41229 return SDValue(); // Bail if we hit a non-simple non-vector.
41230
41231 // FIXME: Just bail on f16 for now.
41232 if (VT.getVectorElementType() == MVT::f16)
41233 return SDValue();
41234
41235 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41236 "Can only combine shuffles upto size of the root op.");
41237
41238 // Create a demanded elts mask from the referenced elements of Op.
41239 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41240 for (int M : RootMask) {
41241 int BaseIdx = RootMask.size() * SrcOpIndex;
41242 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41243 OpDemandedElts.setBit(M - BaseIdx);
41244 }
41245 if (RootSizeInBits != VT.getSizeInBits()) {
41246 // Op is smaller than Root - extract the demanded elts for the subvector.
41247 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41248 unsigned NumOpMaskElts = RootMask.size() / Scale;
41249 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41250 assert(OpDemandedElts
41251 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41252 .isZero() &&
41253 "Out of range elements referenced in root mask");
41254 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41255 }
41256 OpDemandedElts =
41257 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41258
41259 // Extract target shuffle mask and resolve sentinels and inputs.
41260 SmallVector<int, 64> OpMask;
41261 SmallVector<SDValue, 2> OpInputs;
41262 APInt OpUndef, OpZero;
41263 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41264 OpZero, DAG, Depth, false)) {
41265 // Shuffle inputs must not be larger than the shuffle result.
41266 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41267 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41268 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41269 }))
41270 return SDValue();
41271 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41272 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41273 !isNullConstant(Op.getOperand(1))) {
41274 SDValue SrcVec = Op.getOperand(0);
41275 int ExtractIdx = Op.getConstantOperandVal(1);
41276 unsigned NumElts = VT.getVectorNumElements();
41277 OpInputs.assign({SrcVec});
41278 OpMask.assign(NumElts, SM_SentinelUndef);
41279 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41280 OpZero = OpUndef = APInt::getZero(NumElts);
41281 } else {
41282 return SDValue();
41283 }
41284
41285 // If the shuffle result was smaller than the root, we need to adjust the
41286 // mask indices and pad the mask with undefs.
41287 if (RootSizeInBits > VT.getSizeInBits()) {
41288 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41289 unsigned OpMaskSize = OpMask.size();
41290 if (OpInputs.size() > 1) {
41291 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41292 for (int &M : OpMask) {
41293 if (M < 0)
41294 continue;
41295 int EltIdx = M % OpMaskSize;
41296 int OpIdx = M / OpMaskSize;
41297 M = (PaddedMaskSize * OpIdx) + EltIdx;
41298 }
41299 }
41300 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41301 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41302 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41303 }
41304
41305 SmallVector<SDValue, 16> Ops;
41306 SmallVector<int, 64> Mask;
41307
41308 // We don't need to merge masks if the root is empty.
41309 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41310 if (EmptyRoot) {
41311 // Only resolve zeros if it will remove an input, otherwise we might end
41312 // up in an infinite loop.
41313 bool ResolveKnownZeros = true;
41314 if (!OpZero.isZero()) {
41315 APInt UsedInputs = APInt::getZero(OpInputs.size());
41316 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41317 int M = OpMask[i];
41318 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41319 continue;
41320 UsedInputs.setBit(M / OpMask.size());
41321 if (UsedInputs.isAllOnes()) {
41322 ResolveKnownZeros = false;
41323 break;
41324 }
41325 }
41326 }
41327 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41328 ResolveKnownZeros);
41329
41330 Mask = OpMask;
41331 Ops.append(OpInputs.begin(), OpInputs.end());
41332 } else {
41333 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41334
41335 // Add the inputs to the Ops list, avoiding duplicates.
41336 Ops.append(SrcOps.begin(), SrcOps.end());
41337
41338 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41339 // Attempt to find an existing match.
41340 SDValue InputBC = peekThroughBitcasts(Input);
41341 for (int i = 0, e = Ops.size(); i < e; ++i)
41342 if (InputBC == peekThroughBitcasts(Ops[i]))
41343 return i;
41344 // Match failed - should we replace an existing Op?
41345 if (InsertionPoint >= 0) {
41346 Ops[InsertionPoint] = Input;
41347 return InsertionPoint;
41348 }
41349 // Add to the end of the Ops list.
41350 Ops.push_back(Input);
41351 return Ops.size() - 1;
41352 };
41353
41354 SmallVector<int, 2> OpInputIdx;
41355 for (SDValue OpInput : OpInputs)
41356 OpInputIdx.push_back(
41357 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41358
41359 assert(((RootMask.size() > OpMask.size() &&
41360 RootMask.size() % OpMask.size() == 0) ||
41361 (OpMask.size() > RootMask.size() &&
41362 OpMask.size() % RootMask.size() == 0) ||
41363 OpMask.size() == RootMask.size()) &&
41364 "The smaller number of elements must divide the larger.");
41365
41366 // This function can be performance-critical, so we rely on the power-of-2
41367 // knowledge that we have about the mask sizes to replace div/rem ops with
41368 // bit-masks and shifts.
41370 "Non-power-of-2 shuffle mask sizes");
41372 "Non-power-of-2 shuffle mask sizes");
41373 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41374 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41375
41376 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41377 unsigned RootRatio =
41378 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41379 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41380 assert((RootRatio == 1 || OpRatio == 1) &&
41381 "Must not have a ratio for both incoming and op masks!");
41382
41383 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41384 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41385 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41386 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41387 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41388
41389 Mask.resize(MaskWidth, SM_SentinelUndef);
41390
41391 // Merge this shuffle operation's mask into our accumulated mask. Note that
41392 // this shuffle's mask will be the first applied to the input, followed by
41393 // the root mask to get us all the way to the root value arrangement. The
41394 // reason for this order is that we are recursing up the operation chain.
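// i.e. for each output element i, Mask[i] ends up as OpMask[RootMask[i]],
// after both masks have been scaled to the common mask width.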
41395 for (unsigned i = 0; i < MaskWidth; ++i) {
41396 unsigned RootIdx = i >> RootRatioLog2;
41397 if (RootMask[RootIdx] < 0) {
41398 // This is a zero or undef lane, we're done.
41399 Mask[i] = RootMask[RootIdx];
41400 continue;
41401 }
41402
41403 unsigned RootMaskedIdx =
41404 RootRatio == 1
41405 ? RootMask[RootIdx]
41406 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41407
41408 // Just insert the scaled root mask value if it references an input other
41409 // than the SrcOp we're currently inserting.
41410 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41411 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41412 Mask[i] = RootMaskedIdx;
41413 continue;
41414 }
41415
41416 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41417 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41418 if (OpMask[OpIdx] < 0) {
41419 // The incoming lanes are zero or undef, it doesn't matter which ones we
41420 // are using.
41421 Mask[i] = OpMask[OpIdx];
41422 continue;
41423 }
41424
41425 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41426 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41427 : (OpMask[OpIdx] << OpRatioLog2) +
41428 (RootMaskedIdx & (OpRatio - 1));
41429
41430 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41431 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41432 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41433 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41434
41435 Mask[i] = OpMaskedIdx;
41436 }
41437 }
41438
41439 // Peek through any free bitcasts to insert_subvector vector widenings or
41440 // extract_subvector nodes back to root size.
41441 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41442 for (auto [I, Op] : enumerate(Ops)) {
41443 SDValue BC = Op;
41444 while (1) {
41445 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41446 BC = BC.getOperand(0);
41447 continue;
41448 }
41449 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41450 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41451 // Set out of bounds mask indices to undef.
41452 Op = BC = BC.getOperand(1);
41453 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41454 int Lo = I * Mask.size();
41455 int Hi = (I + 1) * Mask.size();
41456 int NewHi = Lo + (Mask.size() / Scale);
41457 for (int &M : Mask) {
41458 if (Lo <= M && NewHi <= M && M < Hi)
41459 M = SM_SentinelUndef;
41460 }
41461 continue;
41462 }
41463 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41464 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41465 isNullConstant(BC.getOperand(1))) {
41466 Op = BC = BC.getOperand(0);
41467 continue;
41468 }
41469 break;
41470 }
41471 }
41472
41473 // Remove unused/repeated shuffle source ops.
41474 resolveTargetShuffleInputsAndMask(Ops, Mask);
41475
41476 // Handle the all undef/zero/ones cases early.
41477 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41478 return DAG.getUNDEF(RootVT);
41479 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41480 return getZeroVector(RootVT, Subtarget, DAG, DL);
41481 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41482 !llvm::is_contained(Mask, SM_SentinelZero))
41483 return getOnesVector(RootVT, DAG, DL);
41484
41485 assert(!Ops.empty() && "Shuffle with no inputs detected");
41486
41487 // Update the list of shuffle nodes that have been combined so far.
41488 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41489 CombinedNodes.push_back(Op.getNode());
41490
41491 // See if we can recurse into each shuffle source op (if it's a target
41492 // shuffle). The source op should only be generally combined if it either has
41493 // a single use (i.e. current Op) or all its users have already been combined;
41494 // if not, then we can still combine but should prevent generation of variable
41495 // shuffles to avoid constant pool bloat.
41496 // Don't recurse if we already have more source ops than we can combine in
41497 // the remaining recursion depth.
41498 if (Ops.size() < (MaxDepth - Depth)) {
41499 for (int i = 0, e = Ops.size(); i < e; ++i) {
41500 // For empty roots, we need to resolve zeroable elements before combining
41501 // them with other shuffles.
41502 SmallVector<int, 64> ResolvedMask = Mask;
41503 if (EmptyRoot)
41504 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41505 bool AllowCrossLaneVar = false;
41506 bool AllowPerLaneVar = false;
41507 if (Ops[i].getNode()->hasOneUse() ||
41508 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41509 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41510 AllowPerLaneVar = AllowVariablePerLaneMask;
41511 }
41512 if (SDValue Res = combineX86ShufflesRecursively(
41513 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41514 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41515 DAG, DL, Subtarget))
41516 return Res;
41517 }
41518 }
41519
41520 // Attempt to constant fold all of the constant source ops.
41521 if (SDValue Cst = combineX86ShufflesConstants(
41522 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41523 return Cst;
41524
41525 // If constant fold failed and we only have constants - then we have
41526 // multiple uses by a single non-variable shuffle - just bail.
41527 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41528 APInt UndefElts;
41529 SmallVector<APInt> RawBits;
41530 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41531 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41532 RawBits,
41533 /*AllowWholeUndefs*/ true,
41534 /*AllowPartialUndefs*/ true);
41535 })) {
41536 return SDValue();
41537 }
41538
41539 // Canonicalize the combined shuffle mask chain with horizontal ops.
41540 // NOTE: This will update the Ops and Mask.
41541 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41542 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41543 return DAG.getBitcast(RootVT, HOp);
41544
41545 // Try to refine our inputs given our knowledge of target shuffle mask.
41546 for (auto I : enumerate(Ops)) {
41547 int OpIdx = I.index();
41548 SDValue &Op = I.value();
41549
41550 // What range of shuffle mask element values results in picking from Op?
41551 int Lo = OpIdx * Mask.size();
41552 int Hi = Lo + Mask.size();
41553
41554 // Which elements of Op do we demand, given the mask's granularity?
41555 APInt OpDemandedElts(Mask.size(), 0);
41556 for (int MaskElt : Mask) {
41557 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41558 int OpEltIdx = MaskElt - Lo;
41559 OpDemandedElts.setBit(OpEltIdx);
41560 }
41561 }
41562
41563 // Is the shuffle result smaller than the root?
41564 if (Op.getValueSizeInBits() < RootSizeInBits) {
41565 // We padded the mask with undefs. But we now need to undo that.
41566 unsigned NumExpectedVectorElts = Mask.size();
41567 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41568 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41569 assert(!OpDemandedElts.extractBits(
41570 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41571 "Demanding the virtual undef widening padding?");
41572 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41573 }
41574
41575 // The Op itself may be of different VT, so we need to scale the mask.
41576 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41577 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41578
41579 // Can this operand be simplified any further, given its demanded elements?
41580 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41581 Op, OpScaledDemandedElts, DAG))
41582 Op = NewOp;
41583 }
41584 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41585
41586 // Widen any subvector shuffle inputs we've collected.
41587 // TODO: Remove this to avoid generating temporary nodes, we should only
41588 // widen once combineX86ShuffleChain has found a match.
41589 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41590 return Op.getValueSizeInBits() < RootSizeInBits;
41591 })) {
41592 for (SDValue &Op : Ops)
41593 if (Op.getValueSizeInBits() < RootSizeInBits)
41594 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41595 RootSizeInBits);
41596 // Reresolve - we might have repeated subvector sources.
41597 resolveTargetShuffleInputsAndMask(Ops, Mask);
41598 }
41599
41600 // Handle the all undef/zero/ones cases.
41601 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41602 return DAG.getUNDEF(RootVT);
41603 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41604 return getZeroVector(RootVT, Subtarget, DAG, DL);
41605 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41606 !llvm::is_contained(Mask, SM_SentinelZero))
41607 return getOnesVector(RootVT, DAG, DL);
41608
41609 assert(!Ops.empty() && "Shuffle with no inputs detected");
41610
41611 // We can only combine unary and binary shuffle mask cases.
41612 if (Ops.size() <= 2) {
41613 // Minor canonicalization of the accumulated shuffle mask to make it easier
41614 // to match below. All this does is detect masks with sequential pairs of
41615 // elements, and shrink them to the half-width mask. It does this in a loop
41616 // so it will reduce the size of the mask to the minimal width mask which
41617 // performs an equivalent shuffle.
41618 while (Mask.size() > 1) {
41619 SmallVector<int, 64> WidenedMask;
41620 if (!canWidenShuffleElements(Mask, WidenedMask))
41621 break;
41622 Mask = std::move(WidenedMask);
41623 }
41624
41625 // Canonicalization of binary shuffle masks to improve pattern matching by
41626 // commuting the inputs.
41627 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41628 ShuffleVectorSDNode::commuteMask(Mask);
41629 std::swap(Ops[0], Ops[1]);
41630 }
41631
41632 // Try to combine into a single shuffle instruction.
41633 if (SDValue Shuffle = combineX86ShuffleChain(
41634 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41635 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41636 IsMaskedShuffle, DAG, DL, Subtarget))
41637 return Shuffle;
41638
41639 // If all the operands come from the same larger vector, fallthrough and try
41640 // to use combineX86ShuffleChainWithExtract.
41641 SDValue LHS = peekThroughBitcasts(Ops.front());
41642 SDValue RHS = peekThroughBitcasts(Ops.back());
41643 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41644 (RootSizeInBits / Mask.size()) != 64 ||
41645 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41646 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41647 LHS.getOperand(0) != RHS.getOperand(0))
41648 return SDValue();
41649 }
41650
41651 // If that failed and any input is extracted then try to combine as a
41652 // shuffle with the larger type.
41653 return combineX86ShuffleChainWithExtract(
41654 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41655 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41656 DAG, DL, Subtarget);
41657}
41658
41659/// Helper entry wrapper to combineX86ShufflesRecursively.
41660 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41661 const X86Subtarget &Subtarget) {
41662 return combineX86ShufflesRecursively(
41663 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41664 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41665 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41666 SDLoc(Op), Subtarget);
41667}
41668
41669/// Get the PSHUF-style mask from PSHUF node.
41670///
41671 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41672/// PSHUF-style masks that can be reused with such instructions.
41673 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41674 MVT VT = N.getSimpleValueType();
41675 SmallVector<int, 4> Mask;
41676 SmallVector<SDValue, 2> Ops;
41677 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41678 (void)HaveMask;
41679 assert(HaveMask);
41680
41681 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41682 // matter. Check that the upper masks are repeats and remove them.
41683 if (VT.getSizeInBits() > 128) {
41684 int LaneElts = 128 / VT.getScalarSizeInBits();
41685#ifndef NDEBUG
41686 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41687 for (int j = 0; j < LaneElts; ++j)
41688 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41689 "Mask doesn't repeat in high 128-bit lanes!");
41690#endif
41691 Mask.resize(LaneElts);
41692 }
41693
41694 switch (N.getOpcode()) {
41695 case X86ISD::PSHUFD:
41696 return Mask;
41697 case X86ISD::PSHUFLW:
41698 Mask.resize(4);
41699 return Mask;
41700 case X86ISD::PSHUFHW:
41701 Mask.erase(Mask.begin(), Mask.begin() + 4);
41702 for (int &M : Mask)
41703 M -= 4;
41704 return Mask;
41705 default:
41706 llvm_unreachable("No valid shuffle instruction found!");
41707 }
41708}
41709
41710/// Get the expanded blend mask from a BLENDI node.
41711/// For v16i16 nodes, this will splat the repeated i8 mask.
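/// e.g. a v16i16 blend immediate of 0b10110001 expands to the 16-bit mask
/// 0b1011000110110001.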
41712 static APInt getBLENDIBlendMask(SDValue V) {
41713 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41714 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41715 APInt Mask = V.getConstantOperandAPInt(2);
41716 if (Mask.getBitWidth() > NumElts)
41717 Mask = Mask.trunc(NumElts);
41718 if (NumElts == 16) {
41719 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41720 Mask = APInt::getSplat(16, Mask);
41721 }
41722 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41723 return Mask;
41724}
41725
41726/// Search for a combinable shuffle across a chain ending in pshufd.
41727///
41728/// We walk up the chain and look for a combinable shuffle, skipping over
41729/// shuffles that we could hoist this shuffle's transformation past without
41730/// altering anything.
41731 static SDValue combineRedundantDwordShuffle(SDValue N,
41732 MutableArrayRef<int> Mask,
41733 const SDLoc &DL,
41734 SelectionDAG &DAG) {
41735 assert(N.getOpcode() == X86ISD::PSHUFD &&
41736 "Called with something other than an x86 128-bit half shuffle!");
41737
41738 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41739 // of the shuffles in the chain so that we can form a fresh chain to replace
41740 // this one.
41741 SmallVector<SDValue, 8> Chain;
41742 SDValue V = N.getOperand(0);
41743 for (; V.hasOneUse(); V = V.getOperand(0)) {
41744 switch (V.getOpcode()) {
41745 default:
41746 return SDValue(); // Nothing combined!
41747
41748 case ISD::BITCAST:
41749 // Skip bitcasts as we always know the type for the target specific
41750 // instructions.
41751 continue;
41752
41753 case X86ISD::PSHUFD:
41754 // Found another dword shuffle.
41755 break;
41756
41757 case X86ISD::PSHUFLW:
41758 // Check that the low words (being shuffled) are the identity in the
41759 // dword shuffle, and the high words are self-contained.
41760 if (Mask[0] != 0 || Mask[1] != 1 ||
41761 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41762 return SDValue();
41763
41764 Chain.push_back(V);
41765 continue;
41766
41767 case X86ISD::PSHUFHW:
41768 // Check that the high words (being shuffled) are the identity in the
41769 // dword shuffle, and the low words are self-contained.
41770 if (Mask[2] != 2 || Mask[3] != 3 ||
41771 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41772 return SDValue();
41773
41774 Chain.push_back(V);
41775 continue;
41776
41777 case X86ISD::UNPCKL:
41778 case X86ISD::UNPCKH:
41779 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41780 // shuffle into a preceding word shuffle.
41781 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41782 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41783 return SDValue();
41784
41785 // Search for a half-shuffle which we can combine with.
41786 unsigned CombineOp =
41787 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41788 if (V.getOperand(0) != V.getOperand(1) ||
41789 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41790 return SDValue();
41791 Chain.push_back(V);
41792 V = V.getOperand(0);
41793 do {
41794 switch (V.getOpcode()) {
41795 default:
41796 return SDValue(); // Nothing to combine.
41797
41798 case X86ISD::PSHUFLW:
41799 case X86ISD::PSHUFHW:
41800 if (V.getOpcode() == CombineOp)
41801 break;
41802
41803 Chain.push_back(V);
41804
41805 [[fallthrough]];
41806 case ISD::BITCAST:
41807 V = V.getOperand(0);
41808 continue;
41809 }
41810 break;
41811 } while (V.hasOneUse());
41812 break;
41813 }
41814 // Break out of the loop if we break out of the switch.
41815 break;
41816 }
41817
41818 if (!V.hasOneUse())
41819 // We fell out of the loop without finding a viable combining instruction.
41820 return SDValue();
41821
41822 // Merge this node's mask and our incoming mask.
41823 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41824 for (int &M : Mask)
41825 M = VMask[M];
41826 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41827 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41828
41829 // Rebuild the chain around this new shuffle.
41830 while (!Chain.empty()) {
41831 SDValue W = Chain.pop_back_val();
41832
41833 if (V.getValueType() != W.getOperand(0).getValueType())
41834 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41835
41836 switch (W.getOpcode()) {
41837 default:
41838 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41839
41840 case X86ISD::UNPCKL:
41841 case X86ISD::UNPCKH:
41842 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41843 break;
41844
41845 case X86ISD::PSHUFD:
41846 case X86ISD::PSHUFLW:
41847 case X86ISD::PSHUFHW:
41848 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41849 break;
41850 }
41851 }
41852 if (V.getValueType() != N.getValueType())
41853 V = DAG.getBitcast(N.getValueType(), V);
41854
41855 // Return the new chain to replace N.
41856 return V;
41857}
41858
41859// Attempt to commute shufps LHS loads:
41860// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41861 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41862 SelectionDAG &DAG) {
41863 // TODO: Add vXf64 support.
41864 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41865 return SDValue();
41866
41867 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41868 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41869 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41870 return SDValue();
41871 SDValue N0 = V.getOperand(0);
41872 SDValue N1 = V.getOperand(1);
41873 unsigned Imm = V.getConstantOperandVal(2);
41874 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41875 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41876 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41877 return SDValue();
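// The 8-bit SHUFPS immediate selects two elements from each source: bits[3:0]
// index into the first operand and bits[7:4] into the second, so commuting the
// operands amounts to swapping the two nibbles below.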
41878 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41879 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41880 DAG.getTargetConstant(Imm, DL, MVT::i8));
41881 };
41882
41883 switch (N.getOpcode()) {
41884 case X86ISD::VPERMILPI:
41885 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
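// Commuting the SHUFP swaps which half of its result each element comes from,
// so flip bit 1 of every 2-bit VPERMILPI index (0xAA == 0b10101010).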
41886 unsigned Imm = N.getConstantOperandVal(1);
41887 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41888 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41889 }
41890 break;
41891 case X86ISD::SHUFP: {
41892 SDValue N0 = N.getOperand(0);
41893 SDValue N1 = N.getOperand(1);
41894 unsigned Imm = N.getConstantOperandVal(2);
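// Only the selections that now read a commuted operand need adjusting: flip
// bit 1 of all four 2-bit fields (^0xAA) when both inputs change, of the low
// two fields (^0x0A) when only N0 does, or the high two (^0xA0) for N1.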
41895 if (N0 == N1) {
41896 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41897 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41898 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41899 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41900 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41901 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41902 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41903 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41904 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41905 }
41906 break;
41907 }
41908 }
41909
41910 return SDValue();
41911}
41912
41913// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41914// iff we don't demand the same element index for both X and Y.
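// e.g. for v4i32, blending pshufd(X,{1,0,3,2}) with pshufd(Y,{1,0,3,2}) via
// blend mask {0,5,2,7} demands X[1],X[3] and Y[0],Y[2]; the index sets are
// disjoint, so blend X and Y first ({4,1,6,3}) and permute once ({1,0,3,2}).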
41915static SDValue
41916 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41917 const APInt &DemandedElts, SelectionDAG &DAG,
41918 const X86Subtarget &Subtarget, const SDLoc &DL) {
41919 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41920 if (!N0.hasOneUse() || !N1.hasOneUse())
41921 return SDValue();
41922
41923 unsigned NumElts = VT.getVectorNumElements();
41924 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41925 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41926
41927 // See if both operands are shuffles, and that we can scale the shuffle masks
41928 // to the same width as the blend mask.
41929 // TODO: Support SM_SentinelZero?
41930 SmallVector<SDValue, 2> Ops0, Ops1;
41931 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41932 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41933 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41934 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41935 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41936 return SDValue();
41937
41938 // Determine the demanded elts from both permutes.
41939 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41940 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41941 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41942 Demanded1,
41943 /*AllowUndefElts=*/true) ||
41944 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41945 DemandedRHS0, /*AllowUndefElts=*/true) ||
41946 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41947 DemandedRHS1, /*AllowUndefElts=*/true))
41948 return SDValue();
41949
41950 // Confirm that we only use a single operand from both permutes and that we
41951 // don't demand the same index from both.
41952 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41953 DemandedLHS0.intersects(DemandedLHS1))
41954 return SDValue();
41955
41956 // Use the permute demanded elts masks as the new blend mask.
41957 // Create the new permute mask as a blend of the 2 original permute masks.
41958 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41959 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41960 for (unsigned I = 0; I != NumElts; ++I) {
41961 if (Demanded0[I]) {
41962 int M = ScaledMask0[I];
41963 if (0 <= M) {
41964 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41965 "BlendMask demands LHS AND RHS");
41966 NewBlendMask[M] = M;
41967 NewPermuteMask[I] = M;
41968 }
41969 } else if (Demanded1[I]) {
41970 int M = ScaledMask1[I];
41971 if (0 <= M) {
41972 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41973 "BlendMask demands LHS AND RHS");
41974 NewBlendMask[M] = M + NumElts;
41975 NewPermuteMask[I] = M;
41976 }
41977 }
41978 }
41979 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41980 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41981
41982 // v16i16 shuffles can explode in complexity very easily, only accept them if
41983 // the blend mask is the same in the 128-bit subvectors (or can widen to
41984 // v8i32) and the permute can be widened as well.
41985 if (VT == MVT::v16i16) {
41986 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41987 !canWidenShuffleElements(NewBlendMask))
41988 return SDValue();
41989 if (!canWidenShuffleElements(NewPermuteMask))
41990 return SDValue();
41991 }
41992
41993 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41994 // widened to a lane permute (vperm2f128).
41995 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41997 NewPermuteMask) &&
41998 !canScaleShuffleElements(NewPermuteMask, 2))
41999 return SDValue();
42000
42001 SDValue NewBlend =
42002 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42003 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42004 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42005 NewPermuteMask);
42006}
42007
42008// TODO - move this to TLI like isBinOp?
42009static bool isUnaryOp(unsigned Opcode) {
42010 switch (Opcode) {
42011 case ISD::CTLZ:
42012 case ISD::CTTZ:
42013 case ISD::CTPOP:
42014 return true;
42015 }
42016 return false;
42017}
42018
42019// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42020// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
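// Moving the shuffle onto the op's sources pairs it with values that are
// usually cheaper to shuffle (constants, splats, or other shuffles), as the
// IsMergeableWithShuffle predicate below checks.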
42021 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42022 const SDLoc &DL) {
42023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42024 EVT ShuffleVT = N.getValueType();
42025 unsigned Opc = N.getOpcode();
42026
42027 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42028 // AllZeros/AllOnes constants are freely shuffled and will peek through
42029 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42030 // merge with target shuffles if it has one use so shuffle combining is
42031 // likely to kick in. Shuffles of splats are expected to be removed.
42032 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42033 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42037 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42038 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42039 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42040 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42041 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42042 };
42043 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42044 // Ensure we only shuffle whole vector src elements, unless its a logical
42045 // binops where we can more aggressively move shuffles from dst to src.
42046 return isLogicOp(BinOp) ||
42047 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42048 };
42049
42050 switch (Opc) {
42051 // Unary and Unary+Permute Shuffles.
42052 case X86ISD::PSHUFB: {
42053 // Don't merge PSHUFB if it contains zero'd elements.
42054 SmallVector<int> Mask;
42055 SmallVector<SDValue> Ops;
42056 if (!getTargetShuffleMask(N, false, Ops, Mask))
42057 break;
42058 [[fallthrough]];
42059 }
42060 case X86ISD::VBROADCAST:
42061 case X86ISD::MOVDDUP:
42062 case X86ISD::PSHUFD:
42063 case X86ISD::PSHUFHW:
42064 case X86ISD::PSHUFLW:
42065 case X86ISD::VPERMV:
42066 case X86ISD::VPERMI:
42067 case X86ISD::VPERMILPI: {
42068 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42069 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42070 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42071 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42072 unsigned SrcOpcode = N0.getOpcode();
42073 EVT OpVT = N0.getValueType();
42074 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42075 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42076 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42077 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42078 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42079 IsMergeableWithShuffle(Op01, FoldShuf)) {
42080 SDValue LHS, RHS;
42081 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42082 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42083 if (Opc == X86ISD::VPERMV) {
42084 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42085 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42086 } else if (N.getNumOperands() == 2) {
42087 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42088 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42089 } else {
42090 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42091 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42092 }
42093 return DAG.getBitcast(ShuffleVT,
42094 DAG.getNode(SrcOpcode, DL, OpVT,
42095 DAG.getBitcast(OpVT, LHS),
42096 DAG.getBitcast(OpVT, RHS)));
42097 }
42098 }
42099 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42100 OpVT.getScalarSizeInBits() ==
42101 N0.getOperand(0).getScalarValueSizeInBits()) {
42102 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42103 if (Opc == X86ISD::VPERMV)
42104 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42105 else if (N.getNumOperands() == 2)
42106 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42107 else
42108 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42109 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42110 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42111 }
42112 }
42113 break;
42114 }
42115 // Binary and Binary+Permute Shuffles.
42116 case X86ISD::INSERTPS: {
42117 // Don't merge INSERTPS if it contains zero'd elements.
42118 unsigned InsertPSMask = N.getConstantOperandVal(2);
42119 unsigned ZeroMask = InsertPSMask & 0xF;
42120 if (ZeroMask != 0)
42121 break;
42122 [[fallthrough]];
42123 }
42124 case X86ISD::MOVSD:
42125 case X86ISD::MOVSS:
42126 case X86ISD::BLENDI:
42127 case X86ISD::SHUFP:
42128 case X86ISD::UNPCKH:
42129 case X86ISD::UNPCKL: {
42130 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42131 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42132 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42133 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42134 unsigned SrcOpcode = N0.getOpcode();
42135 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42136 N0.getValueType() == N1.getValueType() &&
42137 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42138 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42139 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42140 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42141 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42142 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42143 // Ensure the total number of shuffles doesn't increase by folding this
42144 // shuffle through to the source ops.
42145 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42146 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42147 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42148 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42149 SDValue LHS, RHS;
42150 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42151 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42152 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42153 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42154 if (N.getNumOperands() == 3) {
42155 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42156 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42157 } else {
42158 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42159 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42160 }
42161 EVT OpVT = N0.getValueType();
42162 return DAG.getBitcast(ShuffleVT,
42163 DAG.getNode(SrcOpcode, DL, OpVT,
42164 DAG.getBitcast(OpVT, LHS),
42165 DAG.getBitcast(OpVT, RHS)));
42166 }
42167 }
42168 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42169 N0.getValueType() == N1.getValueType() &&
42170 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42171 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42172 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42173 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42174 SDValue Res;
42175 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42176 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42177 if (N.getNumOperands() == 3) {
42178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42179 } else {
42180 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42181 }
42182 EVT OpVT = N0.getValueType();
42183 return DAG.getBitcast(
42184 ShuffleVT,
42185 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42186 }
42187 // TODO: We can generalize this for other shuffles/conversions.
42188 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42189 N1.getOpcode() == SrcOpcode &&
42190 N0.getValueType() == N1.getValueType() &&
42191 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42192 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42193 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42194 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42195 EVT OpSrcVT = N0.getOperand(0).getValueType();
42196 EVT OpDstVT = N0.getValueType();
42197 SDValue Res =
42198 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42199 return DAG.getBitcast(ShuffleVT,
42200 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42201 }
42202 }
42203 break;
42204 }
42205 }
42206 return SDValue();
42207}
42208
42209/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
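/// Since both sources must be the same repeated per-lane op (with matching
/// second operands where required), shuffling the lanes first leaves a single
/// copy of that op to fold or combine further.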
42210 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42211 SelectionDAG &DAG,
42212 const SDLoc &DL) {
42213 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42214
42215 MVT VT = V.getSimpleValueType();
42216 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42217 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42218 unsigned SrcOpc0 = Src0.getOpcode();
42219 unsigned SrcOpc1 = Src1.getOpcode();
42220 EVT SrcVT0 = Src0.getValueType();
42221 EVT SrcVT1 = Src1.getValueType();
42222
42223 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42224 return SDValue();
42225
42226 switch (SrcOpc0) {
42227 case X86ISD::MOVDDUP: {
42228 SDValue LHS = Src0.getOperand(0);
42229 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42230 SDValue Res =
42231 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42232 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42233 return DAG.getBitcast(VT, Res);
42234 }
42235 case X86ISD::VPERMILPI:
42236 // TODO: Handle v4f64 permutes with different low/high lane masks.
42237 if (SrcVT0 == MVT::v4f64) {
42238 uint64_t Mask = Src0.getConstantOperandVal(1);
42239 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42240 break;
42241 }
42242 [[fallthrough]];
42243 case X86ISD::VSHLI:
42244 case X86ISD::VSRLI:
42245 case X86ISD::VSRAI:
42246 case X86ISD::PSHUFD:
42247 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42248 SDValue LHS = Src0.getOperand(0);
42249 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42250 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42251 V.getOperand(2));
42252 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42253 return DAG.getBitcast(VT, Res);
42254 }
42255 break;
42256 }
42257
42258 return SDValue();
42259}
42260
42261/// Try to combine x86 target specific shuffles.
42262 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42263 SelectionDAG &DAG,
42264 TargetLowering::DAGCombinerInfo &DCI,
42265 const X86Subtarget &Subtarget) {
42266 using namespace SDPatternMatch;
42267
42268 MVT VT = N.getSimpleValueType();
42269 unsigned NumElts = VT.getVectorNumElements();
42270 SmallVector<int, 4> Mask;
42271 unsigned Opcode = N.getOpcode();
42272 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42273
42274 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42275 return R;
42276
42277 // Handle specific target shuffles.
42278 switch (Opcode) {
42279 case X86ISD::MOVDDUP: {
42280 SDValue Src = N.getOperand(0);
42281 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42282 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42283 ISD::isNormalLoad(Src.getNode())) {
42284 LoadSDNode *LN = cast<LoadSDNode>(Src);
42285 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42286 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42287 DCI.CombineTo(N.getNode(), Movddup);
42288 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42290 return N; // Return N so it doesn't get rechecked!
42291 }
42292 }
42293
42294 return SDValue();
42295 }
42296 case X86ISD::VBROADCAST: {
42297 SDValue Src = N.getOperand(0);
42298 SDValue BC = peekThroughBitcasts(Src);
42299 EVT SrcVT = Src.getValueType();
42300 EVT BCVT = BC.getValueType();
42301
42302 // If broadcasting from another shuffle, attempt to simplify it.
42303 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42304 if (isTargetShuffle(BC.getOpcode()) &&
42305 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42306 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42307 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42308 SM_SentinelUndef);
42309 for (unsigned i = 0; i != Scale; ++i)
42310 DemandedMask[i] = i;
42311 if (SDValue Res = combineX86ShufflesRecursively(
42312 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42313 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42314 /*AllowVariableCrossLaneMask=*/true,
42315 /*AllowVariablePerLaneMask=*/true,
42316 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42317 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42318 DAG.getBitcast(SrcVT, Res));
42319 }
42320
42321 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42322 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42323 if (Src.getOpcode() == ISD::BITCAST &&
42324 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42325 TLI.isTypeLegal(BCVT) &&
42326 FixedVectorType::isValidElementType(
42327 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42328 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42329 VT.getVectorNumElements());
42330 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42331 }
42332
42333 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42334 // If we're re-broadcasting a smaller type then broadcast with that type and
42335 // bitcast.
42336 // TODO: Do this for any splat?
42337 if (Src.getOpcode() == ISD::BITCAST &&
42338 (BC.getOpcode() == X86ISD::VBROADCAST ||
42339 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42340 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42341 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42342 MVT NewVT =
42343 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42344 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42345 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42346 }
42347
42348 // Reduce broadcast source vector to lowest 128-bits.
42349 if (SrcVT.getSizeInBits() > 128)
42350 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42351 extract128BitVector(Src, 0, DAG, DL));
42352
42353 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42354 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42355 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42356 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42357
42358 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42359 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42360 isNullConstant(Src.getOperand(1)) &&
42361 Src.getValueType() ==
42362 Src.getOperand(0).getValueType().getScalarType() &&
42363 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42364 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42365
42366 // Share broadcast with the longest vector and extract low subvector (free).
42367 // Ensure the same SDValue from the SDNode use is being used.
42368 for (SDNode *User : Src->users())
42369 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42370 Src == User->getOperand(0) &&
42371 User->getValueSizeInBits(0).getFixedValue() >
42372 VT.getFixedSizeInBits()) {
42373 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42374 VT.getSizeInBits());
42375 }
42376
42377 // vbroadcast(scalarload X) -> vbroadcast_load X
42378 // For float loads, extract other uses of the scalar from the broadcast.
42379 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42380 ISD::isNormalLoad(Src.getNode())) {
42381 LoadSDNode *LN = cast<LoadSDNode>(Src);
42382 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42383 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42384 SDValue BcastLd =
42385 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42386 LN->getMemoryVT(), LN->getMemOperand());
42387 // If the load value is used only by N, replace it via CombineTo N.
42388 bool NoReplaceExtract = Src.hasOneUse();
42389 DCI.CombineTo(N.getNode(), BcastLd);
42390 if (NoReplaceExtract) {
42391 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42393 } else {
42394 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42395 DAG.getVectorIdxConstant(0, DL));
42396 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42397 }
42398 return N; // Return N so it doesn't get rechecked!
42399 }
42400
42401 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42402 // i16. So shrink it ourselves if we can make a broadcast_load.
42403 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42404 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42405 assert(Subtarget.hasAVX2() && "Expected AVX2");
42406 SDValue TruncIn = Src.getOperand(0);
42407
42408 // If this is a truncate of a non extending load we can just narrow it to
42409 // use a broadcast_load.
42410 if (ISD::isNormalLoad(TruncIn.getNode())) {
42411 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42412 // Unless it's volatile or atomic.
42413 if (LN->isSimple()) {
42414 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42415 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42416 SDValue BcastLd = DAG.getMemIntrinsicNode(
42417 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42418 LN->getPointerInfo(), LN->getBaseAlign(),
42419 LN->getMemOperand()->getFlags());
42420 DCI.CombineTo(N.getNode(), BcastLd);
42421 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42422 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42423 return N; // Return N so it doesn't get rechecked!
42424 }
42425 }
42426
42427 // If this is a truncate of an i16 extload, we can directly replace it.
42428 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42429 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42430 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42431 if (LN->getMemoryVT().getSizeInBits() == 16) {
42432 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42433 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42434 SDValue BcastLd =
42435 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42436 LN->getMemoryVT(), LN->getMemOperand());
42437 DCI.CombineTo(N.getNode(), BcastLd);
42438 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42439 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42440 return N; // Return N so it doesn't get rechecked!
42441 }
42442 }
42443
42444 // If this is a truncate of a load that has been shifted right, we can
42445 // offset the pointer and use a narrower load.
42446 if (TruncIn.getOpcode() == ISD::SRL &&
42447 TruncIn.getOperand(0).hasOneUse() &&
42448 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42449 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42450 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42451 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42452 // Make sure the shift amount and the load size are divisible by 16.
42453 // Don't do this if the load is volatile or atomic.
42454 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42455 LN->isSimple()) {
42456 unsigned Offset = ShiftAmt / 8;
42457 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42460 SDValue Ops[] = { LN->getChain(), Ptr };
42461 SDValue BcastLd = DAG.getMemIntrinsicNode(
42462 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42464 LN->getMemOperand()->getFlags());
42465 DCI.CombineTo(N.getNode(), BcastLd);
42466 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42467 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42468 return N; // Return N so it doesn't get rechecked!
42469 }
42470 }
42471 }
42472
42473 // vbroadcast(vzload X) -> vbroadcast_load X
42474 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42475 auto *LN = cast<MemSDNode>(Src);
42476 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42477 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42478 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42479 SDValue BcastLd =
42480 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42481 LN->getMemoryVT(), LN->getMemOperand());
42482 DCI.CombineTo(N.getNode(), BcastLd);
42483 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42485 return N; // Return N so it doesn't get rechecked!
42486 }
42487 }
42488
42489 // vbroadcast(vector load X) -> vbroadcast_load
42490 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42491 LoadSDNode *LN = cast<LoadSDNode>(Src);
42492 // Unless the load is volatile or atomic.
42493 if (LN->isSimple()) {
42494 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42495 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42496 SDValue BcastLd = DAG.getMemIntrinsicNode(
42498 LN->getPointerInfo(), LN->getBaseAlign(),
42499 LN->getMemOperand()->getFlags());
42500 DCI.CombineTo(N.getNode(), BcastLd);
42501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42503 return N; // Return N so it doesn't get rechecked!
42504 }
42505 }
42506
42507 return SDValue();
42508 }
42509 case X86ISD::VZEXT_MOVL: {
42510 SDValue N0 = N.getOperand(0);
42511
42512 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42513 // Zeroing out the upper elements means we're just shifting a zero value.
42514 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42515 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42516 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42517 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42518 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42519 if (N0.hasOneUse())
42520 return DAG.getNode(
42521 N0.getOpcode(), DL, VT,
42522 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42523 N0.getOperand(1));
42524 }
42525
42526 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42527 // the load is volatile.
42528 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42529 auto *LN = cast<LoadSDNode>(N0);
42530 if (SDValue VZLoad =
42531 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42532 DCI.CombineTo(N.getNode(), VZLoad);
42533 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42535 return N;
42536 }
42537 }
42538
42539 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42540 // and can just use a VZEXT_LOAD.
42541 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42542 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42543 auto *LN = cast<MemSDNode>(N0);
42544 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42545 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42546 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42547 SDValue VZLoad =
42548 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42549 LN->getMemoryVT(), LN->getMemOperand());
42550 DCI.CombineTo(N.getNode(), VZLoad);
42551 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42553 return N;
42554 }
42555 }
42556
42557 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42558 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42559 // if the upper bits of the i64 are zero.
42560 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42561 N0.getOperand(0).hasOneUse() &&
42562 N0.getOperand(0).getValueType() == MVT::i64) {
42563 SDValue In = N0.getOperand(0);
42564 APInt Mask = APInt::getHighBitsSet(64, 32);
42565 if (DAG.MaskedValueIsZero(In, Mask)) {
42566 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42567 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42568 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42569 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42570 return DAG.getBitcast(VT, Movl);
42571 }
42572 }
42573
42574 // Load a scalar integer constant directly to XMM instead of transferring an
42575 // immediate value from GPR.
42576 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42577 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42578 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42579 // Create a vector constant - scalar constant followed by zeros.
42580 EVT ScalarVT = N0.getOperand(0).getValueType();
42581 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42582 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42583 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42584 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42585
42586 // Load the vector constant from constant pool.
42587 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42588 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42589 MachinePointerInfo MPI =
42590 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42591 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42592 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42593 MachineMemOperand::MOLoad);
42594 }
42595 }
42596
42597 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42598 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42599 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42600 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42601 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42602 SDValue V = peekThroughOneUseBitcasts(N0);
42603
42604 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42605 isNullConstant(V.getOperand(2))) {
42606 SDValue In = V.getOperand(1);
42607 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42608 In.getValueSizeInBits() /
42609 VT.getScalarSizeInBits());
42610 In = DAG.getBitcast(SubVT, In);
42611 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42612 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42613 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42614 V.getOperand(2));
42615 }
42616 }
42617
42618 return SDValue();
42619 }
42620 case X86ISD::BLENDI: {
42621 SDValue N0 = N.getOperand(0);
42622 SDValue N1 = N.getOperand(1);
42623 unsigned EltBits = VT.getScalarSizeInBits();
42624
42625 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42626 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42627 // TODO: Handle MVT::v16i16 repeated blend mask.
42628 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42629 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42630 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42631 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42632 unsigned NewSize = SrcVT.getVectorNumElements();
42633 APInt BlendMask = getBLENDIBlendMask(N);
42634 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42635 return DAG.getBitcast(
42636 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42637 N1.getOperand(0),
42638 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42639 DL, MVT::i8)));
42640 }
42641 }
42642 // Share PSHUFB masks:
42643 // blend(pshufb(x,m1),pshufb(y,m2))
42644 // --> m3 = blend(m1,m2)
42645 // blend(pshufb(x,m3),pshufb(y,m3))
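// Blending the two byte masks into one shared constant means both PSHUFBs can
// reuse a single mask value instead of materializing two.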
42646 if (N0.hasOneUse() && N1.hasOneUse()) {
42647 SmallVector<int> Mask, ByteMask;
42648 SmallVector<SDValue> Ops;
42649 SDValue LHS = peekThroughOneUseBitcasts(N0);
42650 SDValue RHS = peekThroughOneUseBitcasts(N1);
42651 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42652 RHS.getOpcode() == X86ISD::PSHUFB &&
42653 LHS.getOperand(1) != RHS.getOperand(1) &&
42654 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42655 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42656 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42657 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42658 "BLENDI decode mismatch");
42659 MVT ShufVT = LHS.getSimpleValueType();
42660 SDValue MaskLHS = LHS.getOperand(1);
42661 SDValue MaskRHS = RHS.getOperand(1);
42662 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42663 if (SDValue NewMask = combineX86ShufflesConstants(
42664 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42665 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42666 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42667 LHS.getOperand(0), NewMask);
42668 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42669 RHS.getOperand(0), NewMask);
42670 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42671 DAG.getBitcast(VT, NewLHS),
42672 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42673 }
42674 }
42675 }
42676 }
42677 return SDValue();
42678 }
42679 case X86ISD::SHUFP: {
42680 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42681 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42682 // TODO: Support types other than v4f32.
42683 if (VT == MVT::v4f32) {
42684 bool Updated = false;
42685 SmallVector<int> Mask;
42686 SmallVector<SDValue> Ops;
42687 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42688 for (int i = 0; i != 2; ++i) {
42689 SmallVector<SDValue> SubOps;
42690 SmallVector<int> SubMask, SubScaledMask;
42691 SDValue Sub = peekThroughBitcasts(Ops[i]);
42692 // TODO: Scaling might be easier if we specify the demanded elts.
42693 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42694 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42695 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42696 int Ofs = i * 2;
42697 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42698 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42699 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42700 Updated = true;
42701 }
42702 }
42703 }
42704 if (Updated) {
42705 for (int &M : Mask)
42706 M %= 4;
42707 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42708 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42709 }
42710 }
42711 return SDValue();
42712 }
42713 case X86ISD::VPERMI: {
42714 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42715 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42716 SDValue N0 = N.getOperand(0);
42717 SDValue N1 = N.getOperand(1);
42718 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42719 if (N0.getOpcode() == ISD::BITCAST &&
42720 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42721 SDValue Src = N0.getOperand(0);
42722 EVT SrcVT = Src.getValueType();
42723 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42724 return DAG.getBitcast(VT, Res);
42725 }
42726 return SDValue();
42727 }
42728 case X86ISD::SHUF128: {
42729 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42730 // see if we can peek through and access the subvector directly.
42731 if (VT.is512BitVector()) {
42732 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42733 // the upper subvector is used.
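// (0x0A covers the msbs of the two indices that read the LHS, 0xA0 the msbs
// of the two that read the RHS.)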
42734 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42735 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42736 uint64_t Mask = N->getConstantOperandVal(2);
42737 SmallVector<SDValue> LHSOps, RHSOps;
42738 SDValue NewLHS, NewRHS;
42739 if ((Mask & 0x0A) == 0x0A &&
42740 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42741 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42742 Mask &= ~0x0A;
42743 }
42744 if ((Mask & 0xA0) == 0xA0 &&
42745 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42746 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42747 Mask &= ~0xA0;
42748 }
42749 if (NewLHS || NewRHS)
42750 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42751 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42752 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42753 DAG.getTargetConstant(Mask, DL, MVT::i8));
42754 }
42755 return SDValue();
42756 }
42757 case X86ISD::VPERM2X128: {
42758 SDValue LHS = N->getOperand(0);
42759 SDValue RHS = N->getOperand(1);
42760 unsigned Imm = N.getConstantOperandVal(2) & 255;
42761
42762 // Canonicalize unary/repeated operands to LHS.
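// Bit 1 of each 4-bit VPERM2X128 selector picks the source operand, so a
// commute pairs with Imm ^ 0x22, and Imm & ~0x22 points every selection at
// the single repeated source.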
42763 if (LHS.isUndef() && !RHS.isUndef())
42764 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42765 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42766 if (LHS == RHS)
42767 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42768 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42769
42770 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42771 if (LHS.getOpcode() == ISD::BITCAST &&
42772 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42773 EVT SrcVT = LHS.getOperand(0).getValueType();
42774 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42775 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42776 DAG.getBitcast(SrcVT, LHS),
42777 DAG.getBitcast(SrcVT, RHS),
42778 N->getOperand(2)));
42779 }
42780 }
42781
42782 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42783 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42784 return Res;
42785
42786 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42787 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42788 auto FindSubVector128 = [&](unsigned Idx) {
42789 if (Idx > 3)
42790 return SDValue();
42791 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42792 SmallVector<SDValue> SubOps;
42793 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42794 return SubOps[Idx & 1];
42795 unsigned NumElts = Src.getValueType().getVectorNumElements();
42796 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42797 Src.getOperand(1).getValueSizeInBits() == 128 &&
42798 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42799 return Src.getOperand(1);
42800 }
42801 return SDValue();
42802 };
42803 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42804 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42805 MVT SubVT = VT.getHalfNumVectorElementsVT();
42806 SubLo = DAG.getBitcast(SubVT, SubLo);
42807 SubHi = DAG.getBitcast(SubVT, SubHi);
42808 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42809 }
42810 }
42811
42812 // Attempt to match VBROADCAST*128 subvector broadcast load.
42813 if (RHS.isUndef()) {
42815 DecodeVPERM2X128Mask(4, Imm, Mask);
42816 if (isUndefOrInRange(Mask, 0, 4)) {
42817 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42818 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42819 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42820 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42821 MVT MemVT = VT.getHalfNumVectorElementsVT();
42822 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42823 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42824 cast<LoadSDNode>(LHS), Ofs, DAG);
42825 }
42826 }
42827 }
42828
42829 return SDValue();
42830 }
42831 case X86ISD::PSHUFD:
42832 case X86ISD::PSHUFLW:
42833 case X86ISD::PSHUFHW: {
42834 SDValue N0 = N.getOperand(0);
42835 SDValue N1 = N.getOperand(1);
42836 if (N0->hasOneUse()) {
42837 SDValue V = peekThroughOneUseBitcasts(N0);
42838 switch (V.getOpcode()) {
42839 case X86ISD::VSHL:
42840 case X86ISD::VSRL:
42841 case X86ISD::VSRA:
42842 case X86ISD::VSHLI:
42843 case X86ISD::VSRLI:
42844 case X86ISD::VSRAI:
42845 case X86ISD::VROTLI:
42846 case X86ISD::VROTRI: {
42847 MVT InnerVT = V.getSimpleValueType();
42848 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42849 SDValue Res = DAG.getNode(Opcode, DL, VT,
42850 DAG.getBitcast(VT, V.getOperand(0)), N1);
42851 Res = DAG.getBitcast(InnerVT, Res);
42852 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42853 return DAG.getBitcast(VT, Res);
42854 }
42855 break;
42856 }
42857 }
42858 }
42859
42860 Mask = getPSHUFShuffleMask(N);
42861 assert(Mask.size() == 4);
42862 break;
42863 }
42864 case X86ISD::MOVSD:
42865 case X86ISD::MOVSH:
42866 case X86ISD::MOVSS: {
42867 SDValue N0 = N.getOperand(0);
42868 SDValue N1 = N.getOperand(1);
42869
42870 // Canonicalize scalar FPOps:
42871 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42872 // If commutable, allow OP(N1[0], N0[0]).
42873 unsigned Opcode1 = N1.getOpcode();
42874 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42875 Opcode1 == ISD::FDIV) {
42876 SDValue N10 = N1.getOperand(0);
42877 SDValue N11 = N1.getOperand(1);
42878 if (N10 == N0 ||
42879 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42880 if (N10 != N0)
42881 std::swap(N10, N11);
42882 MVT SVT = VT.getVectorElementType();
42883 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42884 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42885 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42886 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42887 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42888 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42889 }
42890 }
42891
42892 return SDValue();
42893 }
42894 case X86ISD::INSERTPS: {
42895 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42896 SDValue Op0 = N.getOperand(0);
42897 SDValue Op1 = N.getOperand(1);
42898 unsigned InsertPSMask = N.getConstantOperandVal(2);
42899 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42900 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42901 unsigned ZeroMask = InsertPSMask & 0xF;
42902
42903 // If we zero out all elements from Op0 then we don't need to reference it.
42904 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42905 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42906 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42907
42908 // If we zero out the element from Op1 then we don't need to reference it.
42909 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42910 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42911 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42912
42913 // Attempt to merge insertps Op1 with an inner target shuffle node.
42914 SmallVector<int, 8> TargetMask1;
42915 SmallVector<SDValue, 2> Ops1;
42916 APInt KnownUndef1, KnownZero1;
42917 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42918 KnownZero1)) {
42919 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42920 // Zero/UNDEF insertion - zero out element and remove dependency.
42921 InsertPSMask |= (1u << DstIdx);
42922 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42923 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42924 }
42925 // Update insertps mask srcidx and reference the source input directly.
42926 int M = TargetMask1[SrcIdx];
42927 assert(0 <= M && M < 8 && "Shuffle index out of range");
42928 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42929 Op1 = Ops1[M < 4 ? 0 : 1];
42930 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42931 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42932 }
42933
42934 // Attempt to merge insertps Op0 with an inner target shuffle node.
42935 SmallVector<int, 8> TargetMask0;
42936 SmallVector<SDValue, 2> Ops0;
42937 APInt KnownUndef0, KnownZero0;
42938 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42939 KnownZero0)) {
42940 bool Updated = false;
42941 bool UseInput00 = false;
42942 bool UseInput01 = false;
42943 for (int i = 0; i != 4; ++i) {
42944 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42945 // No change if element is already zero or the inserted element.
42946 continue;
42947 }
42948
42949 if (KnownUndef0[i] || KnownZero0[i]) {
42950 // If the target mask is undef/zero then we must zero the element.
42951 InsertPSMask |= (1u << i);
42952 Updated = true;
42953 continue;
42954 }
42955
42956 // The input vector element must be inline.
42957 int M = TargetMask0[i];
42958 if (M != i && M != (i + 4))
42959 return SDValue();
42960
42961 // Determine which inputs of the target shuffle we're using.
42962 UseInput00 |= (0 <= M && M < 4);
42963 UseInput01 |= (4 <= M);
42964 }
42965
42966 // If we're not using both inputs of the target shuffle then use the
42967 // referenced input directly.
42968 if (UseInput00 && !UseInput01) {
42969 Updated = true;
42970 Op0 = Ops0[0];
42971 } else if (!UseInput00 && UseInput01) {
42972 Updated = true;
42973 Op0 = Ops0[1];
42974 }
42975
42976 if (Updated)
42977 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42978 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42979 }
42980
42981 // If we're inserting an element from a vbroadcast load, fold the
42982 // load into the X86insertps instruction. We need to convert the scalar
42983 // load to a vector and clear the source lane of the INSERTPS control.
42984 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42985 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42986 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42987 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42988 MemIntr->getBasePtr(),
42989 MemIntr->getMemOperand());
42990 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42991 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42992 Load),
42993 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42994 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42995 return Insert;
42996 }
42997 }
42998
42999 return SDValue();
43000 }
43001 case X86ISD::VPERMV: {
43002 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43004 SmallVector<SDValue, 2> SrcOps, SubOps;
43005 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43006 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43007 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43008 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43009 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43010 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43011 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43012 "Unexpected split ops");
43013 // Bail if we were permuting a widened vector.
43014 if (SubOps[1].isUndef() &&
43015 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43016 return SDValue();
43017 // Bail if any subops would have folded into the concat.
43018 if (any_of(SubOps, isShuffleFoldableLoad))
43019 return SDValue();
43020 // Concat 4x128 back to 2x256.
43021 if (SubOps.size() == 4) {
43022 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43023 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43024 }
43025 // Convert mask to 2 operand shuffle.
43026 int HalfElts = NumElts / 2;
43027 for (int &M : Mask)
43028 M += M >= HalfElts ? HalfElts : 0;
43029 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43030 VT.getSizeInBits());
43031 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43032 VT.getSizeInBits());
43033 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43034 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43035 }
43036 return SDValue();
43037 }
43038 case X86ISD::VPERMV3: {
43039 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43040 bool CanConcat = VT.is128BitVector() ||
43041 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43042 SmallVector<SDValue, 2> SrcOps;
43043 SmallVector<int, 64> Mask;
43044 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43045 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43046 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43047 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43048 // Canonicalize to VPERMV if both sources are the same.
43049 if (V1 == V2) {
43050 for (int &M : Mask)
43051 M = (M < 0 ? M : (M & (NumElts - 1)));
43052 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43053 DAG.getUNDEF(VT), Subtarget, DAG);
43054 }
43055 // If sources are half width, then concat and use VPERMV with adjusted
43056 // mask.
43057 SDValue Ops[2];
43058 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43059 if (sd_match(V1,
43061 sd_match(V2,
43063 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43064 if (SDValue ConcatSrc =
43065 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43066 for (int &M : Mask)
43067 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43068 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43069 DAG.getUNDEF(VT), Subtarget, DAG);
43070 }
43071 }
43072 // Commute foldable source to the RHS.
43073 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43074 !isShuffleFoldableLoad(N.getOperand(2))) {
43075 ShuffleVectorSDNode::commuteMask(Mask);
43076 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43077 N.getOperand(0), Subtarget, DAG);
43078 }
43079 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43080 // freely concatenated, with a commuted shuffle mask.
43081 if (CanConcat) {
43082 if (SDValue ConcatSrc = combineConcatVectorOps(
43083 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43084 Subtarget)) {
43085 ShuffleVectorSDNode::commuteMask(Mask);
43086 Mask.append(NumElts, SM_SentinelUndef);
43087 SDValue Perm =
43088 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43089 DAG.getUNDEF(WideVT), Subtarget, DAG);
43090 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43091 DAG.getVectorIdxConstant(0, DL));
43092 }
43093 }
43094 }
43095 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43096 // freely concatenated.
43097 if (CanConcat) {
43098 if (SDValue ConcatSrc = combineConcatVectorOps(
43099 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43100 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43101 DL, WideVT.getSizeInBits());
43102 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43103 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43104 DAG.getVectorIdxConstant(0, DL));
43105 }
43106 }
43107 return SDValue();
43108 }
43109 default:
43110 return SDValue();
43111 }
43112
43113 // Nuke no-op shuffles that show up after combining.
43114 if (isNoopShuffleMask(Mask))
43115 return N.getOperand(0);
43116
43117 // Look for simplifications involving one or two shuffle instructions.
43118 SDValue V = N.getOperand(0);
43119 switch (N.getOpcode()) {
43120 default:
43121 break;
43122 case X86ISD::PSHUFLW:
43123 case X86ISD::PSHUFHW:
43124 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43125
43126 // See if this reduces to a PSHUFD which is no more expensive and can
43127 // combine with more operations. Note that it has to at least flip the
43128 // dwords as otherwise it would have been removed as a no-op.
43129 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43130 int DMask[] = {0, 1, 2, 3};
43131 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43132 DMask[DOffset + 0] = DOffset + 1;
43133 DMask[DOffset + 1] = DOffset + 0;
43134 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43135 V = DAG.getBitcast(DVT, V);
43136 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43137 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43138 return DAG.getBitcast(VT, V);
43139 }
43140
43141 // Look for shuffle patterns which can be implemented as a single unpack.
43142 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43143 // only works when we have a PSHUFD followed by two half-shuffles.
43144 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43145 (V.getOpcode() == X86ISD::PSHUFLW ||
43146 V.getOpcode() == X86ISD::PSHUFHW) &&
43147 V.getOpcode() != N.getOpcode() &&
43148 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43149 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43150 if (D.getOpcode() == X86ISD::PSHUFD) {
43151 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43152 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43153 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43154 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43155 int WordMask[8];
43156 for (int i = 0; i < 4; ++i) {
43157 WordMask[i + NOffset] = Mask[i] + NOffset;
43158 WordMask[i + VOffset] = VMask[i] + VOffset;
43159 }
43160 // Map the word mask through the DWord mask.
43161 int MappedMask[8];
43162 for (int i = 0; i < 8; ++i)
43163 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43164 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43165 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43166 // We can replace all three shuffles with an unpack.
43167 V = DAG.getBitcast(VT, D.getOperand(0));
43168 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43169 : X86ISD::UNPCKH,
43170 DL, VT, V, V);
43171 }
43172 }
43173 }
43174
43175 break;
43176
43177 case X86ISD::PSHUFD:
43178 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43179 return NewN;
43180
43181 break;
43182 }
43183
43184 return SDValue();
43185}
43186
43187/// Checks if the shuffle mask takes subsequent elements
43188/// alternately from two vectors.
43189/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43190static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43191
43192 int ParitySrc[2] = {-1, -1};
43193 unsigned Size = Mask.size();
43194 for (unsigned i = 0; i != Size; ++i) {
43195 int M = Mask[i];
43196 if (M < 0)
43197 continue;
43198
43199 // Make sure we are using the matching element from the input.
43200 if ((M % Size) != i)
43201 return false;
43202
43203 // Make sure we use the same input for all elements of the same parity.
43204 int Src = M / Size;
43205 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43206 return false;
43207 ParitySrc[i % 2] = Src;
43208 }
43209
43210 // Make sure each input is used.
43211 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43212 return false;
43213
43214 Op0Even = ParitySrc[0] == 0;
43215 return true;
43216}
43217
43218/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43219 /// operation. If true is returned, the operands of the ADDSUB(SUBADD) operation
43220/// are written to the parameters \p Opnd0 and \p Opnd1.
43221///
43222 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43223/// so it is easier to generically match. We also insert dummy vector shuffle
43224/// nodes for the operands which explicitly discard the lanes which are unused
43225/// by this operation to try to flow through the rest of the combiner
43226/// the fact that they're unused.
43227static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43228 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43229 bool &IsSubAdd, bool &HasAllowContract) {
43230
43231 EVT VT = N->getValueType(0);
43232 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43233 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43235 return false;
43236
43237 // We only handle target-independent shuffles.
43238 // FIXME: It would be easy and harmless to use the target shuffle mask
43239 // extraction tool to support more.
43240 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43241 return false;
43242
43243 SDValue V1 = N->getOperand(0);
43244 SDValue V2 = N->getOperand(1);
43245
43246 // Make sure we have an FADD and an FSUB.
43247 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43248 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43249 V1.getOpcode() == V2.getOpcode())
43250 return false;
43251
43252 // If there are other uses of these operations we can't fold them.
43253 if (!V1->hasOneUse() || !V2->hasOneUse())
43254 return false;
43255
43256 // Ensure that both operations have the same operands. Note that we can
43257 // commute the FADD operands.
43258 SDValue LHS, RHS;
43259 if (V1.getOpcode() == ISD::FSUB) {
43260 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43261 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43262 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43263 return false;
43264 } else {
43265 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43266 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43267 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43268 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43269 return false;
43270 }
43271
43272 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43273 bool Op0Even;
43274 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43275 return false;
43276
43277 // It's a subadd if the vector in the even parity is an FADD.
43278 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43279 : V2->getOpcode() == ISD::FADD;
43280 HasAllowContract =
43282
43283 Opnd0 = LHS;
43284 Opnd1 = RHS;
43285 return true;
43286}
43287
43288/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43289 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43290 const X86Subtarget &Subtarget,
43291 SelectionDAG &DAG) {
43292 // We only handle target-independent shuffles.
43293 // FIXME: It would be easy and harmless to use the target shuffle mask
43294 // extraction tool to support more.
43295 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43296 return SDValue();
43297
43298 MVT VT = N->getSimpleValueType(0);
43299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43300 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43301 return SDValue();
43302
43303  // We're trying to match shuffle(fma(a, b, c), X86ISD::FMSUB(a, b, c)).
43304 SDValue Op0 = N->getOperand(0);
43305 SDValue Op1 = N->getOperand(1);
43306 SDValue FMAdd = Op0, FMSub = Op1;
43307 if (FMSub.getOpcode() != X86ISD::FMSUB)
43308 std::swap(FMAdd, FMSub);
43309
43310 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43311 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43312 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43313 FMAdd.getOperand(2) != FMSub.getOperand(2))
43314 return SDValue();
43315
43316 // Check for correct shuffle mask.
43317 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43318 bool Op0Even;
43319 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43320 return SDValue();
43321
43322 // FMAddSub takes zeroth operand from FMSub node.
43323 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43324 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43325 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43326 FMAdd.getOperand(2));
43327}
43328
43329/// Try to combine a shuffle into a target-specific add-sub or
43330/// mul-add-sub node.
43331static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43332                                                const X86Subtarget &Subtarget,
43333 SelectionDAG &DAG) {
43334 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43335 return V;
43336
43337 SDValue Opnd0, Opnd1;
43338 bool IsSubAdd;
43339 bool HasAllowContract;
43340 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43341 HasAllowContract))
43342 return SDValue();
43343
43344 MVT VT = N->getSimpleValueType(0);
43345
43346 // Try to generate X86ISD::FMADDSUB node here.
43347 SDValue Opnd2;
43348 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43349 HasAllowContract)) {
43350 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43351 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43352 }
43353
43354 if (IsSubAdd)
43355 return SDValue();
43356
43357 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43358 // the ADDSUB idiom has been successfully recognized. There are no known
43359 // X86 targets with 512-bit ADDSUB instructions!
43360 if (VT.is512BitVector())
43361 return SDValue();
43362
43363 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43364 // the ADDSUB idiom has been successfully recognized. There are no known
43365 // X86 targets with FP16 ADDSUB instructions!
43366 if (VT.getVectorElementType() == MVT::f16)
43367 return SDValue();
43368
43369 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43370}
43371
43372/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43373/// low half of each source vector and does not set any high half elements in
43374/// the destination vector, narrow the shuffle to half its original size.
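/// e.g. a v8f32 shuffle with mask <0, 8, 1, 9, u, u, u, u> only reads the low
/// xmm half of each source and leaves the whole upper half undefined, so it
/// can be done as a v4f32 shuffle of the low halves plus a free subvector
/// insert.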
43375static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43376  EVT VT = Shuf->getValueType(0);
43377 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43378 return SDValue();
43379 if (!VT.is256BitVector() && !VT.is512BitVector())
43380 return SDValue();
43381
43382 // See if we can ignore all of the high elements of the shuffle.
43383 ArrayRef<int> Mask = Shuf->getMask();
43384 if (!isUndefUpperHalf(Mask))
43385 return SDValue();
43386
43387 // Check if the shuffle mask accesses only the low half of each input vector
43388 // (half-index output is 0 or 2).
43389 int HalfIdx1, HalfIdx2;
43390 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43391 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43392 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43393 return SDValue();
43394
43395 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43396 // The trick is knowing that all of the insert/extract are actually free
43397 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43398 // of narrow inputs into a narrow output, and that is always cheaper than
43399 // the wide shuffle that we started with.
43400 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43401 Shuf->getOperand(1), HalfMask, HalfIdx1,
43402 HalfIdx2, false, DAG, /*UseConcat*/ true);
43403}
43404
43405static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43406                              TargetLowering::DAGCombinerInfo &DCI,
43407                              const X86Subtarget &Subtarget) {
43408 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43409 if (SDValue V = narrowShuffle(Shuf, DAG))
43410 return V;
43411
43412 // If we have legalized the vector types, look for blends of FADD and FSUB
43413 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43414 SDLoc dl(N);
43415 EVT VT = N->getValueType(0);
43416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43417 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43418 if (SDValue AddSub =
43419 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43420 return AddSub;
43421
43422  // Attempt to combine into a vector load/broadcast.
43423  if (SDValue LD = combineToConsecutiveLoads(
43424          VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43425 return LD;
43426
43427 if (isTargetShuffle(N->getOpcode())) {
43428 SDValue Op(N, 0);
43429 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43430 return Shuffle;
43431
43432 // Try recursively combining arbitrary sequences of x86 shuffle
43433 // instructions into higher-order shuffles. We do this after combining
43434 // specific PSHUF instruction sequences into their minimal form so that we
43435 // can evaluate how many specialized shuffle instructions are involved in
43436 // a particular chain.
43437 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43438 return Res;
43439
43440 // Simplify source operands based on shuffle mask.
43441 // TODO - merge this into combineX86ShufflesRecursively.
43442 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43443 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43444 return SDValue(N, 0);
43445
43446 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43447 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43448 // Perform this after other shuffle combines to allow inner shuffles to be
43449 // combined away first.
43450 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43451 return BinOp;
43452 }
43453
43454 return SDValue();
43455}
43456
43457// Simplify variable target shuffle masks based on the demanded elements.
43458// TODO: Handle DemandedBits in mask indices as well?
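// For example, if only the low half of a PSHUFB result is demanded and the
// control mask is a constant pool load, the constants for the undemanded
// lanes can be rewritten to undef, which may let the mask load simplify.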
43459bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43460    SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43461 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43462 // If we're demanding all elements don't bother trying to simplify the mask.
43463 unsigned NumElts = DemandedElts.getBitWidth();
43464 if (DemandedElts.isAllOnes())
43465 return false;
43466
43467 SDValue Mask = Op.getOperand(MaskIndex);
43468 if (!Mask.hasOneUse())
43469 return false;
43470
43471 // Attempt to generically simplify the variable shuffle mask.
43472 APInt MaskUndef, MaskZero;
43473 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43474 Depth + 1))
43475 return true;
43476
43477 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43478  // TODO: Support other types from getTargetShuffleMaskIndices?
43479  SDValue BC = peekThroughOneUseBitcasts(Mask);
43480  EVT BCVT = BC.getValueType();
43481 auto *Load = dyn_cast<LoadSDNode>(BC);
43482 if (!Load || !Load->getBasePtr().hasOneUse())
43483 return false;
43484
43485 const Constant *C = getTargetConstantFromNode(Load);
43486 if (!C)
43487 return false;
43488
43489 Type *CTy = C->getType();
43490 if (!CTy->isVectorTy() ||
43491 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43492 return false;
43493
43494 // Handle scaling for i64 elements on 32-bit targets.
43495 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43496 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43497 return false;
43498 unsigned Scale = NumCstElts / NumElts;
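  // e.g. on a 32-bit target a v2i64 shuffle mask constant is typically stored
  // as a v4i32 constant, giving NumCstElts == 2 * NumElts and Scale == 2.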
43499
43500 // Simplify mask if we have an undemanded element that is not undef.
43501 bool Simplified = false;
43502 SmallVector<Constant *, 32> ConstVecOps;
43503 for (unsigned i = 0; i != NumCstElts; ++i) {
43504 Constant *Elt = C->getAggregateElement(i);
43505 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43506 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43507 Simplified = true;
43508 continue;
43509 }
43510 ConstVecOps.push_back(Elt);
43511 }
43512 if (!Simplified)
43513 return false;
43514
43515 // Generate new constant pool entry + legalize immediately for the load.
43516 SDLoc DL(Op);
43517 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43518 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43519 SDValue NewMask = TLO.DAG.getLoad(
43520      BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43521      MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43522      Load->getAlign());
43523 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43524}
43525
43526bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43527    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43528 TargetLoweringOpt &TLO, unsigned Depth) const {
43529 int NumElts = DemandedElts.getBitWidth();
43530 unsigned Opc = Op.getOpcode();
43531 EVT VT = Op.getValueType();
43532
43533 // Handle special case opcodes.
43534 switch (Opc) {
43535 case X86ISD::PMULDQ:
43536 case X86ISD::PMULUDQ: {
43537 APInt LHSUndef, LHSZero;
43538 APInt RHSUndef, RHSZero;
43539 SDValue LHS = Op.getOperand(0);
43540 SDValue RHS = Op.getOperand(1);
43541 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43542 Depth + 1))
43543 return true;
43544 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43545 Depth + 1))
43546 return true;
43547 // Multiply by zero.
43548 KnownZero = LHSZero | RHSZero;
43549 break;
43550 }
43551 case X86ISD::VPMADDUBSW:
43552 case X86ISD::VPMADDWD: {
43553 APInt LHSUndef, LHSZero;
43554 APInt RHSUndef, RHSZero;
43555 SDValue LHS = Op.getOperand(0);
43556 SDValue RHS = Op.getOperand(1);
43557 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43558
43559 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43560 Depth + 1))
43561 return true;
43562 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43563 Depth + 1))
43564 return true;
43565
43566 // TODO: Multiply by zero.
43567
43568 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43569 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43570 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43571 Depth + 1))
43572 return true;
43573 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43574 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43575 Depth + 1))
43576 return true;
43577 break;
43578 }
43579 case X86ISD::PSADBW: {
43580 SDValue LHS = Op.getOperand(0);
43581 SDValue RHS = Op.getOperand(1);
43582 assert(VT.getScalarType() == MVT::i64 &&
43583 LHS.getValueType() == RHS.getValueType() &&
43584 LHS.getValueType().getScalarType() == MVT::i8 &&
43585 "Unexpected PSADBW types");
43586
43587 // Aggressively peek through ops to get at the demanded elts.
43588 if (!DemandedElts.isAllOnes()) {
43589 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43590 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43591      SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43592          LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43593      SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43594          RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43595 if (NewLHS || NewRHS) {
43596 NewLHS = NewLHS ? NewLHS : LHS;
43597 NewRHS = NewRHS ? NewRHS : RHS;
43598 return TLO.CombineTo(
43599 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43600 }
43601 }
43602 break;
43603 }
43604 case X86ISD::VSHL:
43605 case X86ISD::VSRL:
43606 case X86ISD::VSRA: {
43607 // We only need the bottom 64-bits of the (128-bit) shift amount.
43608 SDValue Amt = Op.getOperand(1);
43609 MVT AmtVT = Amt.getSimpleValueType();
43610 assert(AmtVT.is128BitVector() && "Unexpected value type");
43611
43612    // If the shift amount is only ever reused as an SSE vector shift amount
43613    // then we know that only the bottom 64-bits are ever used.
43614 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43615 unsigned UseOpc = Use->getOpcode();
43616 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43617 UseOpc == X86ISD::VSRA) &&
43618 Use->getOperand(0) != Amt;
43619 });
43620
43621 APInt AmtUndef, AmtZero;
43622 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43623 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43624 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43625 Depth + 1, AssumeSingleUse))
43626 return true;
43627 [[fallthrough]];
43628 }
43629 case X86ISD::VSHLI:
43630 case X86ISD::VSRLI:
43631 case X86ISD::VSRAI: {
43632 SDValue Src = Op.getOperand(0);
43633 APInt SrcUndef;
43634 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43635 Depth + 1))
43636 return true;
43637
43638 // Fold shift(0,x) -> 0
43639 if (DemandedElts.isSubsetOf(KnownZero))
43640 return TLO.CombineTo(
43641 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43642
43643 // Aggressively peek through ops to get at the demanded elts.
43644    if (!DemandedElts.isAllOnes())
43645      if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43646              Src, DemandedElts, TLO.DAG, Depth + 1))
43647 return TLO.CombineTo(
43648 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43649 break;
43650 }
43651 case X86ISD::VPSHA:
43652 case X86ISD::VPSHL:
43653 case X86ISD::VSHLV:
43654 case X86ISD::VSRLV:
43655 case X86ISD::VSRAV: {
43656 APInt LHSUndef, LHSZero;
43657 APInt RHSUndef, RHSZero;
43658 SDValue LHS = Op.getOperand(0);
43659 SDValue RHS = Op.getOperand(1);
43660 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43661 Depth + 1))
43662 return true;
43663
43664 // Fold shift(0,x) -> 0
43665 if (DemandedElts.isSubsetOf(LHSZero))
43666 return TLO.CombineTo(
43667 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43668
43669 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43670 Depth + 1))
43671 return true;
43672
43673 KnownZero = LHSZero;
43674 break;
43675 }
43676 case X86ISD::CMPM:
43677 case X86ISD::CMPP: {
43678 // Scalarize packed fp comparison if we only require element 0.
43679 if (DemandedElts == 1) {
43680 SDLoc dl(Op);
43681 MVT VT = Op.getSimpleValueType();
43682 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43683 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43684 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43685 SDValue CC = Op.getOperand(2);
43686 if (Opc == X86ISD::CMPM) {
43687 SDValue Cmp =
43688 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43689 return TLO.CombineTo(
43690 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43691 }
43692 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43693 return TLO.CombineTo(Op,
43694 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43695 }
43696 break;
43697 }
43698 case X86ISD::PCMPEQ:
43699 case X86ISD::PCMPGT: {
43700 APInt LHSUndef, LHSZero;
43701 APInt RHSUndef, RHSZero;
43702 SDValue LHS = Op.getOperand(0);
43703 SDValue RHS = Op.getOperand(1);
43704 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43705 Depth + 1))
43706 return true;
43707 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43708 Depth + 1))
43709 return true;
43710 break;
43711 }
43712 case X86ISD::KSHIFTL: {
43713 SDValue Src = Op.getOperand(0);
43714 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43715 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43716 unsigned ShiftAmt = Amt->getZExtValue();
43717
43718 if (ShiftAmt == 0)
43719 return TLO.CombineTo(Op, Src);
43720
43721 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43722 // single shift. We can do this if the bottom bits (which are shifted
43723 // out) are never demanded.
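      // e.g. (kshiftl (kshiftr X, 4), 2) becomes (kshiftr X, 2) when the low
      // 2 elements are not demanded.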
43724 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43725 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43726 unsigned C1 = Src.getConstantOperandVal(1);
43727 unsigned NewOpc = X86ISD::KSHIFTL;
43728 int Diff = ShiftAmt - C1;
43729 if (Diff < 0) {
43730 Diff = -Diff;
43731 NewOpc = X86ISD::KSHIFTR;
43732 }
43733
43734 SDLoc dl(Op);
43735 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43736 return TLO.CombineTo(
43737 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43738 }
43739 }
43740
43741 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43742 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43743 Depth + 1))
43744 return true;
43745
43746 KnownUndef <<= ShiftAmt;
43747 KnownZero <<= ShiftAmt;
43748 KnownZero.setLowBits(ShiftAmt);
43749 break;
43750 }
43751 case X86ISD::KSHIFTR: {
43752 SDValue Src = Op.getOperand(0);
43753 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43754 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43755 unsigned ShiftAmt = Amt->getZExtValue();
43756
43757 if (ShiftAmt == 0)
43758 return TLO.CombineTo(Op, Src);
43759
43760 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43761 // single shift. We can do this if the top bits (which are shifted
43762 // out) are never demanded.
43763 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43764 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43765 unsigned C1 = Src.getConstantOperandVal(1);
43766 unsigned NewOpc = X86ISD::KSHIFTR;
43767 int Diff = ShiftAmt - C1;
43768 if (Diff < 0) {
43769 Diff = -Diff;
43770 NewOpc = X86ISD::KSHIFTL;
43771 }
43772
43773 SDLoc dl(Op);
43774 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43775 return TLO.CombineTo(
43776 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43777 }
43778 }
43779
43780 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43781 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43782 Depth + 1))
43783 return true;
43784
43785 KnownUndef.lshrInPlace(ShiftAmt);
43786 KnownZero.lshrInPlace(ShiftAmt);
43787 KnownZero.setHighBits(ShiftAmt);
43788 break;
43789 }
43790 case X86ISD::ANDNP: {
43791 // ANDNP = (~LHS & RHS);
43792 SDValue LHS = Op.getOperand(0);
43793 SDValue RHS = Op.getOperand(1);
43794
43795 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43796 APInt UndefElts;
43797 SmallVector<APInt> EltBits;
43798 int NumElts = VT.getVectorNumElements();
43799 int EltSizeInBits = VT.getScalarSizeInBits();
43800 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43801 APInt OpElts = DemandedElts;
43802 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43803 EltBits)) {
43804 OpBits.clearAllBits();
43805 OpElts.clearAllBits();
43806 for (int I = 0; I != NumElts; ++I) {
43807 if (!DemandedElts[I])
43808 continue;
43809 if (UndefElts[I]) {
43810 // We can't assume an undef src element gives an undef dst - the
43811 // other src might be zero.
43812 OpBits.setAllBits();
43813 OpElts.setBit(I);
43814 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43815 (!Invert && !EltBits[I].isZero())) {
43816 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43817 OpElts.setBit(I);
43818 }
43819 }
43820 }
43821 return std::make_pair(OpBits, OpElts);
43822 };
43823 APInt BitsLHS, EltsLHS;
43824 APInt BitsRHS, EltsRHS;
43825 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43826 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43827
43828 APInt LHSUndef, LHSZero;
43829 APInt RHSUndef, RHSZero;
43830 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43831 Depth + 1))
43832 return true;
43833 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43834 Depth + 1))
43835 return true;
43836
43837 if (!DemandedElts.isAllOnes()) {
43838 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43839 TLO.DAG, Depth + 1);
43840 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43841 TLO.DAG, Depth + 1);
43842 if (NewLHS || NewRHS) {
43843 NewLHS = NewLHS ? NewLHS : LHS;
43844 NewRHS = NewRHS ? NewRHS : RHS;
43845 return TLO.CombineTo(
43846 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43847 }
43848 }
43849 break;
43850 }
43851 case X86ISD::CVTSI2P:
43852 case X86ISD::CVTUI2P:
43853 case X86ISD::CVTPH2PS:
43854 case X86ISD::CVTPS2PH: {
43855 SDValue Src = Op.getOperand(0);
43856 EVT SrcVT = Src.getValueType();
43857 APInt SrcUndef, SrcZero;
43858 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43859 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43860 Depth + 1))
43861 return true;
43862 break;
43863 }
43864 case X86ISD::PACKSS:
43865 case X86ISD::PACKUS: {
43866 SDValue N0 = Op.getOperand(0);
43867 SDValue N1 = Op.getOperand(1);
43868
43869 APInt DemandedLHS, DemandedRHS;
43870 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43871
43872 APInt LHSUndef, LHSZero;
43873 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43874 Depth + 1))
43875 return true;
43876 APInt RHSUndef, RHSZero;
43877 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43878 Depth + 1))
43879 return true;
43880
43881 // TODO - pass on known zero/undef.
43882
43883 // Aggressively peek through ops to get at the demanded elts.
43884 // TODO - we should do this for all target/faux shuffles ops.
43885 if (!DemandedElts.isAllOnes()) {
43886 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43887 TLO.DAG, Depth + 1);
43888 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43889 TLO.DAG, Depth + 1);
43890 if (NewN0 || NewN1) {
43891 NewN0 = NewN0 ? NewN0 : N0;
43892 NewN1 = NewN1 ? NewN1 : N1;
43893 return TLO.CombineTo(Op,
43894 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43895 }
43896 }
43897 break;
43898 }
43899 case X86ISD::HADD:
43900 case X86ISD::HSUB:
43901 case X86ISD::FHADD:
43902 case X86ISD::FHSUB: {
43903 SDValue N0 = Op.getOperand(0);
43904 SDValue N1 = Op.getOperand(1);
43905
43906 APInt DemandedLHS, DemandedRHS;
43907 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43908
43909 APInt LHSUndef, LHSZero;
43910 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43911 Depth + 1))
43912 return true;
43913 APInt RHSUndef, RHSZero;
43914 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43915 Depth + 1))
43916 return true;
43917
43918 // TODO - pass on known zero/undef.
43919
43920 // Aggressively peek through ops to get at the demanded elts.
43921 // TODO: Handle repeated operands.
43922 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43923 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43924 TLO.DAG, Depth + 1);
43925 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43926 TLO.DAG, Depth + 1);
43927 if (NewN0 || NewN1) {
43928 NewN0 = NewN0 ? NewN0 : N0;
43929 NewN1 = NewN1 ? NewN1 : N1;
43930 return TLO.CombineTo(Op,
43931 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43932 }
43933 }
43934 break;
43935 }
43936 case X86ISD::VTRUNC:
43937 case X86ISD::VTRUNCS:
43938 case X86ISD::VTRUNCUS: {
43939 SDValue Src = Op.getOperand(0);
43940 MVT SrcVT = Src.getSimpleValueType();
43941 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43942 APInt SrcUndef, SrcZero;
43943 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43944 Depth + 1))
43945 return true;
43946 KnownZero = SrcZero.zextOrTrunc(NumElts);
43947 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43948 break;
43949 }
43950 case X86ISD::BLENDI: {
43951 SmallVector<int, 16> BlendMask;
43952    DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43953    if (SDValue R = combineBlendOfPermutes(
43954            VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43955 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43956 return TLO.CombineTo(Op, R);
43957 break;
43958 }
43959 case X86ISD::BLENDV: {
43960 APInt SelUndef, SelZero;
43961 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43962 SelZero, TLO, Depth + 1))
43963 return true;
43964
43965 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43966 APInt LHSUndef, LHSZero;
43967 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43968 LHSZero, TLO, Depth + 1))
43969 return true;
43970
43971 APInt RHSUndef, RHSZero;
43972 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43973 RHSZero, TLO, Depth + 1))
43974 return true;
43975
43976 KnownZero = LHSZero & RHSZero;
43977 KnownUndef = LHSUndef & RHSUndef;
43978 break;
43979 }
43980 case X86ISD::VZEXT_MOVL: {
43981 // If upper demanded elements are already zero then we have nothing to do.
43982 SDValue Src = Op.getOperand(0);
43983 APInt DemandedUpperElts = DemandedElts;
43984 DemandedUpperElts.clearLowBits(1);
43985 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43986 return TLO.CombineTo(Op, Src);
43987 break;
43988 }
43989 case X86ISD::VZEXT_LOAD: {
43990    // If the upper elements are not demanded then simplify to a
43991    // scalar_to_vector(load()).
43992    EVT SVT = VT.getVectorElementType();
43993    if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43994 SDLoc DL(Op);
43995 auto *Mem = cast<MemSDNode>(Op);
43996 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43997 Mem->getMemOperand());
43998 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43999 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
44000 }
44001 break;
44002 }
44003 case X86ISD::VBROADCAST: {
44004 SDValue Src = Op.getOperand(0);
44005 MVT SrcVT = Src.getSimpleValueType();
44006 // Don't bother broadcasting if we just need the 0'th element.
44007 if (DemandedElts == 1) {
44008 if (!SrcVT.isVector())
44009 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44010 else if (Src.getValueType() != VT)
44011 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44012 SDLoc(Op));
44013 return TLO.CombineTo(Op, Src);
44014 }
44015 if (!SrcVT.isVector())
44016 break;
44017 APInt SrcUndef, SrcZero;
44018 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44019 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44020 Depth + 1))
44021 return true;
44022 // Aggressively peek through src to get at the demanded elt.
44023    // TODO - we should do this for all target/faux shuffles ops.
44024    if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44025            Src, SrcElts, TLO.DAG, Depth + 1))
44026 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44027 break;
44028 }
44029 case X86ISD::VPERMV:
44030 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44031 Depth))
44032 return true;
44033 break;
44034 case X86ISD::PSHUFB:
44035 case X86ISD::VPERMV3:
44036 case X86ISD::VPERMILPV:
44037 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44038 Depth))
44039 return true;
44040 break;
44041 case X86ISD::VPPERM:
44042 case X86ISD::VPERMIL2:
44043 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44044 Depth))
44045 return true;
44046 break;
44047 }
44048
44049 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44050 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44051 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44052 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44053 DemandedElts.lshr(NumElts / 2) == 0) {
44054 unsigned SizeInBits = VT.getSizeInBits();
44055 unsigned ExtSizeInBits = SizeInBits / 2;
44056
44057 // See if 512-bit ops only use the bottom 128-bits.
44058 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44059 ExtSizeInBits = SizeInBits / 4;
44060
44061 switch (Opc) {
44062 // Scalar broadcast.
44063 case X86ISD::VBROADCAST: {
44064 SDLoc DL(Op);
44065 SDValue Src = Op.getOperand(0);
44066 if (Src.getValueSizeInBits() > ExtSizeInBits)
44067 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44068 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44069 ExtSizeInBits / VT.getScalarSizeInBits());
44070 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44071 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44072 TLO.DAG, DL, ExtSizeInBits));
44073 }
44074    case X86ISD::VBROADCAST_LOAD: {
44075      SDLoc DL(Op);
44076 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44077 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44078 ExtSizeInBits / VT.getScalarSizeInBits());
44079 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44080 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44081 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44082 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44083          MemIntr->getMemOperand());
44084      TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44085                                           Bcst.getValue(1));
44086 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44087 TLO.DAG, DL, ExtSizeInBits));
44088 }
44089    // Subvector broadcast.
44090    case X86ISD::SUBV_BROADCAST_LOAD: {
44091      auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44092 EVT MemVT = MemIntr->getMemoryVT();
44093 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44094 SDLoc DL(Op);
44095 SDValue Ld =
44096 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44097                            MemIntr->getBasePtr(), MemIntr->getMemOperand());
44098        TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44099                                             Ld.getValue(1));
44100 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44101 TLO.DAG, DL, ExtSizeInBits));
44102 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44103 SDLoc DL(Op);
44104 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44105 ExtSizeInBits / VT.getScalarSizeInBits());
44106 if (SDValue BcstLd =
44107 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44108 return TLO.CombineTo(Op,
44109 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44110 TLO.DAG, DL, ExtSizeInBits));
44111 }
44112 break;
44113 }
44114 // Byte shifts by immediate.
44115 case X86ISD::VSHLDQ:
44116 case X86ISD::VSRLDQ:
44117 // Shift by uniform.
44118 case X86ISD::VSHL:
44119 case X86ISD::VSRL:
44120 case X86ISD::VSRA:
44121 // Shift by immediate.
44122 case X86ISD::VSHLI:
44123 case X86ISD::VSRLI:
44124 case X86ISD::VSRAI: {
44125 SDLoc DL(Op);
44126 SDValue Ext0 =
44127 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44128 SDValue ExtOp =
44129 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44130 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44131 SDValue Insert =
44132 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44133 return TLO.CombineTo(Op, Insert);
44134 }
44135 case X86ISD::VPERMI: {
44136 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44137 // TODO: This should be done in shuffle combining.
44138 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44140 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44141 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44142 SDLoc DL(Op);
44143 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44144 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44145 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44146 return TLO.CombineTo(Op, Insert);
44147 }
44148 }
44149 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44150 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44151 SDLoc DL(Op);
44152 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44153 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44154 Op.getOperand(1));
44155 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44156 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44157 return TLO.CombineTo(Op, Insert);
44158 }
44159 break;
44160 }
44161 case X86ISD::VPERMV: {
44164 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44165 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44166 VT == MVT::v16f32) &&
44167 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44168      // For lane-crossing shuffles, we can only split in half, and only if
44169      // the demanded half of the mask doesn't still reference higher elements.
44170 unsigned HalfElts = NumElts / 2;
44171 unsigned HalfSize = SizeInBits / 2;
44172 Mask.resize(HalfElts);
44173 if (all_of(Mask,
44174                 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44175        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44176        SDLoc DL(Op);
44177 SDValue Ext;
44178 SDValue M =
44179 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44180 SDValue V =
44181 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44182 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44183 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44184 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44185        else {
44186          MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44187          MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44188 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44189 TLO.DAG.getBitcast(ShufVT, V), M);
44190 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44191 }
44192 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44193 Subtarget, TLO.DAG, DL, SizeInBits);
44194 return TLO.CombineTo(Op, Insert);
44195 }
44196 }
44197 break;
44198 }
44199 case X86ISD::VPERMV3: {
44202 if (Subtarget.hasVLX() &&
44203 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44204      // For lane-crossing shuffles, we can only split in half, and only if
44205      // the demanded half of the mask doesn't still reference higher elements.
44206 unsigned HalfElts = NumElts / 2;
44207 unsigned HalfSize = SizeInBits / 2;
44208 Mask.resize(HalfElts);
44209 if (all_of(Mask, [&](int M) {
44210 return isUndefOrInRange(M, 0, HalfElts) ||
44211 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44212 })) {
44213 // Adjust mask elements for 2nd operand to point to half width.
44214 for (int &M : Mask)
44215            M = (M < NumElts) ? M : (M - HalfElts);
44216          MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44217          MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44218 SDLoc DL(Op);
44219 SDValue Ext = TLO.DAG.getNode(
44220 Opc, DL, HalfVT,
44221 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44222 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44223 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44224 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44225 Subtarget, TLO.DAG, DL, SizeInBits);
44226 return TLO.CombineTo(Op, Insert);
44227 }
44228 }
44229 break;
44230 }
44231 case X86ISD::VPERM2X128: {
44232 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
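      // e.g. if imm8 bits [1:0] equal 3, the low half of the result is the
      // upper half of operand 1, i.e. an extract_subvector of operand 1 at
      // element NumElts / 2.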
44233 SDLoc DL(Op);
44234 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44235 if (LoMask & 0x8)
44236 return TLO.CombineTo(
44237 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44238 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44239 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44240 SDValue ExtOp =
44241 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44242 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44243 SDValue Insert =
44244 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44245 return TLO.CombineTo(Op, Insert);
44246 }
44247 // Conversions.
44248 // TODO: Add more CVT opcodes when we have test coverage.
44249 case X86ISD::CVTTP2UI: {
44250 if (!Subtarget.hasVLX())
44251 break;
44252 [[fallthrough]];
44253 }
44254 case X86ISD::CVTTP2SI: {
44255 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44256 !Subtarget.hasVLX())
44257 break;
44258 [[fallthrough]];
44259 }
44260 case X86ISD::CVTPH2PS: {
44261 SDLoc DL(Op);
44262 unsigned Scale = SizeInBits / ExtSizeInBits;
44263 SDValue SrcOp = Op.getOperand(0);
44264 MVT SrcVT = SrcOp.getSimpleValueType();
44265 unsigned SrcExtSize =
44266          std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44267      MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44268                                   ExtSizeInBits / VT.getScalarSizeInBits());
44269 SDValue ExtOp = TLO.DAG.getNode(
44270 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44271 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44272 SDValue Insert =
44273 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44274 return TLO.CombineTo(Op, Insert);
44275 }
44276 // Zero upper elements.
44277 case X86ISD::VZEXT_MOVL:
44278 // Variable blend.
44279 case X86ISD::BLENDV:
44280 // Target unary shuffles:
44281 case X86ISD::MOVDDUP:
44282 // Target unary shuffles by immediate:
44283 case X86ISD::PSHUFD:
44284 case X86ISD::PSHUFLW:
44285 case X86ISD::PSHUFHW:
44286 case X86ISD::VPERMILPI:
44287 // (Non-Lane Crossing) Target Shuffles.
44288 case X86ISD::VPERMILPV:
44289 case X86ISD::VPERMIL2:
44290 case X86ISD::PSHUFB:
44291 case X86ISD::UNPCKL:
44292 case X86ISD::UNPCKH:
44293 case X86ISD::BLENDI:
44294 // Integer ops.
44295 case X86ISD::PACKSS:
44296 case X86ISD::PACKUS:
44297 case X86ISD::PCMPEQ:
44298 case X86ISD::PCMPGT:
44299 case X86ISD::PMULUDQ:
44300 case X86ISD::PMULDQ:
44301 case X86ISD::VSHLV:
44302 case X86ISD::VSRLV:
44303 case X86ISD::VSRAV:
44304 // Float ops.
44305 case X86ISD::FMAX:
44306 case X86ISD::FMIN:
44307 case X86ISD::FMAXC:
44308 case X86ISD::FMINC:
44309 case X86ISD::FRSQRT:
44310 case X86ISD::FRCP:
44311 // Horizontal Ops.
44312 case X86ISD::HADD:
44313 case X86ISD::HSUB:
44314 case X86ISD::FHADD:
44315 case X86ISD::FHSUB: {
44316 SDLoc DL(Op);
44318 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44319 SDValue SrcOp = Op.getOperand(i);
44320 EVT SrcVT = SrcOp.getValueType();
44321 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44322 "Unsupported vector size");
44323 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44324 ExtSizeInBits)
44325 : SrcOp);
44326 }
44327 MVT ExtVT = VT.getSimpleVT();
44328 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44329 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44330 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44331 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44332 SDValue Insert =
44333 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44334 return TLO.CombineTo(Op, Insert);
44335 }
44336 }
44337 }
44338
44339  // For splats, unless we *only* demand the 0'th element, stop attempts at
44340  // simplification here; we aren't going to improve things, and the splat is
44341  // better than any potential shuffle.
44342 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44343 return false;
44344
44345 // Get target/faux shuffle mask.
44346 APInt OpUndef, OpZero;
44347 SmallVector<int, 64> OpMask;
44348 SmallVector<SDValue, 2> OpInputs;
44349 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44350 OpZero, TLO.DAG, Depth, false))
44351 return false;
44352
44353 // Shuffle inputs must be the same size as the result.
44354 if (OpMask.size() != (unsigned)NumElts ||
44355 llvm::any_of(OpInputs, [VT](SDValue V) {
44356 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44357 !V.getValueType().isVector();
44358 }))
44359 return false;
44360
44361 KnownZero = OpZero;
44362 KnownUndef = OpUndef;
44363
44364 // Check if shuffle mask can be simplified to undef/zero/identity.
44365 int NumSrcs = OpInputs.size();
44366 for (int i = 0; i != NumElts; ++i)
44367 if (!DemandedElts[i])
44368 OpMask[i] = SM_SentinelUndef;
44369
44370 if (isUndefInRange(OpMask, 0, NumElts)) {
44371 KnownUndef.setAllBits();
44372 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44373 }
44374 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44375 KnownZero.setAllBits();
44376 return TLO.CombineTo(
44377 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44378 }
44379 for (int Src = 0; Src != NumSrcs; ++Src)
44380 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44381 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44382
44383 // Attempt to simplify inputs.
44384 for (int Src = 0; Src != NumSrcs; ++Src) {
44385 // TODO: Support inputs of different types.
44386 if (OpInputs[Src].getValueType() != VT)
44387 continue;
44388
44389 int Lo = Src * NumElts;
44390 APInt SrcElts = APInt::getZero(NumElts);
44391 for (int i = 0; i != NumElts; ++i)
44392 if (DemandedElts[i]) {
44393 int M = OpMask[i] - Lo;
44394 if (0 <= M && M < NumElts)
44395 SrcElts.setBit(M);
44396 }
44397
44398 // TODO - Propagate input undef/zero elts.
44399 APInt SrcUndef, SrcZero;
44400 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44401 TLO, Depth + 1))
44402 return true;
44403 }
44404
44405 // If we don't demand all elements, then attempt to combine to a simpler
44406 // shuffle.
44407 // We need to convert the depth to something combineX86ShufflesRecursively
44408  // can handle - so pretend it's Depth == 0 again, and reduce the max depth
44409 // to match. This prevents combineX86ShuffleChain from returning a
44410 // combined shuffle that's the same as the original root, causing an
44411 // infinite loop.
44412 if (!DemandedElts.isAllOnes()) {
44413 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44414
44415 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44416 for (int i = 0; i != NumElts; ++i)
44417 if (DemandedElts[i])
44418 DemandedMask[i] = i;
44419
44420    SDValue NewShuffle = combineX86ShufflesRecursively(
44421        {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44422        X86::MaxShuffleCombineDepth - Depth,
44423        /*AllowVariableCrossLaneMask=*/true,
44424 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44425 TLO.DAG, SDLoc(Op), Subtarget);
44426 if (NewShuffle)
44427 return TLO.CombineTo(Op, NewShuffle);
44428 }
44429
44430 return false;
44431}
44432
44433bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44434    SDValue Op, const APInt &OriginalDemandedBits,
44435 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44436 unsigned Depth) const {
44437 EVT VT = Op.getValueType();
44438 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44439 unsigned Opc = Op.getOpcode();
44440 switch(Opc) {
44441 case X86ISD::VTRUNC: {
44442 KnownBits KnownOp;
44443 SDValue Src = Op.getOperand(0);
44444 MVT SrcVT = Src.getSimpleValueType();
44445
44446 // Simplify the input, using demanded bit information.
44447 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44448 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44449 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44450 return true;
44451 break;
44452 }
44453 case X86ISD::PMULDQ:
44454 case X86ISD::PMULUDQ: {
44455 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44456 KnownBits KnownLHS, KnownRHS;
44457 SDValue LHS = Op.getOperand(0);
44458 SDValue RHS = Op.getOperand(1);
44459
44460 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44461 // FIXME: Can we bound this better?
44462 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44463 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44464 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44465
44466 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44467 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44468 DemandedMaskLHS = DemandedMask;
44469 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44470 DemandedMaskRHS = DemandedMask;
44471
44472 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44473 KnownLHS, TLO, Depth + 1))
44474 return true;
44475 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44476 KnownRHS, TLO, Depth + 1))
44477 return true;
44478
44479 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44480 KnownRHS = KnownRHS.trunc(32);
44481 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44482 KnownRHS.getConstant().isOne()) {
44483 SDLoc DL(Op);
44484 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44485 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44486 }
44487
44488    // Aggressively peek through ops to get at the demanded low bits.
44489    SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44490        LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44491    SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44492        RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44493 if (DemandedLHS || DemandedRHS) {
44494 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44495 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44496 return TLO.CombineTo(
44497 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44498 }
44499 break;
44500 }
44501 case X86ISD::ANDNP: {
44502 KnownBits Known2;
44503 SDValue Op0 = Op.getOperand(0);
44504 SDValue Op1 = Op.getOperand(1);
44505
44506 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44507 Known, TLO, Depth + 1))
44508 return true;
44509
44510 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44511 OriginalDemandedElts, Known2, TLO, Depth + 1))
44512 return true;
44513
44514 // If the RHS is a constant, see if we can simplify it.
44515 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44516 OriginalDemandedElts, TLO))
44517 return true;
44518
44519 // ANDNP = (~Op0 & Op1);
44520 Known.One &= Known2.Zero;
44521 Known.Zero |= Known2.One;
44522 break;
44523 }
44524 case X86ISD::VSHLI: {
44525 SDValue Op0 = Op.getOperand(0);
44526 SDValue Op1 = Op.getOperand(1);
44527
44528 unsigned ShAmt = Op1->getAsZExtVal();
44529 if (ShAmt >= BitWidth)
44530 break;
44531
44532 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44533
44534 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44535 // single shift. We can do this if the bottom bits (which are shifted
44536 // out) are never demanded.
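    // e.g. for v4i32, (X >>u 8) << 24 can be rewritten as X << 16 when none
    // of the low 24 result bits are demanded.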
44537 if (Op0.getOpcode() == X86ISD::VSRLI &&
44538 OriginalDemandedBits.countr_zero() >= ShAmt) {
44539 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44540 if (Shift2Amt < BitWidth) {
44541 int Diff = ShAmt - Shift2Amt;
44542 if (Diff == 0)
44543 return TLO.CombineTo(Op, Op0.getOperand(0));
44544
44545 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44546 SDValue NewShift = TLO.DAG.getNode(
44547 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44548 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44549 return TLO.CombineTo(Op, NewShift);
44550 }
44551 }
44552
44553 // If we are only demanding sign bits then we can use the shift source directly.
44554 unsigned NumSignBits =
44555 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44556 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44557 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44558 return TLO.CombineTo(Op, Op0);
44559
44560 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44561 TLO, Depth + 1))
44562 return true;
44563
44564 Known <<= ShAmt;
44565
44566 // Low bits known zero.
44567 Known.Zero.setLowBits(ShAmt);
44568
44569 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44570 // Attempt to avoid multi-use ops if we don't need anything from them.
44571 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44572 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44573 SDValue NewOp =
44574 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44575 return TLO.CombineTo(Op, NewOp);
44576 }
44577 }
44578 return false;
44579 }
44580 case X86ISD::VSRLI: {
44581 SDValue Op0 = Op.getOperand(0);
44582 SDValue Op1 = Op.getOperand(1);
44583
44584 unsigned ShAmt = Op1->getAsZExtVal();
44585 if (ShAmt >= BitWidth)
44586 break;
44587
44588 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44589
44590 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44591 TLO, Depth + 1))
44592 return true;
44593
44594 Known >>= ShAmt;
44595
44596 // High bits known zero.
44597 Known.Zero.setHighBits(ShAmt);
44598
44599 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44600 // Attempt to avoid multi-use ops if we don't need anything from them.
44601 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44602 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44603 SDValue NewOp =
44604 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44605 return TLO.CombineTo(Op, NewOp);
44606 }
44607 }
44608 return false;
44609 }
44610 case X86ISD::VSRAI: {
44611 SDValue Op0 = Op.getOperand(0);
44612 SDValue Op1 = Op.getOperand(1);
44613
44614 unsigned ShAmt = Op1->getAsZExtVal();
44615 if (ShAmt >= BitWidth)
44616 break;
44617
44618 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44619
44620 // If we only want bits that already match the signbit then we don't need
44621 // to shift.
44622 unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44623 if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
44624 NumHiDemandedBits)
44625 return TLO.CombineTo(Op, Op0);
44626
44627 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44628 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44629 SDValue Op00 = Op0.getOperand(0);
44630 unsigned NumSignBits =
44631 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44632 if (ShAmt < NumSignBits)
44633 return TLO.CombineTo(Op, Op00);
44634 }
44635
44636 // If any of the demanded bits are produced by the sign extension, we also
44637 // demand the input sign bit.
44638 if (OriginalDemandedBits.countl_zero() < ShAmt)
44639 DemandedMask.setSignBit();
44640
44641 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44642 TLO, Depth + 1))
44643 return true;
44644
44645 Known >>= ShAmt;
44646
44647 // If the input sign bit is known to be zero, or if none of the top bits
44648 // are demanded, turn this into an unsigned shift right.
44649 if (Known.Zero[BitWidth - ShAmt - 1] ||
44650 OriginalDemandedBits.countl_zero() >= ShAmt)
44651 return TLO.CombineTo(
44652 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44653
44654 // High bits are known one.
44655 if (Known.One[BitWidth - ShAmt - 1])
44656 Known.One.setHighBits(ShAmt);
44657
44658 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44659 // Attempt to avoid multi-use ops if we don't need anything from them.
44660 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44661 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44662 SDValue NewOp =
44663 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44664 return TLO.CombineTo(Op, NewOp);
44665 }
44666 }
44667 return false;
44668 }
44669 case X86ISD::BLENDI: {
44670 SDValue LHS = Op.getOperand(0);
44671 SDValue RHS = Op.getOperand(1);
44672 APInt Mask = getBLENDIBlendMask(Op);
44673
44674 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44675 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44676 TLO, Depth + 1))
44677 return true;
44678
44679 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44680 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44681 TLO, Depth + 1))
44682 return true;
44683
44684    // Attempt to avoid multi-use ops if we don't need anything from them.
44685    SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44686        LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44687    SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44688        RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44689 if (NewLHS || NewRHS) {
44690 NewLHS = NewLHS ? NewLHS : LHS;
44691 NewRHS = NewRHS ? NewRHS : RHS;
44692 return TLO.CombineTo(Op,
44693 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44694 NewLHS, NewRHS, Op.getOperand(2)));
44695 }
44696 break;
44697 }
44698 case X86ISD::BLENDV: {
44699 SDValue Sel = Op.getOperand(0);
44700 SDValue LHS = Op.getOperand(1);
44701 SDValue RHS = Op.getOperand(2);
44702
44703    APInt SignMask = APInt::getSignMask(BitWidth);
44704    SDValue NewSel = SimplifyMultipleUseDemandedBits(
44705        Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44706    SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44707        LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44708    SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44709        RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44710
44711 if (NewSel || NewLHS || NewRHS) {
44712 NewSel = NewSel ? NewSel : Sel;
44713 NewLHS = NewLHS ? NewLHS : LHS;
44714 NewRHS = NewRHS ? NewRHS : RHS;
44715 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44716 NewSel, NewLHS, NewRHS));
44717 }
44718 break;
44719 }
44720 case X86ISD::PEXTRB:
44721 case X86ISD::PEXTRW: {
44722 SDValue Vec = Op.getOperand(0);
44723 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44724 MVT VecVT = Vec.getSimpleValueType();
44725 unsigned NumVecElts = VecVT.getVectorNumElements();
44726
44727 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44728 unsigned Idx = CIdx->getZExtValue();
44729 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44730
44731 // If we demand no bits from the vector then we must have demanded
44732      // bits from the implicit zext - simplify to zero.
44733 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44734 if (DemandedVecBits == 0)
44735 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44736
44737 APInt KnownUndef, KnownZero;
44738 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44739 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44740 KnownZero, TLO, Depth + 1))
44741 return true;
44742
44743 KnownBits KnownVec;
44744 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44745 KnownVec, TLO, Depth + 1))
44746 return true;
44747
44748      if (SDValue V = SimplifyMultipleUseDemandedBits(
44749              Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44750 return TLO.CombineTo(
44751 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44752
44753 Known = KnownVec.zext(BitWidth);
44754 return false;
44755 }
44756 break;
44757 }
44758 case X86ISD::PINSRB:
44759 case X86ISD::PINSRW: {
44760 SDValue Vec = Op.getOperand(0);
44761 SDValue Scl = Op.getOperand(1);
44762 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44763 MVT VecVT = Vec.getSimpleValueType();
44764
44765 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44766 unsigned Idx = CIdx->getZExtValue();
44767 if (!OriginalDemandedElts[Idx])
44768 return TLO.CombineTo(Op, Vec);
44769
44770 KnownBits KnownVec;
44771 APInt DemandedVecElts(OriginalDemandedElts);
44772 DemandedVecElts.clearBit(Idx);
44773 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44774 KnownVec, TLO, Depth + 1))
44775 return true;
44776
44777 KnownBits KnownScl;
44778 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44779 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44780 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44781 return true;
44782
44783 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44784 Known = KnownVec.intersectWith(KnownScl);
44785 return false;
44786 }
44787 break;
44788 }
44789 case X86ISD::PACKSS:
44790 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44791    // sign bit then we can just ask for the source operands' sign bits.
44792 // TODO - add known bits handling.
44793 if (OriginalDemandedBits.isSignMask()) {
44794 APInt DemandedLHS, DemandedRHS;
44795 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44796
44797 KnownBits KnownLHS, KnownRHS;
44798 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44799 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44800 KnownLHS, TLO, Depth + 1))
44801 return true;
44802 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44803 KnownRHS, TLO, Depth + 1))
44804 return true;
44805
44806      // Attempt to avoid multi-use ops if we don't need anything from them.
44807      SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44808          Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44809      SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44810          Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44811 if (DemandedOp0 || DemandedOp1) {
44812 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44813 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44814 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44815 }
44816 }
44817 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44818 break;
44819 case X86ISD::VBROADCAST: {
44820 SDValue Src = Op.getOperand(0);
44821 MVT SrcVT = Src.getSimpleValueType();
44822 APInt DemandedElts = APInt::getOneBitSet(
44823 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44824 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44825 TLO, Depth + 1))
44826 return true;
44827 // If we don't need the upper bits, attempt to narrow the broadcast source.
44828 // Don't attempt this on AVX512 as it might affect broadcast folding.
44829 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
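    // e.g. a v2i64 X86ISD::VBROADCAST of an i64 scalar where only the low 32
    // bits of each element are demanded can be rebuilt (on non-AVX512 targets)
    // as a bitcast of a v4i32 broadcast of the truncated scalar.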
44830 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44831 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44832 Src->hasOneUse()) {
44833 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44834 SDValue NewSrc =
44835 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44836 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44837 SDValue NewBcst =
44838 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44839 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44840 }
44841 break;
44842 }
44843 case X86ISD::PCMPGT:
44844 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44845 // iff we only need the sign bit then we can use R directly.
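// Added note: pcmpgt(0, R) sets a lane to all-ones exactly when R is
// negative, i.e. each result lane is R's sign bit replicated, so a
// sign-mask-only demand is already satisfied by R itself.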
44846 if (OriginalDemandedBits.isSignMask() &&
44847 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44848 return TLO.CombineTo(Op, Op.getOperand(1));
44849 break;
44850 case X86ISD::MOVMSK: {
44851 SDValue Src = Op.getOperand(0);
44852 MVT SrcVT = Src.getSimpleValueType();
44853 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44854 unsigned NumElts = SrcVT.getVectorNumElements();
44855
44856 // If we don't need the sign bits at all just return zero.
44857 if (OriginalDemandedBits.countr_zero() >= NumElts)
44858 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44859
44860 // See if we only demand bits from the lower 128-bit vector.
44861 if (SrcVT.is256BitVector() &&
44862 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44863 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44864 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44865 }
44866
44867 // Only demand the vector elements of the sign bits we need.
44868 APInt KnownUndef, KnownZero;
44869 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44870 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44871 TLO, Depth + 1))
44872 return true;
44873
44874 Known.Zero = KnownZero.zext(BitWidth);
44875 Known.Zero.setHighBits(BitWidth - NumElts);
44876
44877 // MOVMSK only uses the MSB from each vector element.
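// Added example (illustrative only): movmskps on <a,b,c,d> produces the
// 4-bit value sign(d):sign(c):sign(b):sign(a), so only the per-lane MSBs
// can ever be observed in the result.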
44878 KnownBits KnownSrc;
44879 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44880 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44881 Depth + 1))
44882 return true;
44883
44884 if (KnownSrc.One[SrcBits - 1])
44885 Known.One.setLowBits(NumElts);
44886 else if (KnownSrc.Zero[SrcBits - 1])
44887 Known.Zero.setLowBits(NumElts);
44888
44889 // Attempt to avoid multi-use ops if we don't need anything from it.
44890 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44891 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44892 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44893 return false;
44894 }
44895 case X86ISD::TESTP: {
44896 SDValue Op0 = Op.getOperand(0);
44897 SDValue Op1 = Op.getOperand(1);
44898 MVT OpVT = Op0.getSimpleValueType();
44899 assert((OpVT.getVectorElementType() == MVT::f32 ||
44900 OpVT.getVectorElementType() == MVT::f64) &&
44901 "Illegal vector type for X86ISD::TESTP");
44902
44903 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44904 KnownBits KnownSrc;
44905 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44906 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44907 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44908 AssumeSingleUse) ||
44909 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44910 AssumeSingleUse);
44911 }
44912 case X86ISD::CMOV: {
44913 KnownBits Known2;
44914 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44915 OriginalDemandedElts, Known2, TLO, Depth + 1))
44916 return true;
44917 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44918 OriginalDemandedElts, Known, TLO, Depth + 1))
44919 return true;
44920
44921 // Only known if known in both the LHS and RHS.
44922 Known = Known.intersectWith(Known2);
44923 return false;
44924 }
44925 case X86ISD::BEXTR:
44926 case X86ISD::BEXTRI: {
44927 SDValue Op0 = Op.getOperand(0);
44928 SDValue Op1 = Op.getOperand(1);
44929
44930 // Only bottom 16-bits of the control bits are required.
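// Control-word layout, added here for reference: bits [7:0] hold the start
// bit index and bits [15:8] the extract length (as also decoded below);
// BEXTR ignores bits above 15, so they can safely be masked off.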
44931 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44932 // NOTE: SimplifyDemandedBits won't do this for constants.
44933 uint64_t Val1 = Cst1->getZExtValue();
44934 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44935 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44936 SDLoc DL(Op);
44937 return TLO.CombineTo(
44938 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44939 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44940 }
44941
44942 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44943 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44944
44945 // If the length is 0, the result is 0.
44946 if (Length == 0) {
44947 Known.setAllZero();
44948 return false;
44949 }
44950
44951 if ((Shift + Length) <= BitWidth) {
44952 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44953 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44954 return true;
44955
44956 Known = Known.extractBits(Length, Shift);
44957 Known = Known.zextOrTrunc(BitWidth);
44958 return false;
44959 }
44960 } else {
44961 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44962 KnownBits Known1;
44963 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44964 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44965 return true;
44966
44967 // If the length is 0, replace with 0.
44968 KnownBits LengthBits = Known1.extractBits(8, 8);
44969 if (LengthBits.isZero())
44970 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44971 }
44972
44973 break;
44974 }
44975 case X86ISD::PDEP: {
44976 SDValue Op0 = Op.getOperand(0);
44977 SDValue Op1 = Op.getOperand(1);
44978
44979 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44980 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44981
44982 // If the demanded bits have leading zeroes, we don't demand those from the
44983 // mask.
44984 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44985 return true;
44986
44987 // The number of possible 1s in the mask determines the number of LSBs of
44988 // operand 0 used. Undemanded bits from the mask don't matter so filter
44989 // them before counting.
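// Added example (not part of the original comment): with mask 0b1010, PDEP
// consumes only the two lowest bits of operand 0 and scatters them into bit
// positions 1 and 3, so only popcount(mask) low bits of operand 0 matter.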
44990 KnownBits Known2;
44991 uint64_t Count = (~Known.Zero & LoMask).popcount();
44992 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44993 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44994 return true;
44995
44996 // Zeroes are retained from the mask, but not ones.
44997 Known.One.clearAllBits();
44998 // The result will have at least as many trailing zeros as the non-mask
44999 // operand since bits can only map to the same or higher bit position.
45000 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
45001 return false;
45002 }
45003 case X86ISD::VPMADD52L:
45004 case X86ISD::VPMADD52H: {
45005 KnownBits KnownOp0, KnownOp1, KnownOp2;
45006 SDValue Op0 = Op.getOperand(0);
45007 SDValue Op1 = Op.getOperand(1);
45008 SDValue Op2 = Op.getOperand(2);
45009 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45010 // operand 2).
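// Added rationale: each lane multiplies the low 52 bits of operands 0 and 1
// and adds the selected half of that product to operand 2, so bits 52-63 of
// the multiplicands can never influence the result.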
45011 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45012 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45013 TLO, Depth + 1))
45014 return true;
45015
45016 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45017 TLO, Depth + 1))
45018 return true;
45019
45020 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45021 KnownOp2, TLO, Depth + 1))
45022 return true;
45023
45024 KnownBits KnownMul;
45025 KnownOp0 = KnownOp0.trunc(52);
45026 KnownOp1 = KnownOp1.trunc(52);
45027 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45028 : KnownBits::mulhu(KnownOp0, KnownOp1);
45029 KnownMul = KnownMul.zext(64);
45030
45031 // lo/hi(X * Y) + Z --> C + Z
45032 if (KnownMul.isConstant()) {
45033 SDLoc DL(Op);
45034 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45035 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45036 }
45037
45038 Known = KnownBits::add(KnownMul, KnownOp2);
45039 return false;
45040 }
45041 }
45042
45043 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45044 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45045}
45046
45047 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45048 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45049 SelectionDAG &DAG, unsigned Depth) const {
45050 int NumElts = DemandedElts.getBitWidth();
45051 unsigned Opc = Op.getOpcode();
45052 EVT VT = Op.getValueType();
45053
45054 switch (Opc) {
45055 case X86ISD::PINSRB:
45056 case X86ISD::PINSRW: {
45057 // If we don't demand the inserted element, return the base vector.
45058 SDValue Vec = Op.getOperand(0);
45059 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45060 MVT VecVT = Vec.getSimpleValueType();
45061 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45062 !DemandedElts[CIdx->getZExtValue()])
45063 return Vec;
45064 break;
45065 }
45066 case X86ISD::VSHLI: {
45067 // If we are only demanding sign bits then we can use the shift source
45068 // directly.
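// Added example: in a 16-bit lane where Op0 has 9 sign bits and ShAmt == 4,
// (Op0 << 4) still has 5 sign bits; if only the top 3 bits are demanded,
// those bits are identical sign-bit copies in Op0 and (Op0 << 4), so Op0
// can be used unshifted.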
45069 SDValue Op0 = Op.getOperand(0);
45070 unsigned ShAmt = Op.getConstantOperandVal(1);
45071 unsigned BitWidth = DemandedBits.getBitWidth();
45072 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45073 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45074 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45075 return Op0;
45076 break;
45077 }
45078 case X86ISD::VSRAI:
45079 // iff we only need the sign bit then we can use the source directly.
45080 // TODO: generalize where we only demand extended signbits.
45081 if (DemandedBits.isSignMask())
45082 return Op.getOperand(0);
45083 break;
45084 case X86ISD::PCMPGT:
45085 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45086 // iff we only need the sign bit then we can use R directly.
45087 if (DemandedBits.isSignMask() &&
45088 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45089 return Op.getOperand(1);
45090 break;
45091 case X86ISD::BLENDV: {
45092 // BLENDV: Cond (MSB) ? LHS : RHS
45093 SDValue Cond = Op.getOperand(0);
45094 SDValue LHS = Op.getOperand(1);
45095 SDValue RHS = Op.getOperand(2);
45096
45097 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45098 if (CondKnown.isNegative())
45099 return LHS;
45100 if (CondKnown.isNonNegative())
45101 return RHS;
45102 break;
45103 }
45104 case X86ISD::ANDNP: {
45105 // ANDNP = (~LHS & RHS);
45106 SDValue LHS = Op.getOperand(0);
45107 SDValue RHS = Op.getOperand(1);
45108
45109 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45110 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45111
45112 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45113 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45114 // this context, so return RHS.
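// Added note: for each demanded bit either RHS is known zero (both sides of
// the equivalence are 0) or LHS is known zero (~LHS is 1 and the AND passes
// RHS through), so the result bit always equals the RHS bit.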
45115 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45116 return RHS;
45117 break;
45118 }
45119 }
45120
45121 APInt ShuffleUndef, ShuffleZero;
45122 SmallVector<int, 16> ShuffleMask;
45123 SmallVector<SDValue, 2> ShuffleOps;
45124 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45125 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45126 // If all the demanded elts are from one operand and are inline,
45127 // then we can use the operand directly.
45128 int NumOps = ShuffleOps.size();
45129 if (ShuffleMask.size() == (unsigned)NumElts &&
45130 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45131 return VT.getSizeInBits() == V.getValueSizeInBits();
45132 })) {
45133
45134 if (DemandedElts.isSubsetOf(ShuffleUndef))
45135 return DAG.getUNDEF(VT);
45136 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45137 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45138
45139 // Bitmask that indicates which ops have only been accessed 'inline'.
45140 APInt IdentityOp = APInt::getAllOnes(NumOps);
45141 for (int i = 0; i != NumElts; ++i) {
45142 int M = ShuffleMask[i];
45143 if (!DemandedElts[i] || ShuffleUndef[i])
45144 continue;
45145 int OpIdx = M / NumElts;
45146 int EltIdx = M % NumElts;
45147 if (M < 0 || EltIdx != i) {
45148 IdentityOp.clearAllBits();
45149 break;
45150 }
45151 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45152 if (IdentityOp == 0)
45153 break;
45154 }
45155 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45156 "Multiple identity shuffles detected");
45157
45158 if (IdentityOp != 0)
45159 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45160 }
45161 }
45162
45163 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45164 Op, DemandedBits, DemandedElts, DAG, Depth);
45165}
45166
45167 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45168 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45169 bool PoisonOnly, unsigned Depth) const {
45170 unsigned NumElts = DemandedElts.getBitWidth();
45171
45172 switch (Op.getOpcode()) {
45174 case X86ISD::Wrapper:
45175 case X86ISD::WrapperRIP:
45176 return true;
45177 case X86ISD::PACKSS:
45178 case X86ISD::PACKUS: {
45179 APInt DemandedLHS, DemandedRHS;
45180 getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
45181 DemandedRHS);
45182 return (!DemandedLHS ||
45183 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
45184 PoisonOnly, Depth + 1)) &&
45185 (!DemandedRHS ||
45186 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
45187 PoisonOnly, Depth + 1));
45188 }
45189 case X86ISD::INSERTPS:
45190 case X86ISD::BLENDI:
45191 case X86ISD::PSHUFB:
45192 case X86ISD::PSHUFD:
45193 case X86ISD::UNPCKL:
45194 case X86ISD::UNPCKH:
45195 case X86ISD::VPERMILPV:
45196 case X86ISD::VPERMILPI:
45197 case X86ISD::VPERMV:
45198 case X86ISD::VPERMV3: {
45199 SmallVector<SDValue, 2> Ops;
45200 SmallVector<int, 16> Mask;
45201 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45202 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45203 APInt::getZero(NumElts));
45204 for (auto M : enumerate(Mask)) {
45205 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45206 continue;
45207 if (M.value() == SM_SentinelUndef)
45208 return false;
45209 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45210 "Shuffle mask index out of range");
45211 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45212 }
45213 for (auto Op : enumerate(Ops))
45214 if (!DemandedSrcElts[Op.index()].isZero() &&
45215 !DAG.isGuaranteedNotToBeUndefOrPoison(
45216 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45217 return false;
45218 return true;
45219 }
45220 break;
45221 }
45222 }
45223 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45224 Op, DemandedElts, DAG, PoisonOnly, Depth);
45225}
45226
45227 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45228 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45229 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45230
45231 switch (Op.getOpcode()) {
45232 // SSE bit logic.
45233 case X86ISD::FAND:
45234 case X86ISD::FOR:
45235 case X86ISD::FXOR:
45236 case X86ISD::FANDN:
45237 case X86ISD::ANDNP:
45238 case X86ISD::VPTERNLOG:
45239 return false;
45240 // SSE vector insert/extracts use modulo indices.
45241 case X86ISD::PINSRB:
45242 case X86ISD::PINSRW:
45243 case X86ISD::PEXTRB:
45244 case X86ISD::PEXTRW:
45245 return false;
45246 // SSE vector multiplies are either inbounds or saturate.
45247 case X86ISD::VPMADDUBSW:
45248 case X86ISD::VPMADDWD:
45249 return false;
45250 // SSE vector shifts handle out of bounds shift amounts.
45251 case X86ISD::VSHLI:
45252 case X86ISD::VSRLI:
45253 case X86ISD::VSRAI:
45254 return false;
45255 // SSE blends.
45256 case X86ISD::BLENDI:
45257 case X86ISD::BLENDV:
45258 return false;
45259 // SSE packs.
45260 case X86ISD::PACKSS:
45261 case X86ISD::PACKUS:
45262 return false;
45263 // SSE target shuffles.
45264 case X86ISD::INSERTPS:
45265 case X86ISD::PSHUFB:
45266 case X86ISD::PSHUFD:
45267 case X86ISD::UNPCKL:
45268 case X86ISD::UNPCKH:
45269 case X86ISD::VPERMILPV:
45270 case X86ISD::VPERMILPI:
45271 case X86ISD::VPERMV:
45272 case X86ISD::VPERMV3:
45273 return false;
45274 // SSE comparisons handle all icmp/fcmp cases.
45275 // TODO: Add CMPM/MM with test coverage.
45276 case X86ISD::CMPP:
45277 case X86ISD::PCMPEQ:
45278 case X86ISD::PCMPGT:
45279 return false;
45280 // SSE signbit extraction.
45281 case X86ISD::MOVMSK:
45282 return false;
45283 // GFNI instructions.
45284 case X86ISD::GF2P8AFFINEINVQB:
45285 case X86ISD::GF2P8AFFINEQB:
45286 case X86ISD::GF2P8MULB:
45287 return false;
45288 case ISD::INTRINSIC_WO_CHAIN:
45289 switch (Op->getConstantOperandVal(0)) {
45290 case Intrinsic::x86_sse2_pmadd_wd:
45291 case Intrinsic::x86_avx2_pmadd_wd:
45292 case Intrinsic::x86_avx512_pmaddw_d_512:
45293 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45294 case Intrinsic::x86_avx2_pmadd_ub_sw:
45295 case Intrinsic::x86_avx512_pmaddubs_w_512:
45296 return false;
45297 case Intrinsic::x86_avx512_vpermi2var_d_128:
45298 case Intrinsic::x86_avx512_vpermi2var_d_256:
45299 case Intrinsic::x86_avx512_vpermi2var_d_512:
45300 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45301 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45302 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45303 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45304 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45305 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45306 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45307 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45308 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45309 case Intrinsic::x86_avx512_vpermi2var_q_128:
45310 case Intrinsic::x86_avx512_vpermi2var_q_256:
45311 case Intrinsic::x86_avx512_vpermi2var_q_512:
45312 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45313 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45314 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45315 return false;
45316 }
45317 }
45318 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45319 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45320}
45321
45322 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45323 const APInt &DemandedElts,
45324 APInt &UndefElts,
45325 const SelectionDAG &DAG,
45326 unsigned Depth) const {
45327 unsigned NumElts = DemandedElts.getBitWidth();
45328 unsigned Opc = Op.getOpcode();
45329
45330 switch (Opc) {
45331 case X86ISD::VBROADCAST:
45332 case X86ISD::VBROADCAST_LOAD:
45333 UndefElts = APInt::getZero(NumElts);
45334 return true;
45335 }
45336
45337 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45338 DAG, Depth);
45339}
45340
45341// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45342// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
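// Added usage example (illustrative): for Src = (v8i1 setcc (v8i32 a),
// (v8i32 b), cc), checkBitcastSrcVectorSize(Src, 256, ...) returns true
// because the compared vectors are 256 bits wide, which lets the caller
// pick v8i32 as the sign-extension type.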
45343static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45344 bool AllowTruncate, unsigned Depth) {
45345 // Limit recursion.
45346 if (Depth >= SelectionDAG::MaxRecursionDepth)
45347 return false;
45348 switch (Src.getOpcode()) {
45349 case ISD::TRUNCATE:
45350 if (!AllowTruncate)
45351 return false;
45352 [[fallthrough]];
45353 case ISD::SETCC:
45354 return Src.getOperand(0).getValueSizeInBits() == Size;
45355 case ISD::FREEZE:
45356 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45357 Depth + 1);
45358 case ISD::AND:
45359 case ISD::XOR:
45360 case ISD::OR:
45361 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45362 Depth + 1) &&
45363 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45364 Depth + 1);
45365 case ISD::SELECT:
45366 case ISD::VSELECT:
45367 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45368 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45369 Depth + 1) &&
45370 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45371 Depth + 1);
45372 case ISD::BUILD_VECTOR:
45373 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45374 ISD::isBuildVectorAllOnes(Src.getNode());
45375 }
45376 return false;
45377}
45378
45379// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45380static unsigned getAltBitOpcode(unsigned Opcode) {
45381 switch(Opcode) {
45382 // clang-format off
45383 case ISD::AND: return X86ISD::FAND;
45384 case ISD::OR: return X86ISD::FOR;
45385 case ISD::XOR: return X86ISD::FXOR;
45386 case X86ISD::ANDNP: return X86ISD::FANDN;
45387 // clang-format on
45388 }
45389 llvm_unreachable("Unknown bitwise opcode");
45390}
45391
45392// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45393 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45394 const SDLoc &DL) {
45395 EVT SrcVT = Src.getValueType();
45396 if (SrcVT != MVT::v4i1)
45397 return SDValue();
45398
45399 switch (Src.getOpcode()) {
45400 case ISD::SETCC:
45401 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45402 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45403 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45404 SDValue Op0 = Src.getOperand(0);
45405 if (ISD::isNormalLoad(Op0.getNode()))
45406 return DAG.getBitcast(MVT::v4f32, Op0);
45407 if (Op0.getOpcode() == ISD::BITCAST &&
45408 Op0.getOperand(0).getValueType() == MVT::v4f32)
45409 return Op0.getOperand(0);
45410 }
45411 break;
45412 case ISD::AND:
45413 case ISD::XOR:
45414 case ISD::OR: {
45415 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45416 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45417 if (Op0 && Op1)
45418 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45419 Op1);
45420 break;
45421 }
45422 }
45423 return SDValue();
45424}
45425
45426// Helper to push sign extension of vXi1 SETCC result through bitops.
45427 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45428 SDValue Src, const SDLoc &DL) {
45429 switch (Src.getOpcode()) {
45430 case ISD::SETCC:
45431 case ISD::FREEZE:
45432 case ISD::TRUNCATE:
45433 case ISD::BUILD_VECTOR:
45434 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45435 case ISD::AND:
45436 case ISD::XOR:
45437 case ISD::OR:
45438 return DAG.getNode(
45439 Src.getOpcode(), DL, SExtVT,
45440 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45441 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45442 case ISD::SELECT:
45443 case ISD::VSELECT:
45444 return DAG.getSelect(
45445 DL, SExtVT, Src.getOperand(0),
45446 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45447 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45448 }
45449 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45450}
45451
45452// Try to match patterns such as
45453// (i16 bitcast (v16i1 x))
45454// ->
45455 // (i16 movmsk (v16i8 sext (v16i1 x)))
45456// before the illegal vector is scalarized on subtargets that don't have legal
45457// vxi1 types.
45458 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45459 const SDLoc &DL,
45460 const X86Subtarget &Subtarget) {
45461 EVT SrcVT = Src.getValueType();
45462 if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
45463 SrcVT.getScalarType() != MVT::i1)
45464 return SDValue();
45465
45466 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45467 // legalization destroys the v4i32 type.
45468 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45469 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45470 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45471 DAG.getBitcast(MVT::v4f32, V));
45472 return DAG.getZExtOrTrunc(V, DL, VT);
45473 }
45474 }
45475
45476 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
45477 // movmskb even with avx512. This will be better than truncating to vXi1 and
45478 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45479 // vpcmpeqb/vpcmpgtb.
45480 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45481 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45482 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45483 Src.getOperand(0).getValueType() == MVT::v64i8);
45484
45485 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45486 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45487 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45488 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45489 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45490 EVT CmpVT = Src.getOperand(0).getValueType();
45491 EVT EltVT = CmpVT.getVectorElementType();
45492 if (CmpVT.getSizeInBits() <= 256 &&
45493 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45494 PreferMovMsk = true;
45495 }
45496
45497 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45498 // MOVMSK is supported in SSE2 or later.
45499 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45500 return SDValue();
45501
45502 // If the upper ops of a concatenation are undef, then try to bitcast the
45503 // lower op and extend.
45504 SmallVector<SDValue, 4> SubSrcOps;
45505 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45506 SubSrcOps.size() >= 2) {
45507 SDValue LowerOp = SubSrcOps[0];
45508 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45509 if (LowerOp.getOpcode() == ISD::SETCC &&
45510 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45511 EVT SubVT = VT.getIntegerVT(
45512 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45513 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45514 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45515 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45516 }
45517 }
45518 }
45519
45520 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45521 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45522 // v8i16 and v16i16.
45523 // For these two cases, we can shuffle the upper element bytes to a
45524 // consecutive sequence at the start of the vector and treat the results as
45525 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45526 // for v16i16 this is not the case, because the shuffle is expensive, so we
45527 // avoid sign-extending to this type entirely.
45528 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45529 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45530 MVT SExtVT;
45531 bool PropagateSExt = false;
45532 switch (SrcVT.getSimpleVT().SimpleTy) {
45533 default:
45534 return SDValue();
45535 case MVT::v2i1:
45536 SExtVT = MVT::v2i64;
45537 break;
45538 case MVT::v4i1:
45539 SExtVT = MVT::v4i32;
45540 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45541 // sign-extend to a 256-bit operation to avoid truncation.
45542 if (Subtarget.hasAVX() &&
45543 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45544 SExtVT = MVT::v4i64;
45545 PropagateSExt = true;
45546 }
45547 break;
45548 case MVT::v8i1:
45549 SExtVT = MVT::v8i16;
45550 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45551 // sign-extend to a 256-bit operation to match the compare.
45552 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45553 // 256-bit because the shuffle is cheaper than sign extending the result of
45554 // the compare.
45555 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45556 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45557 SExtVT = MVT::v8i32;
45558 PropagateSExt = true;
45559 }
45560 break;
45561 case MVT::v16i1:
45562 SExtVT = MVT::v16i8;
45563 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45564 // it is not profitable to sign-extend to 256-bit because this will
45565 // require an extra cross-lane shuffle which is more expensive than
45566 // truncating the result of the compare to 128-bits.
45567 break;
45568 case MVT::v32i1:
45569 SExtVT = MVT::v32i8;
45570 break;
45571 case MVT::v64i1:
45572 // If we have AVX512F but not AVX512BW, the input must have been a truncate
45573 // from v64i8 (checked earlier); split the input and make two pmovmskbs.
45574 if (Subtarget.hasAVX512()) {
45575 if (Subtarget.hasBWI())
45576 return SDValue();
45577 SExtVT = MVT::v64i8;
45578 break;
45579 }
45580 // Split if this is a <64 x i8> comparison result.
45581 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45582 SExtVT = MVT::v64i8;
45583 break;
45584 }
45585 return SDValue();
45586 };
45587
45588 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45589 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45590
45591 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45592 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45593 } else {
45594 if (SExtVT == MVT::v8i16) {
45595 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45596 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45597 }
45598 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45599 }
45600
45601 EVT IntVT =
45602 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45603 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45604 return DAG.getBitcast(VT, V);
45605}
45606
45607// Convert a vXi1 constant build vector to the same width scalar integer.
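// Added example: (v4i1 <1,0,1,1>) becomes the i4 constant 0b1101, with any
// undef element treated as zero.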
45608 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45609 EVT SrcVT = Op.getValueType();
45610 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45611 "Expected a vXi1 vector");
45613 "Expected a constant build vector");
45614
45615 APInt Imm(SrcVT.getVectorNumElements(), 0);
45616 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45617 SDValue In = Op.getOperand(Idx);
45618 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45619 Imm.setBit(Idx);
45620 }
45621 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45622 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45623}
45624
45625 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45626 TargetLowering::DAGCombinerInfo &DCI,
45627 const X86Subtarget &Subtarget) {
45628 using namespace SDPatternMatch;
45629 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45630
45631 if (!DCI.isBeforeLegalizeOps())
45632 return SDValue();
45633
45634 // Only do this if we have k-registers.
45635 if (!Subtarget.hasAVX512())
45636 return SDValue();
45637
45638 EVT DstVT = N->getValueType(0);
45639 SDValue Op = N->getOperand(0);
45640 EVT SrcVT = Op.getValueType();
45641
45642 // Make sure we have a bitcast between mask registers and a scalar type.
45643 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45644 DstVT.isScalarInteger()) &&
45645 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45646 SrcVT.isScalarInteger()))
45647 return SDValue();
45648
45649 SDValue LHS, RHS;
45650
45651 // Look for logic ops.
45653 return SDValue();
45654
45655 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45656 // least one of the getBitcast() will fold away).
45657 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45658 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45659 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45660 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45661
45662 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45663 // Most of these have to move a constant from the scalar domain anyway.
45664 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45665 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45666 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45667 DAG.getBitcast(DstVT, LHS), RHS);
45668 }
45669
45670 return SDValue();
45671}
45672
45673 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45674 const X86Subtarget &Subtarget) {
45675 SDLoc DL(BV);
45676 unsigned NumElts = BV->getNumOperands();
45677 SDValue Splat = BV->getSplatValue();
45678
45679 // Build MMX element from integer GPR or SSE float values.
45680 auto CreateMMXElement = [&](SDValue V) {
45681 if (V.isUndef())
45682 return DAG.getUNDEF(MVT::x86mmx);
45683 if (V.getValueType().isFloatingPoint()) {
45684 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45685 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45686 V = DAG.getBitcast(MVT::v2i64, V);
45687 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45688 }
45689 V = DAG.getBitcast(MVT::i32, V);
45690 } else {
45691 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45692 }
45693 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45694 };
45695
45696 // Convert build vector ops to MMX data in the bottom elements.
45697 SmallVector<SDValue, 8> Ops;
45698
45699 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45700
45701 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45702 if (Splat) {
45703 if (Splat.isUndef())
45704 return DAG.getUNDEF(MVT::x86mmx);
45705
45706 Splat = CreateMMXElement(Splat);
45707
45708 if (Subtarget.hasSSE1()) {
45709 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45710 if (NumElts == 8)
45711 Splat = DAG.getNode(
45712 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45713 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45714 TLI.getPointerTy(DAG.getDataLayout())),
45715 Splat, Splat);
45716
45717 // Use PSHUFW to repeat 16-bit elements.
45718 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45719 return DAG.getNode(
45720 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45721 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45722 TLI.getPointerTy(DAG.getDataLayout())),
45723 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45724 }
45725 Ops.append(NumElts, Splat);
45726 } else {
45727 for (unsigned i = 0; i != NumElts; ++i)
45728 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45729 }
45730
45731 // Use tree of PUNPCKLs to build up general MMX vector.
45732 while (Ops.size() > 1) {
45733 unsigned NumOps = Ops.size();
45734 unsigned IntrinOp =
45735 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45736 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45737 : Intrinsic::x86_mmx_punpcklbw));
45738 SDValue Intrin = DAG.getTargetConstant(
45739 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45740 for (unsigned i = 0; i != NumOps; i += 2)
45741 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45742 Ops[i], Ops[i + 1]);
45743 Ops.resize(NumOps / 2);
45744 }
45745
45746 return Ops[0];
45747}
45748
45749// Recursive function that attempts to find if a bool vector node was originally
45750// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45751// integer. If so, replace the scalar ops with bool vector equivalents back down
45752// the chain.
45753 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45754 SelectionDAG &DAG,
45755 const X86Subtarget &Subtarget,
45756 unsigned Depth = 0) {
45757 if (Depth >= SelectionDAG::MaxRecursionDepth)
45758 return SDValue(); // Limit search depth.
45759
45760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45761 unsigned Opc = V.getOpcode();
45762 switch (Opc) {
45763 case ISD::BITCAST: {
45764 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45765 SDValue Src = V.getOperand(0);
45766 EVT SrcVT = Src.getValueType();
45767 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45768 return DAG.getBitcast(VT, Src);
45769 break;
45770 }
45771 case ISD::Constant: {
45772 auto *C = cast<ConstantSDNode>(V);
45773 if (C->isZero())
45774 return DAG.getConstant(0, DL, VT);
45775 if (C->isAllOnes())
45776 return DAG.getAllOnesConstant(DL, VT);
45777 break;
45778 }
45779 case ISD::TRUNCATE: {
45780 // If we find a suitable source, a truncated scalar becomes a subvector.
45781 SDValue Src = V.getOperand(0);
45782 EVT NewSrcVT =
45783 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45784 if (TLI.isTypeLegal(NewSrcVT))
45785 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45786 Subtarget, Depth + 1))
45787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45788 DAG.getVectorIdxConstant(0, DL));
45789 break;
45790 }
45791 case ISD::ANY_EXTEND:
45792 case ISD::ZERO_EXTEND: {
45793 // If we find a suitable source, an extended scalar becomes a subvector.
45794 SDValue Src = V.getOperand(0);
45795 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45796 Src.getScalarValueSizeInBits());
45797 if (TLI.isTypeLegal(NewSrcVT))
45798 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45799 Subtarget, Depth + 1))
45800 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45801 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45802 : DAG.getConstant(0, DL, VT),
45803 N0, DAG.getVectorIdxConstant(0, DL));
45804 break;
45805 }
45806 case ISD::OR:
45807 case ISD::XOR: {
45808 // If we find suitable sources, we can just move the op to the vector
45809 // domain.
45810 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45811 Subtarget, Depth + 1))
45812 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45813 Subtarget, Depth + 1))
45814 return DAG.getNode(Opc, DL, VT, N0, N1);
45815 break;
45816 }
45817 case ISD::SHL: {
45818 // If we find a suitable source, a SHL becomes a KSHIFTL.
45819 SDValue Src0 = V.getOperand(0);
45820 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45821 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45822 break;
45823
45824 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45825 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45826 Depth + 1))
45827 return DAG.getNode(
45828 X86ISD::KSHIFTL, DL, VT, N0,
45829 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45830 break;
45831 }
45832 }
45833
45834 // Does the inner bitcast already exist?
45835 if (Depth > 0)
45836 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45837 return SDValue(Alt, 0);
45838
45839 return SDValue();
45840}
45841
45842 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45843 TargetLowering::DAGCombinerInfo &DCI,
45844 const X86Subtarget &Subtarget) {
45845 SDValue N0 = N->getOperand(0);
45846 EVT VT = N->getValueType(0);
45847 EVT SrcVT = N0.getValueType();
45848 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45849
45850 // Try to match patterns such as
45851 // (i16 bitcast (v16i1 x))
45852 // ->
45853 // (i16 movmsk (v16i8 sext (v16i1 x)))
45854 // before the setcc result is scalarized on subtargets that don't have legal
45855 // vxi1 types.
45856 if (DCI.isBeforeLegalize()) {
45857 SDLoc dl(N);
45858 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45859 return V;
45860
45861 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45862 // type, widen both sides to avoid a trip through memory.
45863 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45864 Subtarget.hasAVX512()) {
45865 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45866 N0 = DAG.getBitcast(MVT::v8i1, N0);
45867 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45868 DAG.getVectorIdxConstant(0, dl));
45869 }
45870
45871 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45872 // type, widen both sides to avoid a trip through memory.
45873 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45874 Subtarget.hasAVX512()) {
45875 // Use zeros for the widening if we already have some zeroes. This can
45876 // allow SimplifyDemandedBits to remove scalar ANDs that may be
45877 // downstream of this.
45878 // FIXME: It might make sense to detect a concat_vectors with a mix of
45879 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45880 // a separate combine. What we can't do is canonicalize the operands of
45881 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45882 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45883 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45884 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45885 SrcVT = LastOp.getValueType();
45886 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45887 SmallVector<SDValue, 4> Ops(N0->ops());
45888 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45889 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45890 N0 = DAG.getBitcast(MVT::i8, N0);
45891 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45892 }
45893 }
45894
45895 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45896 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45897 Ops[0] = N0;
45898 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45899 N0 = DAG.getBitcast(MVT::i8, N0);
45900 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45901 }
45902 } else if (DCI.isAfterLegalizeDAG()) {
45903 // If we're bitcasting from iX to vXi1, see if the integer originally
45904 // began as a vXi1 and whether we can remove the bitcast entirely.
45905 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45906 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45907 if (SDValue V =
45908 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45909 return V;
45910 }
45911 }
45912
45913 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45914 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45915 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45916 // we can help with known bits propagation from the vXi1 domain to the
45917 // scalar domain.
45918 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45919 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45920 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45921 isNullConstant(N0.getOperand(1)))
45922 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45923 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45924
45925 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45926 // and the vbroadcast_load are both integer or both fp. In some cases this
45927 // will remove the bitcast entirely.
45928 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45929 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45930 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45931 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45932 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45933 // Don't swap i8/i16 since we don't have fp types of that size.
45934 if (MemSize >= 32) {
45935 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45936 : MVT::getIntegerVT(MemSize);
45937 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45938 : MVT::getIntegerVT(SrcVTSize);
45939 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45940
45941 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45942 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45943 SDValue ResNode =
45944 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45945 MemVT, BCast->getMemOperand());
45946 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45947 return DAG.getBitcast(VT, ResNode);
45948 }
45949 }
45950
45951 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45952 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45953 SDValue Src = peekThroughTruncates(N0);
45954 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45955 Src.getOperand(0).getValueSizeInBits() == 128 &&
45956 isNullConstant(Src.getOperand(1))) {
45957 SDLoc DL(N);
45958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45959 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45960 DAG.getVectorIdxConstant(0, DL));
45961 }
45962 }
45963
45964 // Since MMX types are special and don't usually play with other vector types,
45965 // it's better to handle them early to be sure we emit efficient code by
45966 // avoiding store-load conversions.
45967 if (VT == MVT::x86mmx) {
45968 // Detect MMX constant vectors.
45969 APInt UndefElts;
45970 SmallVector<APInt, 1> EltBits;
45971 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45972 /*AllowWholeUndefs*/ true,
45973 /*AllowPartialUndefs*/ true)) {
45974 SDLoc DL(N0);
45975 // Handle zero-extension of i32 with MOVD.
45976 if (EltBits[0].countl_zero() >= 32)
45977 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45978 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45979 // Else, bitcast to a double.
45980 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45981 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45982 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45983 }
45984
45985 // Detect bitcasts to x86mmx low word.
45986 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45987 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45988 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45989 bool LowUndef = true, AllUndefOrZero = true;
45990 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45991 SDValue Op = N0.getOperand(i);
45992 LowUndef &= Op.isUndef() || (i >= e/2);
45993 AllUndefOrZero &= isNullConstantOrUndef(Op);
45994 }
45995 if (AllUndefOrZero) {
45996 SDValue N00 = N0.getOperand(0);
45997 SDLoc dl(N00);
45998 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45999 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
46000 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
46001 }
46002 }
46003
46004 // Detect bitcasts of 64-bit build vectors and convert to a
46005 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
46006 // lowest element.
46007 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
46008 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
46009 SrcVT == MVT::v8i8))
46010 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
46011
46012 // Detect bitcasts between element or subvector extraction to x86mmx.
46013 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
46014 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46015 isNullConstant(N0.getOperand(1))) {
46016 SDValue N00 = N0.getOperand(0);
46017 if (N00.getValueType().is128BitVector())
46018 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46019 DAG.getBitcast(MVT::v2i64, N00));
46020 }
46021
46022 // Detect bitcasts from FP_TO_SINT to x86mmx.
46023 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46024 SDLoc DL(N0);
46025 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46026 DAG.getUNDEF(MVT::v2i32));
46027 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46028 DAG.getBitcast(MVT::v2i64, Res));
46029 }
46030 }
46031
46032 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46033 // most of these to scalar anyway.
46034 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46035 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46036 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46037 return combinevXi1ConstantToInteger(N0, DAG);
46038 }
46039
46040 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46041 VT.getVectorElementType() == MVT::i1) {
46042 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46043 if (C->isAllOnes())
46044 return DAG.getConstant(1, SDLoc(N0), VT);
46045 if (C->isZero())
46046 return DAG.getConstant(0, SDLoc(N0), VT);
46047 }
46048 }
46049
46050 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46051 // Turn it into a sign bit compare that produces a k-register. This avoids
46052 // a trip through a GPR.
46053 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46054 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46055 isPowerOf2_32(VT.getVectorNumElements())) {
46056 unsigned NumElts = VT.getVectorNumElements();
46057 SDValue Src = N0;
46058
46059 // Peek through truncate.
46060 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46061 Src = N0.getOperand(0);
46062
46063 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46064 SDValue MovmskIn = Src.getOperand(0);
46065 MVT MovmskVT = MovmskIn.getSimpleValueType();
46066 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46067
46068 // We allow extra bits of the movmsk to be used since they are known zero.
46069 // We can't convert a VPMOVMSKB without avx512bw.
46070 if (MovMskElts <= NumElts &&
46071 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46072 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46073 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46074 SDLoc dl(N);
46075 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46076 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46077 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46078 if (EVT(CmpVT) == VT)
46079 return Cmp;
46080
46081 // Pad with zeroes up to original VT to replace the zeroes that were
46082 // being used from the MOVMSK.
46083 unsigned NumConcats = NumElts / MovMskElts;
46084 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46085 Ops[0] = Cmp;
46086 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46087 }
46088 }
46089 }
46090
46091 // Try to remove bitcasts from input and output of mask arithmetic to
46092 // remove GPR<->K-register crossings.
46093 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46094 return V;
46095
46096 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46097 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46098 SrcVT.getVectorNumElements() == 1)
46099 return N0.getOperand(1);
46100
46101 // Convert a bitcasted integer logic operation that has one bitcasted
46102 // floating-point operand into a floating-point logic operation. This may
46103 // create a load of a constant, but that is cheaper than materializing the
46104 // constant in an integer register and transferring it to an SSE register or
46105 // transferring the SSE operand to integer register and back.
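// Added illustration of the fold performed below, e.g.:
// (f32 bitcast (i32 and (i32 bitcast (f32 X)), Y))
// --> (f32 FAND X, (f32 bitcast Y))
// so the value never has to leave the SSE domain.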
46106 unsigned FPOpcode;
46107 switch (N0.getOpcode()) {
46108 // clang-format off
46109 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46110 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46111 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46112 default: return SDValue();
46113 // clang-format on
46114 }
46115
46116 // Check if we have a bitcast from another integer type as well.
46117 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46118 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46119 (Subtarget.hasFP16() && VT == MVT::f16) ||
46120 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46121 TLI.isTypeLegal(VT))))
46122 return SDValue();
46123
46124 SDValue LogicOp0 = N0.getOperand(0);
46125 SDValue LogicOp1 = N0.getOperand(1);
46126 SDLoc DL0(N0);
46127
46128 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46129 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46130 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46131 LogicOp0.getOperand(0).getValueType() == VT &&
46132 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46133 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46134 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46135 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46136 }
46137 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46138 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46139 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46140 LogicOp1.getOperand(0).getValueType() == VT &&
46141 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46142 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46143 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46144 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46145 }
46146
46147 return SDValue();
46148}
46149
46150 // (mul (zext a), (sext b))
46151static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46152 SDValue &Op1) {
46153 Op0 = Mul.getOperand(0);
46154 Op1 = Mul.getOperand(1);
46155
46156 // Operand 1 should be the sign-extended value.
46157 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46158 std::swap(Op0, Op1);
46159
46160 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46161 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46162 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46163 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46164 return true;
46165
46166 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46167 return (BV && BV->isConstant());
46168 };
46169
46170 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
46171 // so we check that Op0 is a zero-extended value. Op1 must be a signed value,
46172 // so we just check its sign bits.
46173 if ((IsFreeTruncation(Op0) &&
46174 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46175 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46176 return true;
46177
46178 return false;
46179}
46180
46182 unsigned &LogBias, const SDLoc &DL,
46183 const X86Subtarget &Subtarget) {
46184 // Extend or truncate to MVT::i8 first.
46185 MVT Vi8VT =
46186 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46187 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46188 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46189
46190 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46191 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46192 // The src A, B element type is i8, but the dst C element type is i32.
46193 // When we calculate the reduction stages we use the src vector type vXi8,
46194 // so we need a log-bias of 2 to avoid two extra reduction stages.
46195 LogBias = 2;
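// Added note: one VPDPBUSD already folds a 4:1 (4 x i8 products -> 1 x i32)
// reduction into a single instruction, i.e. log2(4) == 2 tree stages, which
// is what LogBias reports so the caller can skip those stages.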
46196
46197 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46198 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46199 RegSize = std::max(512u, RegSize);
46200
46201 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46202 // fill in the missing vector elements with 0.
46203 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46204 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46205 Ops[0] = LHS;
46206 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46207 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46208 Ops[0] = RHS;
46209 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46210
46211 // Actually build the DotProduct, split as 256/512 bits for
46212 // AVXVNNI/AVX512VNNI.
46213 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46214 ArrayRef<SDValue> Ops) {
46215 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46216 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46217 };
46218 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46219 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46220
46221 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46222 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46223}
46224
46225// Create a PSADBW given two sources representable as zexts of vXi8.
46226 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &N0, const SDValue &N1,
46227 const SDLoc &DL, const X86Subtarget &Subtarget) {
46228 // Find the appropriate width for the PSADBW.
46229 EVT DstVT = N0.getValueType();
46230 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46231 DstVT.getVectorElementCount());
46232 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46233
46234 // Widen the vXi8 vectors, padding with zero vector elements.
46235 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46236 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46237 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46238 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46239 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46240 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46241 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46242
46243 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46244 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46245 ArrayRef<SDValue> Ops) {
46246 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46247 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46248 };
46249 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46250 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46251 PSADBWBuilder);
46252}
46253
46254 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46255// PHMINPOSUW.
46256 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46257 const X86Subtarget &Subtarget) {
46258 // Bail without SSE41.
46259 if (!Subtarget.hasSSE41())
46260 return SDValue();
46261
46262 EVT ExtractVT = Extract->getValueType(0);
46263 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46264 return SDValue();
46265
46266 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46267 ISD::NodeType BinOp;
46268 SDValue Src = DAG.matchBinOpReduction(
46269 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46270 if (!Src)
46271 return SDValue();
46272
46273 EVT SrcVT = Src.getValueType();
46274 EVT SrcSVT = SrcVT.getScalarType();
46275 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46276 return SDValue();
46277
46278 SDLoc DL(Extract);
46279 SDValue MinPos = Src;
46280
46281 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46282 while (SrcVT.getSizeInBits() > 128) {
46283 SDValue Lo, Hi;
46284 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46285 SrcVT = Lo.getValueType();
46286 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46287 }
46288 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46289 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46290 "Unexpected value type");
46291
46292 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46293 // to flip the value accordingly.
46294 SDValue Mask;
46295 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46296 if (BinOp == ISD::SMAX)
46297 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46298 else if (BinOp == ISD::SMIN)
46299 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46300 else if (BinOp == ISD::UMAX)
46301 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46302
46303 if (Mask)
46304 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46305
46306 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46307 // shuffling each upper element down and inserting zeros. This means that the
46308 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46309 // ready for the PHMINPOS.
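// Added sketch: lanes <a0,a1,...,a15> are UMIN'd with <a1,0,a3,0,...>, so
// every even result byte holds min(a2i, a2i+1) and every odd byte is zero,
// which is exactly a zero-extended v8i16 input for PHMINPOSUW.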
46310 if (ExtractVT == MVT::i8) {
46311 SDValue Upper = DAG.getVectorShuffle(
46312 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46313 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46314 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46315 }
46316
46317 // Perform the PHMINPOS on a v8i16 vector,
46318 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46319 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46320 MinPos = DAG.getBitcast(SrcVT, MinPos);
46321
46322 if (Mask)
46323 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46324
46325 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46326 DAG.getVectorIdxConstant(0, DL));
46327}
46328
46329// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46330 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46331                                          const X86Subtarget &Subtarget) {
46332 // Bail without SSE2.
46333 if (!Subtarget.hasSSE2())
46334 return SDValue();
46335
46336 EVT ExtractVT = Extract->getValueType(0);
46337 unsigned BitWidth = ExtractVT.getSizeInBits();
46338 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46339 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46340 return SDValue();
46341
46342 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46343 ISD::NodeType BinOp;
46344 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46345 if (!Match && ExtractVT == MVT::i1)
46346 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46347 if (!Match)
46348 return SDValue();
46349
46350 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46351 // which we can't support here for now.
46352 if (Match.getScalarValueSizeInBits() != BitWidth)
46353 return SDValue();
46354
46355 SDValue Movmsk;
46356 SDLoc DL(Extract);
46357 EVT MatchVT = Match.getValueType();
46358 unsigned NumElts = MatchVT.getVectorNumElements();
46359 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46361 LLVMContext &Ctx = *DAG.getContext();
46362
46363 if (ExtractVT == MVT::i1) {
46364 // Special case for (pre-legalization) vXi1 reductions.
46365 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46366 return SDValue();
46367 if (Match.getOpcode() == ISD::SETCC) {
46368 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46369 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46370 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46371 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46372 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46373 X86::CondCode X86CC;
46374 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46375 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46376 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46377 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46378 DAG, X86CC))
46379 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46380 getSETCC(X86CC, V, DL, DAG));
46381 }
46382 }
46383 if (TLI.isTypeLegal(MatchVT)) {
46384 // If this is a legal AVX512 predicate type then we can just bitcast.
46385 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46386 Movmsk = DAG.getBitcast(MovmskVT, Match);
46387 } else {
46388 // Use combineBitcastvxi1 to create the MOVMSK.
46389 while (NumElts > MaxElts) {
46390 SDValue Lo, Hi;
46391 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46392 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46393 NumElts /= 2;
46394 }
46395 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46396 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46397 }
46398 if (!Movmsk)
46399 return SDValue();
46400 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46401 } else {
46402 // FIXME: Better handling of k-registers or 512-bit vectors?
46403 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46404 if (!(MatchSizeInBits == 128 ||
46405 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46406 return SDValue();
46407
46408 // Make sure this isn't a vector of 1 element. The perf win from using
46409     // MOVMSK diminishes with fewer elements in the reduction, but it is
46410 // generally better to get the comparison over to the GPRs as soon as
46411 // possible to reduce the number of vector ops.
46412 if (Match.getValueType().getVectorNumElements() < 2)
46413 return SDValue();
46414
46415 // Check that we are extracting a reduction of all sign bits.
46416 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46417 return SDValue();
46418
46419 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46420 SDValue Lo, Hi;
46421 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46422 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46423 MatchSizeInBits = Match.getValueSizeInBits();
46424 }
46425
46426 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46427 MVT MaskSrcVT;
46428 if (64 == BitWidth || 32 == BitWidth)
46429       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46430                                    MatchSizeInBits / BitWidth);
46431 else
46432 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46433
46434 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46435 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46436 NumElts = MaskSrcVT.getVectorNumElements();
46437 }
46438 assert((NumElts <= 32 || NumElts == 64) &&
46439 "Not expecting more than 64 elements");
46440
46441 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46442 if (BinOp == ISD::XOR) {
46443 // parity -> (PARITY(MOVMSK X))
46444 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46445 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46446 }
46447
46448 SDValue CmpC;
46449 ISD::CondCode CondCode;
46450 if (BinOp == ISD::OR) {
46451 // any_of -> MOVMSK != 0
46452 CmpC = DAG.getConstant(0, DL, CmpVT);
46453 CondCode = ISD::CondCode::SETNE;
46454 } else {
46455 // all_of -> MOVMSK == ((1 << NumElts) - 1)
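    // E.g. a reduction of v4i32 sign-bit compares uses MOVMSKPS, so all_of
    // checks the 4-bit mask against 0xF and any_of checks it against != 0.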
46456 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46457 DL, CmpVT);
46458 CondCode = ISD::CondCode::SETEQ;
46459 }
46460
46461 // The setcc produces an i8 of 0/1, so extend that to the result width and
46462 // negate to get the final 0/-1 mask value.
46463 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46464 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46465 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46466 return DAG.getNegative(Zext, DL, ExtractVT);
46467}
46468
46469 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46470                                       const X86Subtarget &Subtarget) {
46471 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46472 return SDValue();
46473
46474 EVT ExtractVT = Extract->getValueType(0);
46475 // Verify the type we're extracting is i32, as the output element type of
46476 // vpdpbusd is i32.
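  // (vpdpbusd multiplies unsigned bytes of one source with the corresponding
  // signed bytes of the other and accumulates each group of four products into
  // an i32 element.)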
46477 if (ExtractVT != MVT::i32)
46478 return SDValue();
46479
46480 EVT VT = Extract->getOperand(0).getValueType();
46481   if (!isPowerOf2_32(VT.getVectorNumElements()))
46482     return SDValue();
46483
46484 // Match shuffle + add pyramid.
46485 ISD::NodeType BinOp;
46486 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46487
46488 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46489 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46490 // before adding into the accumulator.
46491 // TODO:
46492 // We also need to verify that the multiply has at least 2x the number of bits
46493 // of the input. We shouldn't match
46494 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46495 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46496 // Root = Root.getOperand(0);
46497
46498 // If there was a match, we want Root to be a mul.
46499 if (!Root || Root.getOpcode() != ISD::MUL)
46500 return SDValue();
46501
46502 // Check whether we have an extend and mul pattern
46503 SDValue LHS, RHS;
46504 if (!detectExtMul(DAG, Root, LHS, RHS))
46505 return SDValue();
46506
46507 // Create the dot product instruction.
46508 SDLoc DL(Extract);
46509 unsigned StageBias;
46510 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46511
46512 // If the original vector was wider than 4 elements, sum over the results
46513 // in the DP vector.
46514 unsigned Stages = Log2_32(VT.getVectorNumElements());
46515 EVT DpVT = DP.getValueType();
46516
46517 if (Stages > StageBias) {
46518 unsigned DpElems = DpVT.getVectorNumElements();
46519
46520 for (unsigned i = Stages - StageBias; i > 0; --i) {
46521 SmallVector<int, 16> Mask(DpElems, -1);
46522 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46523 Mask[j] = MaskEnd + j;
46524
46525 SDValue Shuffle =
46526 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46527 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46528 }
46529 }
46530
46531 // Return the lowest ExtractSizeInBits bits.
46532 EVT ResVT =
46533 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46534 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46535 DP = DAG.getBitcast(ResVT, DP);
46536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46537 Extract->getOperand(1));
46538}
46539
46540 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46541                                       const X86Subtarget &Subtarget) {
46542 using namespace SDPatternMatch;
46543
46544 // PSADBW is only supported on SSE2 and up.
46545 if (!Subtarget.hasSSE2())
46546 return SDValue();
46547
46548 EVT ExtractVT = Extract->getValueType(0);
46549 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46550 ExtractVT != MVT::i64)
46551 return SDValue();
46552
46553 EVT VT = Extract->getOperand(0).getValueType();
46554   if (!isPowerOf2_32(VT.getVectorNumElements()))
46555     return SDValue();
46556
46557 // Match shuffle + add pyramid.
46558 ISD::NodeType BinOp;
46559 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46560 if (!Root)
46561 return SDValue();
46562
46563 // The operand is expected to be zero extended from i8.
46564   // In order to convert to i64 and above, an additional any/zero/sign
46565   // extend is expected.
46566   // The zero extend from 32 bits has no mathematical effect on the result.
46567   // The sign extend is also effectively a zero extend
46568   // (it extends the sign bit, which is zero).
46569   // So it is correct to skip the sign/zero extend instruction.
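  // E.g. for (vXi32 sign_extend (vXi16 zero_extend (vXi8 ...))) every element
  // is at most 255, so its sign bit is zero and the sign_extend acts exactly
  // like a zero_extend.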
46570 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46571 Root.getOpcode() == ISD::ZERO_EXTEND ||
46572 Root.getOpcode() == ISD::ANY_EXTEND)
46573 Root = Root.getOperand(0);
46574
46575   // Check whether we have a vXi8 abdu pattern.
46576 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46577 SDValue Src0, Src1;
46578 if (!sd_match(
46579 Root,
46580 m_AnyOf(
46581               m_SpecificVectorElementVT(
46582                   MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46583               m_SpecificVectorElementVT(
46584                   MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46585                                  m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46586               m_Abs(
46587                   m_Sub(m_AllOf(m_Value(Src0),
46588                                 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46589                         m_AllOf(m_Value(Src1),
46590 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46591 return SDValue();
46592
46593 // Create the SAD instruction.
46594 SDLoc DL(Extract);
46595 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46596
46597 // If the original vector was wider than 8 elements, sum over the results
46598 // in the SAD vector.
46599 unsigned Stages = Log2_32(VT.getVectorNumElements());
46600 EVT SadVT = SAD.getValueType();
46601 if (Stages > 3) {
46602 unsigned SadElems = SadVT.getVectorNumElements();
46603
46604 for(unsigned i = Stages - 3; i > 0; --i) {
46605 SmallVector<int, 16> Mask(SadElems, -1);
46606 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46607 Mask[j] = MaskEnd + j;
46608
46609 SDValue Shuffle =
46610 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46611 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46612 }
46613 }
46614
46615 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46616 // Return the lowest ExtractSizeInBits bits.
46617 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46618 SadVT.getSizeInBits() / ExtractSizeInBits);
46619 SAD = DAG.getBitcast(ResVT, SAD);
46620 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46621 Extract->getOperand(1));
46622}
46623
46624// If this extract is from a loaded vector value and will be used as an
46625// integer, that requires a potentially expensive XMM -> GPR transfer.
46626// Additionally, if we can convert to a scalar integer load, that will likely
46627// be folded into a subsequent integer op.
46628// Note: SrcVec might not have a VecVT type, but it must be the same size.
46629// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46630// to a single-use of the loaded vector. For the reasons above, we
46631// expect this to be profitable even if it creates an extra load.
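// For example, (i32 (extract_vector_elt (v4i32 (load %p)), 2)) can become a
// plain i32 load from %p + 8 under the conditions checked below.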
46632static SDValue
46633 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46634                              const SDLoc &dl, SelectionDAG &DAG,
46635                              TargetLowering::DAGCombinerInfo &DCI) {
46636   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46637 "Only EXTRACT_VECTOR_ELT supported so far");
46638
46639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46640 EVT VT = N->getValueType(0);
46641
46642 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46643 return Use->getOpcode() == ISD::STORE ||
46644 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46645 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46646 });
46647
46648 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46649 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46650 VecVT.getVectorElementType() == VT &&
46651 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46652 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46653 SDValue NewPtr = TLI.getVectorElementPointer(
46654 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46655 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46656 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46657 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46658 SDValue Load =
46659 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46660 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46661 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46662 return Load;
46663 }
46664
46665 return SDValue();
46666}
46667
46668// Attempt to peek through a target shuffle and extract the scalar from the
46669// source.
46670 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46671                                          TargetLowering::DAGCombinerInfo &DCI,
46672                                          const X86Subtarget &Subtarget) {
46673 if (DCI.isBeforeLegalizeOps())
46674 return SDValue();
46675
46676 SDLoc dl(N);
46677 SDValue Src = N->getOperand(0);
46678 SDValue Idx = N->getOperand(1);
46679
46680 EVT VT = N->getValueType(0);
46681 EVT SrcVT = Src.getValueType();
46682 EVT SrcSVT = SrcVT.getVectorElementType();
46683 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46684 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46685
46686 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46687 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46688 return SDValue();
46689
46690 const APInt &IdxC = N->getConstantOperandAPInt(1);
46691 if (IdxC.uge(NumSrcElts))
46692 return SDValue();
46693
46694 SDValue SrcBC = peekThroughBitcasts(Src);
46695
46696 // Handle extract(bitcast(broadcast(scalar_value))).
46697 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46698 SDValue SrcOp = SrcBC.getOperand(0);
46699 EVT SrcOpVT = SrcOp.getValueType();
46700 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46701 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46702 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46703 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46704 // TODO support non-zero offsets.
46705 if (Offset == 0) {
46706 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46707 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46708 return SrcOp;
46709 }
46710 }
46711 }
46712
46713 // If we're extracting a single element from a broadcast load and there are
46714 // no other users, just create a single load.
46715   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46716       SrcBC.hasOneUse()) {
46717 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46718 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46719 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46720 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46721 SDValue Load =
46722 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46723 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46724 MemIntr->getMemOperand()->getFlags());
46725 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46726 return Load;
46727 }
46728 }
46729
46730 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46731 // TODO: Move to DAGCombine?
46732 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46733 SrcBC.getValueType().isInteger() &&
46734 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46735 SrcBC.getScalarValueSizeInBits() ==
46736 SrcBC.getOperand(0).getValueSizeInBits()) {
46737 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46738 if (IdxC.ult(Scale)) {
46739 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46740 SDValue Scl = SrcBC.getOperand(0);
46741 EVT SclVT = Scl.getValueType();
46742 if (Offset) {
46743 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46744 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46745 }
46746 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46747 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46748 return Scl;
46749 }
46750 }
46751
46752 // Handle extract(truncate(x)) for 0'th index.
46753 // TODO: Treat this as a faux shuffle?
46754 // TODO: When can we use this for general indices?
46755 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46756 (SrcVT.getSizeInBits() % 128) == 0) {
46757 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46758 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46759 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46760 Idx);
46761 }
46762
46763 // We can only legally extract other elements from 128-bit vectors and in
46764 // certain circumstances, depending on SSE-level.
46765 // TODO: Investigate float/double extraction if it will be just stored.
46766 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46767 unsigned Idx) {
46768 EVT VecSVT = VecVT.getScalarType();
46769 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46770 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46771 VecSVT == MVT::i64)) {
46772 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46773 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46774 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46775 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46776 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46777 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46778 Idx &= (NumEltsPerLane - 1);
46779 }
46780 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46781 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46783 DAG.getBitcast(VecVT, Vec),
46784 DAG.getVectorIdxConstant(Idx, dl));
46785 }
46786 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46787 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46788 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46789 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46790 DAG.getTargetConstant(Idx, dl, MVT::i8));
46791 }
46792 return SDValue();
46793 };
46794
46795 // Resolve the target shuffle inputs and mask.
46796   SmallVector<int, 16> Mask;
46797   SmallVector<SDValue, 2> Ops;
46798   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46799 return SDValue();
46800
46801 // Shuffle inputs must be the same size as the result.
46802 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46803 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46804 }))
46805 return SDValue();
46806
46807 // Attempt to narrow/widen the shuffle mask to the correct size.
46808 if (Mask.size() != NumSrcElts) {
46809 if ((NumSrcElts % Mask.size()) == 0) {
46810 SmallVector<int, 16> ScaledMask;
46811 int Scale = NumSrcElts / Mask.size();
46812 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46813 Mask = std::move(ScaledMask);
46814 } else if ((Mask.size() % NumSrcElts) == 0) {
46815 // Simplify Mask based on demanded element.
46816 int ExtractIdx = (int)IdxC.getZExtValue();
46817 int Scale = Mask.size() / NumSrcElts;
46818 int Lo = Scale * ExtractIdx;
46819 int Hi = Scale * (ExtractIdx + 1);
46820 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46821 if (i < Lo || Hi <= i)
46822 Mask[i] = SM_SentinelUndef;
46823
46824 SmallVector<int, 16> WidenedMask;
46825 while (Mask.size() > NumSrcElts &&
46826 canWidenShuffleElements(Mask, WidenedMask))
46827 Mask = std::move(WidenedMask);
46828 }
46829 }
46830
46831 // If narrowing/widening failed, see if we can extract+zero-extend.
46832 int ExtractIdx;
46833 EVT ExtractVT;
46834 if (Mask.size() == NumSrcElts) {
46835 ExtractIdx = Mask[IdxC.getZExtValue()];
46836 ExtractVT = SrcVT;
46837 } else {
46838 unsigned Scale = Mask.size() / NumSrcElts;
46839 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46840 return SDValue();
46841 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46842 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46843 return SDValue();
46844 ExtractIdx = Mask[ScaledIdx];
46845 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46846 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46847 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46848 "Failed to widen vector type");
46849 }
46850
46851 // If the shuffle source element is undef/zero then we can just accept it.
46852 if (ExtractIdx == SM_SentinelUndef)
46853 return DAG.getUNDEF(VT);
46854
46855 if (ExtractIdx == SM_SentinelZero)
46856 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46857 : DAG.getConstant(0, dl, VT);
46858
46859 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46860 ExtractIdx = ExtractIdx % Mask.size();
46861 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46862 return DAG.getZExtOrTrunc(V, dl, VT);
46863
46864 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46865     if (SDValue V = combineExtractFromVectorLoad(
46866             N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46867 return V;
46868
46869 return SDValue();
46870}
46871
46872/// Extracting a scalar FP value from vector element 0 is free, so extract each
46873/// operand first, then perform the math as a scalar op.
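/// (Element 0 of an XMM register already holds the scalar value, so e.g.
/// extract (fadd X, Y), 0 becomes fadd (extract X, 0), (extract Y, 0) with no
/// extra shuffle or cross-register move.)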
46874 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46875                                  const X86Subtarget &Subtarget,
46876                                  TargetLowering::DAGCombinerInfo &DCI) {
46877   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46878 SDValue Vec = ExtElt->getOperand(0);
46879 SDValue Index = ExtElt->getOperand(1);
46880 EVT VT = ExtElt->getValueType(0);
46881 EVT VecVT = Vec.getValueType();
46882
46883 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46884 // non-zero element because the shuffle+scalar op will be cheaper?
46885 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46886 return SDValue();
46887
46888 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46889 // extract, the condition code), so deal with those as a special-case.
46890 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46891 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46892 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46893 return SDValue();
46894
46895 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46896 SDLoc DL(ExtElt);
46897 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46898 Vec.getOperand(0), Index);
46899 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46900 Vec.getOperand(1), Index);
46901 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46902 }
46903
46904 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46905 VT != MVT::f64)
46906 return SDValue();
46907
46908 // Vector FP selects don't fit the pattern of FP math ops (because the
46909 // condition has a different type and we have to change the opcode), so deal
46910 // with those here.
46911 // FIXME: This is restricted to pre type legalization. If we loosen this we
46912 // need to convert vector bool to a scalar bool.
46913 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46914 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46915 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46916 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46917 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46918 SDLoc DL(ExtElt);
46919     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46920                                Vec.getOperand(0).getValueType().getScalarType(),
46921                                Vec.getOperand(0), Index);
46922 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46923 Vec.getOperand(1), Index);
46924 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46925 Vec.getOperand(2), Index);
46926 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46927 }
46928
46929 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46930 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46931 // missed load folding and fma+fneg combining.
46932 switch (Vec.getOpcode()) {
46933 case ISD::FMA: // Begin 3 operands
46934 case ISD::FMAD:
46935 case ISD::FADD: // Begin 2 operands
46936 case ISD::FSUB:
46937 case ISD::FMUL:
46938 case ISD::FDIV:
46939 case ISD::FREM:
46940 case ISD::FCOPYSIGN:
46941 case ISD::FMINNUM:
46942 case ISD::FMAXNUM:
46943 case ISD::FMINNUM_IEEE:
46944 case ISD::FMAXNUM_IEEE:
46945 case ISD::FMAXIMUM:
46946 case ISD::FMINIMUM:
46947 case ISD::FMAXIMUMNUM:
46948 case ISD::FMINIMUMNUM:
46949 case X86ISD::FMAX:
46950 case X86ISD::FMIN:
46951 case ISD::FABS: // Begin 1 operand
46952 case ISD::FSQRT:
46953 case ISD::FRINT:
46954 case ISD::FCEIL:
46955 case ISD::FTRUNC:
46956 case ISD::FNEARBYINT:
46957 case ISD::FROUNDEVEN:
46958 case ISD::FROUND:
46959 case ISD::FFLOOR:
46960 case X86ISD::FRCP:
46961 case X86ISD::FRSQRT: {
46962 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46963 SDLoc DL(ExtElt);
46964     SmallVector<SDValue, 4> ExtOps;
46965     for (SDValue Op : Vec->ops())
46966 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46967 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46968 }
46969 default:
46970 return SDValue();
46971 }
46972 llvm_unreachable("All opcodes should return within switch");
46973}
46974
46975/// Try to convert a vector reduction sequence composed of binops and shuffles
46976/// into horizontal ops.
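/// For example, a v4f32 fadd reduction can become log2(4) = 2 HADDPS steps
/// followed by an extract of element 0, when horizontal ops are profitable.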
46977 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46978                                      const X86Subtarget &Subtarget) {
46979 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46980
46981   // We need at least SSE2 to do anything here.
46982 if (!Subtarget.hasSSE2())
46983 return SDValue();
46984
46985   ISD::NodeType Opc;
46986   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46987 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46988 if (!Rdx)
46989 return SDValue();
46990
46991 SDValue Index = ExtElt->getOperand(1);
46992 assert(isNullConstant(Index) &&
46993 "Reduction doesn't end in an extract from index 0");
46994
46995 EVT VT = ExtElt->getValueType(0);
46996 EVT VecVT = Rdx.getValueType();
46997 if (VecVT.getScalarType() != VT)
46998 return SDValue();
46999
47000 SDLoc DL(ExtElt);
47001 unsigned NumElts = VecVT.getVectorNumElements();
47002 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
47003
47004 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
47005 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
47006 if (V.getValueType() == MVT::v4i8) {
47007 if (ZeroExtend && Subtarget.hasSSE41()) {
47008 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
47009 DAG.getConstant(0, DL, MVT::v4i32),
47010 DAG.getBitcast(MVT::i32, V),
47011 DAG.getVectorIdxConstant(0, DL));
47012 return DAG.getBitcast(MVT::v16i8, V);
47013 }
47014 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47015 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47016 : DAG.getUNDEF(MVT::v4i8));
47017 }
47018 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47019 DAG.getUNDEF(MVT::v8i8));
47020 };
47021
47022 // vXi8 mul reduction - promote to vXi16 mul reduction.
47023 if (Opc == ISD::MUL) {
47024 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47025 return SDValue();
47026 if (VecVT.getSizeInBits() >= 128) {
47027 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47028 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47029 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47030 Lo = DAG.getBitcast(WideVT, Lo);
47031 Hi = DAG.getBitcast(WideVT, Hi);
47032 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47033 while (Rdx.getValueSizeInBits() > 128) {
47034 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47035 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47036 }
47037 } else {
47038 Rdx = WidenToV16I8(Rdx, false);
47039 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47040 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47041 }
47042 if (NumElts >= 8)
47043 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47044 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47045 {4, 5, 6, 7, -1, -1, -1, -1}));
47046 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47047 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47048 {2, 3, -1, -1, -1, -1, -1, -1}));
47049 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47050 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47051 {1, -1, -1, -1, -1, -1, -1, -1}));
47052 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47054 }
47055
47056 // vXi8 add reduction - sub 128-bit vector.
47057 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47058 Rdx = WidenToV16I8(Rdx, true);
47059 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47060 DAG.getConstant(0, DL, MVT::v16i8));
47061 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47062 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47063 }
47064
47065 // Must be a >=128-bit vector with pow2 elements.
47066 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47067 return SDValue();
47068
47069 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47070 if (VT == MVT::i8) {
47071 while (Rdx.getValueSizeInBits() > 128) {
47072 SDValue Lo, Hi;
47073 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47074 VecVT = Lo.getValueType();
47075 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47076 }
47077 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47078
47079     SDValue Hi = DAG.getVectorShuffle(
47080         MVT::v16i8, DL, Rdx, Rdx,
47081 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47082 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47083 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47084 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47085 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47086 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47087 }
47088
47089 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47090 // If the source vector values are 0-255, then we can use PSADBW to
47091 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47092   // TODO: See if it's worth avoiding vXi16/i32 truncations?
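  // For example, a v8i16 add reduction whose elements are known to be 0-255 is
  // packed to v16i8 (PACKUS) and summed with a single PSADBW against zero.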
47093 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47094 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47095 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47096 Subtarget.hasAVX512())) {
47097 if (Rdx.getValueType() == MVT::v8i16) {
47098 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47099 DAG.getUNDEF(MVT::v8i16));
47100 } else {
47101 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47102 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47103 if (ByteVT.getSizeInBits() < 128)
47104 Rdx = WidenToV16I8(Rdx, true);
47105 }
47106
47107 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47108 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47109                             ArrayRef<SDValue> Ops) {
47110       MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47111 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47112 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47113 };
47114 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47115 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47116
47117 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47118 while (Rdx.getValueSizeInBits() > 128) {
47119 SDValue Lo, Hi;
47120 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47121 VecVT = Lo.getValueType();
47122 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47123 }
47124 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47125
47126 if (NumElts > 8) {
47127 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47128 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47129 }
47130
47131 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47132 Rdx = DAG.getBitcast(VecVT, Rdx);
47133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47134 }
47135
47136 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
47137 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47138 return SDValue();
47139
47140 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47141
47142 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47143 // across the whole vector, so we need an extract + hop preliminary stage.
47144 // This is the only step where the operands of the hop are not the same value.
47145 // TODO: We could extend this to handle 512-bit or even longer vectors.
47146 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47147 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47148 unsigned NumElts = VecVT.getVectorNumElements();
47149 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47150 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47151 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47152 VecVT = Rdx.getValueType();
47153 }
47154 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47155 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47156 return SDValue();
47157
47158 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47159 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47160 for (unsigned i = 0; i != ReductionSteps; ++i)
47161 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47162
47163 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47164}
47165
47166/// Detect vector gather/scatter index generation and convert it from being a
47167/// bunch of shuffles and extracts into a somewhat faster sequence.
47168/// For i686, the best sequence is apparently storing the value and loading
47169/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47170 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47171                                        TargetLowering::DAGCombinerInfo &DCI,
47172                                        const X86Subtarget &Subtarget) {
47173 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47174 return NewOp;
47175
47176 SDValue InputVector = N->getOperand(0);
47177 SDValue EltIdx = N->getOperand(1);
47178 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47179
47180 EVT SrcVT = InputVector.getValueType();
47181 EVT VT = N->getValueType(0);
47182 SDLoc dl(InputVector);
47183 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47184 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47185 unsigned NumEltBits = VT.getScalarSizeInBits();
47186 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47187
47188 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47189 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47190
47191 // Integer Constant Folding.
47192 if (CIdx && VT.isInteger()) {
47193 APInt UndefVecElts;
47194 SmallVector<APInt, 16> EltBits;
47195 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47196 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47197 EltBits, /*AllowWholeUndefs*/ true,
47198 /*AllowPartialUndefs*/ false)) {
47199 uint64_t Idx = CIdx->getZExtValue();
47200 if (UndefVecElts[Idx])
47201 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47202 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47203 }
47204
47205     // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47206     // Improves lowering of bool masks on Rust, which splits them into a byte array.
47207 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47208 SDValue Src = peekThroughBitcasts(InputVector);
47209 if (Src.getValueType().getScalarType() == MVT::i1 &&
47210 TLI.isTypeLegal(Src.getValueType())) {
47211 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47212 SDValue Sub = DAG.getNode(
47213 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47214 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47215 return DAG.getBitcast(VT, Sub);
47216 }
47217 }
47218 }
47219
47220 if (IsPextr) {
47221 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47222 DCI))
47223 return SDValue(N, 0);
47224
47225 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47226 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47227 InputVector.getOpcode() == X86ISD::PINSRW) &&
47228 InputVector.getOperand(2) == EltIdx) {
47229 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47230 "Vector type mismatch");
47231 SDValue Scl = InputVector.getOperand(1);
47232 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47233 return DAG.getZExtOrTrunc(Scl, dl, VT);
47234 }
47235
47236 // TODO - Remove this once we can handle the implicit zero-extension of
47237 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47238 // combineBasicSADPattern.
47239 return SDValue();
47240 }
47241
47242 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47243 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47244 InputVector.getOpcode() == ISD::BITCAST &&
47245 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47246 isNullConstant(EltIdx) && InputVector.hasOneUse())
47247 return DAG.getBitcast(VT, InputVector);
47248
47249 // Detect mmx to i32 conversion through a v2i32 elt extract.
47250 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47251 InputVector.getOpcode() == ISD::BITCAST &&
47252 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47253 isNullConstant(EltIdx) && InputVector.hasOneUse())
47254 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47255 InputVector.getOperand(0));
47256
47257 // Check whether this extract is the root of a sum of absolute differences
47258 // pattern. This has to be done here because we really want it to happen
47259   // pre-legalization.
47260 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47261 return SAD;
47262
47263 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47264 return VPDPBUSD;
47265
47266 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47267 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47268 return Cmp;
47269
47270 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47271 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47272 return MinMax;
47273
47274 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47275 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47276 return V;
47277
47278 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47279 return V;
47280
47281 if (CIdx)
47282     if (SDValue V = combineExtractFromVectorLoad(
47283             N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47284 dl, DAG, DCI))
47285 return V;
47286
47287 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47288 // and then testing the relevant element.
47289 //
47290 // Note that we only combine extracts on the *same* result number, i.e.
47291 // t0 = merge_values a0, a1, a2, a3
47292 // i1 = extract_vector_elt t0, Constant:i64<2>
47293 // i1 = extract_vector_elt t0, Constant:i64<3>
47294 // but not
47295 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47296 // since the latter would need its own MOVMSK.
47297 if (SrcVT.getScalarType() == MVT::i1) {
47298 bool IsVar = !CIdx;
47299 SmallVector<SDNode *, 16> BoolExtracts;
47300 unsigned ResNo = InputVector.getResNo();
47301 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47302 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47303 Use->getOperand(0).getResNo() == ResNo &&
47304 Use->getValueType(0) == MVT::i1) {
47305 BoolExtracts.push_back(Use);
47306 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47307 return true;
47308 }
47309 return false;
47310 };
47311 // TODO: Can we drop the oneuse check for constant extracts?
47312 if (all_of(InputVector->users(), IsBoolExtract) &&
47313 (IsVar || BoolExtracts.size() > 1)) {
47314 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47315 if (SDValue BC =
47316 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47317 for (SDNode *Use : BoolExtracts) {
47318 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47319 // Mask = 1 << MaskIdx
47320 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47321 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47322 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47323 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47324 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47325 DCI.CombineTo(Use, Res);
47326 }
47327 return SDValue(N, 0);
47328 }
47329 }
47330 }
47331
47332 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47333 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47334 SDValue TruncSrc = InputVector.getOperand(0);
47335 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47336 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47337 SDValue NewExt =
47338 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47339 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47340 }
47341 }
47342
47343 return SDValue();
47344}
47345
47346// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47347// This is more or less the reverse of combineBitcastvxi1.
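// For example, (v8i16 sign_extend (v8i1 bitcast (i8 X))) broadcasts X, ANDs
// each lane with its bit mask (1, 2, 4, ...), and compares the result against
// that mask to produce the 0/-1 lanes.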
47348 static SDValue combineToExtendBoolVectorInReg(
47349     unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47350 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47351 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47352 Opcode != ISD::ANY_EXTEND)
47353 return SDValue();
47354 if (!DCI.isBeforeLegalizeOps())
47355 return SDValue();
47356 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47357 return SDValue();
47358
47359 EVT SVT = VT.getScalarType();
47360 EVT InSVT = N0.getValueType().getScalarType();
47361 unsigned EltSizeInBits = SVT.getSizeInBits();
47362
47363 // Input type must be extending a bool vector (bit-casted from a scalar
47364 // integer) to legal integer types.
47365 if (!VT.isVector())
47366 return SDValue();
47367 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47368 return SDValue();
47369 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47370 return SDValue();
47371
47372 SDValue N00 = N0.getOperand(0);
47373 EVT SclVT = N00.getValueType();
47374 if (!SclVT.isScalarInteger())
47375 return SDValue();
47376
47377 SDValue Vec;
47378 SmallVector<int> ShuffleMask;
47379 unsigned NumElts = VT.getVectorNumElements();
47380 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47381
47382 // Broadcast the scalar integer to the vector elements.
47383 if (NumElts > EltSizeInBits) {
47384 // If the scalar integer is greater than the vector element size, then we
47385 // must split it down into sub-sections for broadcasting. For example:
47386 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47387 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47388 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47389 unsigned Scale = NumElts / EltSizeInBits;
47390 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47391 bool UseBroadcast = Subtarget.hasInt256() &&
47392 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47393 Vec = UseBroadcast
47394 ? DAG.getSplat(BroadcastVT, DL, N00)
47395 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47396 Vec = DAG.getBitcast(VT, Vec);
47397
47398 for (unsigned i = 0; i != Scale; ++i) {
47399 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47400 ShuffleMask.append(EltSizeInBits, i + Offset);
47401 }
47402 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47403 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47404 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47405 // If we have register broadcast instructions, use the scalar size as the
47406 // element type for the shuffle. Then cast to the wider element type. The
47407 // widened bits won't be used, and this might allow the use of a broadcast
47408 // load.
47409 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47410 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47411 (NumElts * EltSizeInBits) / NumElts);
47412 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47413 } else {
47414 // For smaller scalar integers, we can simply any-extend it to the vector
47415 // element size (we don't care about the upper bits) and broadcast it to all
47416 // elements.
47417 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47418 }
47419
47420 // Now, mask the relevant bit in each element.
47421   SmallVector<SDValue, 32> Bits;
47422   for (unsigned i = 0; i != NumElts; ++i) {
47423 int BitIdx = (i % EltSizeInBits);
47424 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47425 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47426 }
47427 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47428 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47429
47430 // Compare against the bitmask and extend the result.
47431 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47432 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47433 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47434
47435 // For SEXT, this is now done, otherwise shift the result down for
47436 // zero-extension.
47437 if (Opcode == ISD::SIGN_EXTEND)
47438 return Vec;
47439 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47440 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47441}
47442
47443/// If both arms of a vector select are concatenated vectors, split the select,
47444/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47445/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47446/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47447 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47448                                   const X86Subtarget &Subtarget) {
47449 unsigned Opcode = N->getOpcode();
47450 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47451 return SDValue();
47452
47453 // TODO: Split 512-bit vectors too?
47454 EVT VT = N->getValueType(0);
47455 if (!VT.is256BitVector())
47456 return SDValue();
47457
47458 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47459 SDValue Cond = N->getOperand(0);
47460 SDValue TVal = N->getOperand(1);
47461 SDValue FVal = N->getOperand(2);
47462 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47463 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47464 return SDValue();
47465
47466 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47467                             ArrayRef<SDValue> Ops) {
47468     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47469 };
47470 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47471 /*CheckBWI*/ false);
47472}
47473
47474 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47475                                            const SDLoc &DL) {
47476 SDValue Cond = N->getOperand(0);
47477 SDValue LHS = N->getOperand(1);
47478 SDValue RHS = N->getOperand(2);
47479
47480 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47481 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47482 if (!TrueC || !FalseC)
47483 return SDValue();
47484
47485 // Don't do this for crazy integer types.
47486 EVT VT = N->getValueType(0);
47487 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47488 return SDValue();
47489
47490 // We're going to use the condition bit in math or logic ops. We could allow
47491 // this with a wider condition value (post-legalization it becomes an i8),
47492 // but if nothing is creating selects that late, it doesn't matter.
47493 if (Cond.getValueType() != MVT::i1)
47494 return SDValue();
47495
47496 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47497 // 3, 5, or 9 with i32/i64, so those get transformed too.
47498 // TODO: For constants that overflow or do not differ by power-of-2 or small
47499 // multiplier, convert to 'and' + 'add'.
47500 const APInt &TrueVal = TrueC->getAPIntValue();
47501 const APInt &FalseVal = FalseC->getAPIntValue();
47502
47503 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47504 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47505 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47506 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47507 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47508 return SDValue();
47509 }
47510
47511 bool OV;
47512 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47513 if (OV)
47514 return SDValue();
47515
47516 APInt AbsDiff = Diff.abs();
47517 if (AbsDiff.isPowerOf2() ||
47518 ((VT == MVT::i32 || VT == MVT::i64) &&
47519 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47520
47521 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47522 // of the condition can usually be folded into a compare predicate, but even
47523 // without that, the sequence should be cheaper than a CMOV alternative.
47524 if (TrueVal.slt(FalseVal)) {
47525 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47526 std::swap(TrueC, FalseC);
47527 }
47528
47529 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
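    // E.g. select Cond, i32 11, i32 2 --> (zext(Cond) * 9) + 2, where the
    // multiply by 9 folds into a single LEA.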
47530 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47531
47532 // Multiply condition by the difference if non-one.
47533 if (!AbsDiff.isOne())
47534 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47535
47536 // Add the base if non-zero.
47537 if (!FalseC->isZero())
47538 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47539
47540 return R;
47541 }
47542
47543 return SDValue();
47544}
47545
47546/// If this is a *dynamic* select (non-constant condition) and we can match
47547/// this node with one of the variable blend instructions, restructure the
47548/// condition so that blends can use the high (sign) bit of each element.
47549/// This function will also call SimplifyDemandedBits on already created
47550/// BLENDV to perform additional simplifications.
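/// (The variable blend instructions (BLENDVPS/BLENDVPD/PBLENDVB) select each
/// element using only the most significant bit of the corresponding mask
/// element, which is why only the condition's sign bits are demanded here.)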
47551 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47552                                       const SDLoc &DL,
47553                                       TargetLowering::DAGCombinerInfo &DCI,
47554                                       const X86Subtarget &Subtarget) {
47555 SDValue Cond = N->getOperand(0);
47556 if ((N->getOpcode() != ISD::VSELECT &&
47557 N->getOpcode() != X86ISD::BLENDV) ||
47558       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47559     return SDValue();
47560
47561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47562 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47563 EVT VT = N->getValueType(0);
47564
47565 // We can only handle the cases where VSELECT is directly legal on the
47566 // subtarget. We custom lower VSELECT nodes with constant conditions and
47567 // this makes it hard to see whether a dynamic VSELECT will correctly
47568 // lower, so we both check the operation's status and explicitly handle the
47569 // cases where a *dynamic* blend will fail even though a constant-condition
47570 // blend could be custom lowered.
47571 // FIXME: We should find a better way to handle this class of problems.
47572 // Potentially, we should combine constant-condition vselect nodes
47573 // pre-legalization into shuffles and not mark as many types as custom
47574 // lowered.
47575   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47576     return SDValue();
47577 // FIXME: We don't support i16-element blends currently. We could and
47578 // should support them by making *all* the bits in the condition be set
47579 // rather than just the high bit and using an i8-element blend.
47580 if (VT.getVectorElementType() == MVT::i16)
47581 return SDValue();
47582 // Dynamic blending was only available from SSE4.1 onward.
47583 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47584 return SDValue();
47585 // Byte blends are only available in AVX2
47586 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47587 return SDValue();
47588 // There are no 512-bit blend instructions that use sign bits.
47589 if (VT.is512BitVector())
47590 return SDValue();
47591
47592 // Don't optimize before the condition has been transformed to a legal type
47593 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47594   if (BitWidth < 8 || BitWidth > 64)
47595     return SDValue();
47596
47597 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47598 for (SDUse &Use : Cond->uses())
47599 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47600 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47601 Use.getOperandNo() != 0)
47602 return false;
47603
47604 return true;
47605 };
47606
47607   APInt DemandedBits(APInt::getSignMask(BitWidth));
47608 
47609 if (OnlyUsedAsSelectCond(Cond)) {
47610 KnownBits Known;
47611     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47612                                           !DCI.isBeforeLegalizeOps());
47613 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47614 return SDValue();
47615
47616 // If we changed the computation somewhere in the DAG, this change will
47617 // affect all users of Cond. Update all the nodes so that we do not use
47618 // the generic VSELECT anymore. Otherwise, we may perform wrong
47619 // optimizations as we messed with the actual expectation for the vector
47620 // boolean values.
47621 for (SDNode *U : Cond->users()) {
47622 if (U->getOpcode() == X86ISD::BLENDV)
47623 continue;
47624
47625 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47626 Cond, U->getOperand(1), U->getOperand(2));
47627 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47628 DCI.AddToWorklist(U);
47629 }
47630 DCI.CommitTargetLoweringOpt(TLO);
47631 return SDValue(N, 0);
47632 }
47633
47634 // Otherwise we can still at least try to simplify multiple use bits.
47635   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47636     return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47637 N->getOperand(1), N->getOperand(2));
47638
47639 return SDValue();
47640}
47641
47642// Try to match:
47643// (or (and (M, (sub 0, X)), (pandn M, X)))
47644// which is a special case of:
47645// (select M, (sub 0, X), X)
47646// Per:
47647// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47648// We know that, if fNegate is 0 or 1:
47649// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47650//
47651// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47652// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47653// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47654// This lets us transform our vselect to:
47655// (add (xor X, M), (and M, 1))
47656// And further to:
47657// (sub (xor X, M), M)
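// Sanity check: with M == all-ones, (X ^ M) - M == ~X + 1 == -X; with M == 0
// it is just X - 0 == X.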
47658 static SDValue combineLogicBlendIntoConditionalNegate(
47659     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47660 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47661 using namespace SDPatternMatch;
47662 EVT MaskVT = Mask.getValueType();
47663 assert(MaskVT.isInteger() &&
47664 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47665 "Mask must be zero/all-bits");
47666
47667 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47668       !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47669     return SDValue();
47670
47671 SDValue V;
47672 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47673       !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47674     return SDValue();
47675
47676 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47677 SDValue SubOp2 = Mask;
47678
47679 // If the negate was on the false side of the select, then
47680 // the operands of the SUB need to be swapped. PR 27251.
47681 // This is because the pattern being matched above is
47682 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47683 // but if the pattern matched was
47684 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47685 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47686 // pattern also needs to be a negation of the replacement pattern above.
47687 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47688 // sub accomplishes the negation of the replacement pattern.
47689 if (V == Y)
47690 std::swap(SubOp1, SubOp2);
47691
47692 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47693 return DAG.getBitcast(VT, Res);
47694}
47695
47696 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47697                              const X86Subtarget &Subtarget) {
47698 using namespace SDPatternMatch;
47699 if (!Subtarget.hasAVX512())
47700 return SDValue();
47701
47702 ISD::CondCode CC;
47703 SDValue Cond, X, Y, LHS, RHS;
47704   if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47705                                      m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47706                                                       m_CondCode(CC)))),
47707                              m_Value(LHS), m_Value(RHS))))
47708 return SDValue();
47709
47710 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47711 !canCombineAsMaskOperation(RHS, Subtarget))
47712 return SDValue();
47713
47714 // Commute LHS and RHS to create opportunity to select mask instruction.
47715 // (vselect M, L, R) -> (vselect ~M, R, L)
47716 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47717 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47718 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47719}
47720
47721/// Do target-specific dag combines on SELECT and VSELECT nodes.
47722 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47723                              TargetLowering::DAGCombinerInfo &DCI,
47724                              const X86Subtarget &Subtarget) {
47725 SDLoc DL(N);
47726 SDValue Cond = N->getOperand(0);
47727 SDValue LHS = N->getOperand(1);
47728 SDValue RHS = N->getOperand(2);
47729
47730 // Try simplification again because we use this function to optimize
47731 // BLENDV nodes that are not handled by the generic combiner.
47732 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47733 return V;
47734
47735 // When avx512 is available the lhs operand of select instruction can be
47736 // folded with mask instruction, while the rhs operand can't. Commute the
47737 // lhs and rhs of the select instruction to create the opportunity of
47738 // folding.
47739 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47740 return V;
47741
47742 EVT VT = LHS.getValueType();
47743 EVT CondVT = Cond.getValueType();
47744 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47745 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47746
47747 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47748 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47749 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47750 if (CondVT.isVector() && CondVT.isInteger() &&
47751 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47752 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47755 DL, DAG, Subtarget))
47756 return V;
47757
47758 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47759 SmallVector<int, 64> CondMask;
47760 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47761 N->getOpcode() == X86ISD::BLENDV)) {
47762 // Convert vselects with constant condition into shuffles.
47763 if (DCI.isBeforeLegalizeOps())
47764 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47765
47766 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47767 // by forcing the unselected elements to zero.
47768 // TODO: Can we handle more shuffles with this?
47769 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47770 SmallVector<SDValue, 1> LHSOps, RHSOps;
47771 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47772 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47773 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47774 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47775 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47776 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47777 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47778 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47779 assert(ByteMask.size() == LHSMask.size() &&
47780 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47781 for (auto [I, M] : enumerate(ByteMask)) {
47782 // getConstVector sets negative shuffle mask values as undef, so
47783 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47784 if (M < (int)ByteMask.size()) {
47785 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47786 RHSMask[I] = 0x80;
47787 } else {
47788 LHSMask[I] = 0x80;
47789 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47790 }
47791 }
47792 MVT ByteVT = LHSShuf.getSimpleValueType();
47793 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47794 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47795 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47796 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47797 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47798 }
47799 }
47800
47801 // Attempt to combine as shuffle.
47802 SDValue Op(N, 0);
47803 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47804 return Res;
47805 }
47806 }
47807
47808 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47809 // instructions match the semantics of the common C idiom x<y?x:y but not
47810 // x<=y?x:y, because of how they handle negative zero (which can be
47811 // ignored in unsafe-math mode).
47812 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
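// Note (illustrative, paraphrasing the ISA manuals): MINPS/MAXPS and friends
// return the second source operand whenever either input is NaN, and also
// when the inputs are +0.0 and -0.0, so a blind FMIN/FMAX replacement can
// return the wrong arm of the select. That is what the isKnownNeverNaN /
// isKnownNeverZeroFloat checks and operand swaps below guard against.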
47813 if ((Cond.getOpcode() == ISD::SETCC ||
47814 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47815 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47816 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47817 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47818 (Subtarget.hasSSE2() ||
47819 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47820 bool IsStrict = Cond->isStrictFPOpcode();
47821 ISD::CondCode CC =
47822 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47823 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47824 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47825
47826 unsigned Opcode = 0;
47827 // Check for x CC y ? x : y.
47828 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47829 switch (CC) {
47830 default: break;
47831 case ISD::SETULT:
47832 // Converting this to a min would handle NaNs incorrectly, and swapping
47833 // the operands would cause it to handle comparisons between positive
47834 // and negative zero incorrectly.
47835 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47836 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47837 !(DAG.isKnownNeverZeroFloat(LHS) ||
47838 DAG.isKnownNeverZeroFloat(RHS)))
47839 break;
47840 std::swap(LHS, RHS);
47841 }
47842 Opcode = X86ISD::FMIN;
47843 break;
47844 case ISD::SETOLE:
47845 // Converting this to a min would handle comparisons between positive
47846 // and negative zero incorrectly.
47847 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47848 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47849 break;
47850 Opcode = X86ISD::FMIN;
47851 break;
47852 case ISD::SETULE:
47853 // Converting this to a min would handle both negative zeros and NaNs
47854 // incorrectly, but we can swap the operands to fix both.
47855 std::swap(LHS, RHS);
47856 [[fallthrough]];
47857 case ISD::SETOLT:
47858 case ISD::SETLT:
47859 case ISD::SETLE:
47860 Opcode = X86ISD::FMIN;
47861 break;
47862
47863 case ISD::SETOGE:
47864 // Converting this to a max would handle comparisons between positive
47865 // and negative zero incorrectly.
47866 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47867 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47868 break;
47869 Opcode = X86ISD::FMAX;
47870 break;
47871 case ISD::SETUGT:
47872 // Converting this to a max would handle NaNs incorrectly, and swapping
47873 // the operands would cause it to handle comparisons between positive
47874 // and negative zero incorrectly.
47875 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47876 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47877 !(DAG.isKnownNeverZeroFloat(LHS) ||
47878 DAG.isKnownNeverZeroFloat(RHS)))
47879 break;
47880 std::swap(LHS, RHS);
47881 }
47882 Opcode = X86ISD::FMAX;
47883 break;
47884 case ISD::SETUGE:
47885 // Converting this to a max would handle both negative zeros and NaNs
47886 // incorrectly, but we can swap the operands to fix both.
47887 std::swap(LHS, RHS);
47888 [[fallthrough]];
47889 case ISD::SETOGT:
47890 case ISD::SETGT:
47891 case ISD::SETGE:
47892 Opcode = X86ISD::FMAX;
47893 break;
47894 }
47895 // Check for x CC y ? y : x -- a min/max with reversed arms.
47896 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47897 switch (CC) {
47898 default: break;
47899 case ISD::SETOGE:
47900 // Converting this to a min would handle comparisons between positive
47901 // and negative zero incorrectly, and swapping the operands would
47902 // cause it to handle NaNs incorrectly.
47903 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47904 !(DAG.isKnownNeverZeroFloat(LHS) ||
47905 DAG.isKnownNeverZeroFloat(RHS))) {
47906 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47907 break;
47908 std::swap(LHS, RHS);
47909 }
47910 Opcode = X86ISD::FMIN;
47911 break;
47912 case ISD::SETUGT:
47913 // Converting this to a min would handle NaNs incorrectly.
47914 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47915 break;
47916 Opcode = X86ISD::FMIN;
47917 break;
47918 case ISD::SETUGE:
47919 // Converting this to a min would handle both negative zeros and NaNs
47920 // incorrectly, but we can swap the operands to fix both.
47921 std::swap(LHS, RHS);
47922 [[fallthrough]];
47923 case ISD::SETOGT:
47924 case ISD::SETGT:
47925 case ISD::SETGE:
47926 Opcode = X86ISD::FMIN;
47927 break;
47928
47929 case ISD::SETULT:
47930 // Converting this to a max would handle NaNs incorrectly.
47931 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47932 break;
47933 Opcode = X86ISD::FMAX;
47934 break;
47935 case ISD::SETOLE:
47936 // Converting this to a max would handle comparisons between positive
47937 // and negative zero incorrectly, and swapping the operands would
47938 // cause it to handle NaNs incorrectly.
47939 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47940 !DAG.isKnownNeverZeroFloat(LHS) &&
47941 !DAG.isKnownNeverZeroFloat(RHS)) {
47942 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47943 break;
47944 std::swap(LHS, RHS);
47945 }
47946 Opcode = X86ISD::FMAX;
47947 break;
47948 case ISD::SETULE:
47949 // Converting this to a max would handle both negative zeros and NaNs
47950 // incorrectly, but we can swap the operands to fix both.
47951 std::swap(LHS, RHS);
47952 [[fallthrough]];
47953 case ISD::SETOLT:
47954 case ISD::SETLT:
47955 case ISD::SETLE:
47956 Opcode = X86ISD::FMAX;
47957 break;
47958 }
47959 }
47960
47961 if (Opcode) {
47962 if (IsStrict) {
47963 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47964 : X86ISD::STRICT_FMAX,
47965 DL, {N->getValueType(0), MVT::Other},
47966 {Cond.getOperand(0), LHS, RHS});
47967 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47968 return Ret;
47969 }
47970 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47971 }
47972 }
47973
47974 // Some mask scalar intrinsics rely on checking if only one bit is set
47975 // and implement it in C code like this:
47976 // A[0] = (U & 1) ? A[0] : W[0];
47977 // This creates some redundant instructions that break pattern matching.
47978 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47979 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47980 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47981 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47982 SDValue AndNode = Cond.getOperand(0);
47983 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47984 isNullConstant(Cond.getOperand(1)) &&
47985 isOneConstant(AndNode.getOperand(1))) {
47986 // LHS and RHS swapped due to
47987 // setcc outputting 1 when AND resulted in 0 and vice versa.
47988 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47989 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47990 }
47991 }
47992
47993 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47994 // lowering on KNL. In this case we convert it to
47995 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47996 // The same applies to all vectors with i8 or i16 elements when BWI is unavailable.
47997 // Make sure we extend these even before type legalization gets a chance to
47998 // split wide vectors.
47999 // Since SKX these selects have a proper lowering.
48000 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
48001 CondVT.getVectorElementType() == MVT::i1 &&
48002 (VT.getVectorElementType() == MVT::i8 ||
48003 VT.getVectorElementType() == MVT::i16)) {
48004 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
48005 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
48006 }
48007
48008 // AVX512 - Extend select to merge with target shuffle.
48009 // select(mask, extract_subvector(shuffle(x)), y) -->
48010 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
48011 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
48012 if (Subtarget.hasAVX512() && CondVT.isVector() &&
48013 CondVT.getVectorElementType() == MVT::i1) {
48014 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48015 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48016 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48017 isNullConstant(Op.getOperand(1)) &&
48018 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48019 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48020 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48021 ISD::isBuildVectorAllZeros(Alt.getNode()));
48022 };
48023
48024 bool SelectableLHS = SelectableOp(LHS, RHS);
48025 bool SelectableRHS = SelectableOp(RHS, LHS);
48026 if (SelectableLHS || SelectableRHS) {
48027 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48028 : RHS.getOperand(0).getValueType();
48029 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48030 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48031 VT.getSizeInBits());
48032 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48033 VT.getSizeInBits());
48034 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48035 DAG.getUNDEF(SrcCondVT), Cond,
48036 DAG.getVectorIdxConstant(0, DL));
48037 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48038 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48039 }
48040 }
48041
48042 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48043 return V;
48044
48045 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48046 Cond.hasOneUse()) {
48047 EVT CondVT = Cond.getValueType();
48048 SDValue Cond0 = Cond.getOperand(0);
48049 SDValue Cond1 = Cond.getOperand(1);
48050 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48051
48052 // Canonicalize min/max:
48053 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48054 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48055 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48056 // the need for an extra compare against zero. e.g.
48057 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48058 // subl %esi, %edi
48059 // testl %edi, %edi
48060 // movl $0, %eax
48061 // cmovgl %edi, %eax
48062 // =>
48063 // xorl %eax, %eax
48064 // subl %esi, %edi
48065 // cmovsl %eax, %edi
48066 //
48067 // We can also canonicalize
48068 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48069 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48070 // This allows the use of a test instruction for the compare.
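// For instance (illustrative lowering), "(x u> 1) ? x : 1" becomes
// "(x != 0) ? x : 1", which only needs "testl %edi, %edi" plus a cmov
// instead of a "cmpl $1, %edi", avoiding the immediate operand.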
48071 if (LHS == Cond0 && RHS == Cond1) {
48072 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48073 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48074 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48075 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48076 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48077 }
48078 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48079 ISD::CondCode NewCC = ISD::SETUGE;
48080 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48081 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48082 }
48083 }
48084
48085 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48086 // fold eq + gt/lt nested selects into ge/le selects
48087 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48088 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48089 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48090 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48091 // .. etc ..
48092 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48093 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48094 SDValue InnerSetCC = RHS.getOperand(0);
48095 ISD::CondCode InnerCC =
48096 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48097 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48098 Cond0 == InnerSetCC.getOperand(0) &&
48099 Cond1 == InnerSetCC.getOperand(1)) {
48100 ISD::CondCode NewCC;
48101 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48102 // clang-format off
48103 case ISD::SETGT: NewCC = ISD::SETGE; break;
48104 case ISD::SETLT: NewCC = ISD::SETLE; break;
48105 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48106 case ISD::SETULT: NewCC = ISD::SETULE; break;
48107 default: NewCC = ISD::SETCC_INVALID; break;
48108 // clang-format on
48109 }
48110 if (NewCC != ISD::SETCC_INVALID) {
48111 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48112 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48113 }
48114 }
48115 }
48116 }
48117
48118 // Check if the first operand is all zeros and Cond type is vXi1.
48119 // If this is an AVX512 target we can improve the use of zero masking by
48120 // swapping the operands and inverting the condition.
48121 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48122 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48123 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48124 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48125 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48126 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48127 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48128 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48129 }
48130
48131 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48132 // get split by legalization.
48133 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48134 CondVT.getVectorElementType() == MVT::i1 &&
48135 TLI.isTypeLegal(VT.getScalarType())) {
48136 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48137 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48138 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48139 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48140 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48141 }
48142 }
48143
48144 // Exploit AVX2's VSHLV/VSRLV instructions for efficient unsigned vector
48145 // shifts with well-defined out-of-bounds behaviour.
48146
48147 // Unlike the general shift nodes (SHL/SRL), AVX2's VSHLV/VSRLV accept shift
48148 // amounts that exceed the element bitwidth: any lane whose amount is greater
48149 // than or equal to the bitwidth is simply written with zero, for both the
48150 // left-shift and right-shift forms. That matches the select-against-zero
48151 // patterns below, so the select can be folded away entirely.
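// For example (illustrative), for v8i32 "(vselect (setcc amt, 32, ult),
// (shl x, amt), 0)" every out-of-range lane already yields 0 in VPSLLVD, so
// the whole select collapses to a single X86ISD::VSHLV node.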
48152 if (N->getOpcode() == ISD::VSELECT) {
48153 using namespace llvm::SDPatternMatch;
48154 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48155 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48156 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48157 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48159 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48162 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48163 : X86ISD::VSHLV,
48164 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48165 }
48166 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48167 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48168 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48169 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48171 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48174 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48175 : X86ISD::VSHLV,
48176 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48177 }
48178 }
48179
48180 // Early exit check
48181 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48182 return SDValue();
48183
48184 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48185 return V;
48186
48187 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48188 return V;
48189
48190 // select(~Cond, X, Y) -> select(Cond, Y, X)
48191 if (CondVT.getScalarType() != MVT::i1) {
48192 if (SDValue CondNot = IsNOT(Cond, DAG))
48193 return DAG.getNode(N->getOpcode(), DL, VT,
48194 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48195
48196 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48197 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48198 Cond.getOperand(0).getOpcode() == ISD::AND &&
48199 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48200 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48201 Cond.getScalarValueSizeInBits(),
48202 /*AllowUndefs=*/true) &&
48203 Cond.hasOneUse()) {
48204 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48205 Cond.getOperand(0).getOperand(1));
48206 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48207 }
48208
48209 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48210 // signbit.
48211 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48212 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48213 Cond.hasOneUse()) {
48214 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48215 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48216 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48217 }
48218 }
48219
48220 // Try to optimize vXi1 selects if both operands are either all constants or
48221 // bitcasts from scalar integer type. In that case we can convert the operands
48222 // to integer and use an integer select which will be converted to a CMOV.
48223 // We need to take a little bit of care to avoid creating an i64 type after
48224 // type legalization.
48225 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48226 VT.getVectorElementType() == MVT::i1 &&
48227 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48228 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48229 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48230 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48231 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48232
48233 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48234 LHS.getOperand(0).getValueType() == IntVT)) &&
48235 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48236 RHS.getOperand(0).getValueType() == IntVT))) {
48237 if (LHSIsConst)
48239 else
48240 LHS = LHS.getOperand(0);
48241
48242 if (RHSIsConst)
48244 else
48245 RHS = RHS.getOperand(0);
48246
48247 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48248 return DAG.getBitcast(VT, Select);
48249 }
48250 }
48251 }
48252
48253 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48254 // single bits, then invert the predicate and swap the select operands.
48255 // This can lower using a vector shift bit-hack rather than mask and compare.
48256 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48257 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48258 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48259 Cond.getOperand(0).getOpcode() == ISD::AND &&
48260 isNullOrNullSplat(Cond.getOperand(1)) &&
48261 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48262 Cond.getOperand(0).getValueType() == VT) {
48263 // The 'and' mask must be composed of power-of-2 constants.
48264 SDValue And = Cond.getOperand(0);
48265 auto *C = isConstOrConstSplat(And.getOperand(1));
48266 if (C && C->getAPIntValue().isPowerOf2()) {
48267 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48268 SDValue NotCond =
48269 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48270 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48271 }
48272
48273 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48274 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48275 // 16-bit lacks a proper blendv.
48276 unsigned EltBitWidth = VT.getScalarSizeInBits();
48277 bool CanShiftBlend =
48278 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48279 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48280 (Subtarget.hasXOP()));
48281 if (CanShiftBlend &&
48282 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48283 return C->getAPIntValue().isPowerOf2();
48284 })) {
48285 // Create a left-shift constant to get the mask bits over to the sign-bit.
48286 SDValue Mask = And.getOperand(1);
48287 SmallVector<int, 32> ShlVals;
48288 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48289 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48290 ShlVals.push_back(EltBitWidth - 1 -
48291 MaskVal->getAPIntValue().exactLogBase2());
48292 }
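// Illustrative numbers: with 32-bit elements and a mask constant of 0x10
// (bit 4), the shift amount computed above is 31 - 4 == 27, which moves the
// tested bit into the sign bit so the SETLT below can select on it.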
48293 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48294 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48295 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48296 SDValue NewCond =
48297 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48298 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48299 }
48300 }
48301
48302 return SDValue();
48303}
48304
48305/// Combine:
48306/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48307/// to:
48308/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48309/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48310/// Note that this is only legal for some op/cc combinations.
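/// A typical source pattern for this (illustrative) is a reference-count
/// decrement such as "if (fetch_sub(&rc, 1) == 1) destroy();": the addend is
/// -1 and the comparison value is 1 == -(-1), so the "Comparison == NegAddend"
/// path below can branch directly on the ZF produced by the LOCKed sub instead
/// of reloading the old value and comparing it separately.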
48311 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48312 SelectionDAG &DAG,
48313 const X86Subtarget &Subtarget) {
48314 // This combine only operates on CMP-like nodes.
48315 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48316 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48317 return SDValue();
48318
48319 // Can't replace the cmp if it has more uses than the one we're looking at.
48320 // FIXME: We would like to be able to handle this, but would need to make sure
48321 // all uses were updated.
48322 if (!Cmp.hasOneUse())
48323 return SDValue();
48324
48325 // This only applies to variations of the common case:
48326 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48327 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48328 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48329 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48330 // Using the proper condcodes (see below), overflow is checked for.
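// Worked case (illustrative): "old < 0" holds exactly when "old + 1 <= 0",
// so COND_S on the loaded value maps to COND_LE on the flags of the LOCKed
// "add 1". The signed condition codes consult OF, so the old == INT_MAX
// wrap-around (where the sum becomes INT_MIN) still evaluates to false, as
// required.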
48331
48332 // FIXME: We can generalize both constraints:
48333 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48334 // - LHS != 1
48335 // if the result is compared.
48336
48337 SDValue CmpLHS = Cmp.getOperand(0);
48338 SDValue CmpRHS = Cmp.getOperand(1);
48339 EVT CmpVT = CmpLHS.getValueType();
48340
48341 if (!CmpLHS.hasOneUse())
48342 return SDValue();
48343
48344 unsigned Opc = CmpLHS.getOpcode();
48345 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48346 return SDValue();
48347
48348 SDValue OpRHS = CmpLHS.getOperand(2);
48349 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48350 if (!OpRHSC)
48351 return SDValue();
48352
48353 APInt Addend = OpRHSC->getAPIntValue();
48354 if (Opc == ISD::ATOMIC_LOAD_SUB)
48355 Addend = -Addend;
48356
48357 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48358 if (!CmpRHSC)
48359 return SDValue();
48360
48361 APInt Comparison = CmpRHSC->getAPIntValue();
48362 APInt NegAddend = -Addend;
48363
48364 // See if we can adjust the CC to make the comparison match the negated
48365 // addend.
48366 if (Comparison != NegAddend) {
48367 APInt IncComparison = Comparison + 1;
48368 if (IncComparison == NegAddend) {
48369 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48370 Comparison = IncComparison;
48371 CC = X86::COND_AE;
48372 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48373 Comparison = IncComparison;
48374 CC = X86::COND_L;
48375 }
48376 }
48377 APInt DecComparison = Comparison - 1;
48378 if (DecComparison == NegAddend) {
48379 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48380 Comparison = DecComparison;
48381 CC = X86::COND_A;
48382 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48383 Comparison = DecComparison;
48384 CC = X86::COND_LE;
48385 }
48386 }
48387 }
48388
48389 // If the addend is the negation of the comparison value, then we can do
48390 // a full comparison by emitting the atomic arithmetic as a locked sub.
48391 if (Comparison == NegAddend) {
48392 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48393 // atomic sub.
48394 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48395 auto AtomicSub = DAG.getAtomic(
48396 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48397 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48398 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48399 AN->getMemOperand());
48400 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48401 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48402 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48403 return LockOp;
48404 }
48405
48406 // We can handle comparisons with zero in a number of cases by manipulating
48407 // the CC used.
48408 if (!Comparison.isZero())
48409 return SDValue();
48410
48411 if (CC == X86::COND_S && Addend == 1)
48412 CC = X86::COND_LE;
48413 else if (CC == X86::COND_NS && Addend == 1)
48414 CC = X86::COND_G;
48415 else if (CC == X86::COND_G && Addend == -1)
48416 CC = X86::COND_GE;
48417 else if (CC == X86::COND_LE && Addend == -1)
48418 CC = X86::COND_L;
48419 else
48420 return SDValue();
48421
48422 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48423 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48424 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48425 return LockOp;
48426}
48427
48428// Check whether we're just testing the signbit, and whether we can simplify
48429// this by tracking where the signbit came from.
48430 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48431 SelectionDAG &DAG) {
48432 if (CC != X86::COND_S && CC != X86::COND_NS)
48433 return SDValue();
48434
48435 if (!Cmp.hasOneUse())
48436 return SDValue();
48437
48438 SDValue Src;
48439 if (Cmp.getOpcode() == X86ISD::CMP) {
48440 // CMP(X,0) -> signbit test
48441 if (!isNullConstant(Cmp.getOperand(1)))
48442 return SDValue();
48443 Src = Cmp.getOperand(0);
48444 // Peek through a SRA node as we just need the signbit.
48445 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48446 // TODO: Use SimplifyDemandedBits instead of just SRA?
48447 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48448 return SDValue();
48449 Src = Src.getOperand(0);
48450 } else if (Cmp.getOpcode() == X86ISD::OR) {
48451 // OR(X,Y) -> see if only one operand contributes to the signbit.
48452 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48453 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48454 Src = Cmp.getOperand(1);
48455 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48456 Src = Cmp.getOperand(0);
48457 else
48458 return SDValue();
48459 } else {
48460 return SDValue();
48461 }
48462
48463 // Replace with a TEST on the MSB.
48464 SDLoc DL(Cmp);
48465 MVT SrcVT = Src.getSimpleValueType();
48466 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48467
48468 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48469 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48470 if (Src.getOpcode() == ISD::SHL) {
48471 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48472 Src = Src.getOperand(0);
48473 BitMask.lshrInPlace(*ShiftAmt);
48474 }
48475 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48476 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48477 Src = Src.getOperand(0);
48478 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48479 }
48480
48481 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48482 DAG.getConstant(BitMask, DL, SrcVT));
48483 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48484 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48485 DAG.getConstant(0, DL, SrcVT));
48486}
48487
48488// Check whether a boolean test is testing a boolean value generated by
48489// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48490// code.
48491//
48492// Simplify the following patterns:
48493// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48494// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48495// to (Op EFLAGS Cond)
48496//
48497// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48498// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48499// to (Op EFLAGS !Cond)
48500//
48501// where Op could be BRCOND or CMOV.
48502//
48503 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48504 // This combine only operates on CMP-like nodes.
48505 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48506 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48507 return SDValue();
48508
48509 // Quit if not used as a boolean value.
48510 if (CC != X86::COND_E && CC != X86::COND_NE)
48511 return SDValue();
48512
48513 // Check CMP operands. One of them should be 0 or 1 and the other should be
48514 // an SetCC or extended from it.
48515 SDValue Op1 = Cmp.getOperand(0);
48516 SDValue Op2 = Cmp.getOperand(1);
48517
48518 SDValue SetCC;
48519 const ConstantSDNode* C = nullptr;
48520 bool needOppositeCond = (CC == X86::COND_E);
48521 bool checkAgainstTrue = false; // Is it a comparison against 1?
48522
48523 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48524 SetCC = Op2;
48525 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48526 SetCC = Op1;
48527 else // Quit if all operands are not constants.
48528 return SDValue();
48529
48530 if (C->getZExtValue() == 1) {
48531 needOppositeCond = !needOppositeCond;
48532 checkAgainstTrue = true;
48533 } else if (C->getZExtValue() != 0)
48534 // Quit if the constant is neither 0 nor 1.
48535 return SDValue();
48536
48537 bool truncatedToBoolWithAnd = false;
48538 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48539 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48540 SetCC.getOpcode() == ISD::TRUNCATE ||
48541 SetCC.getOpcode() == ISD::AND) {
48542 if (SetCC.getOpcode() == ISD::AND) {
48543 int OpIdx = -1;
48544 if (isOneConstant(SetCC.getOperand(0)))
48545 OpIdx = 1;
48546 if (isOneConstant(SetCC.getOperand(1)))
48547 OpIdx = 0;
48548 if (OpIdx < 0)
48549 break;
48550 SetCC = SetCC.getOperand(OpIdx);
48551 truncatedToBoolWithAnd = true;
48552 } else
48553 SetCC = SetCC.getOperand(0);
48554 }
48555
48556 switch (SetCC.getOpcode()) {
48557 case X86ISD::SETCC_CARRY:
48558 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48559 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48560 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48561 // truncated to i1 using 'and'.
48562 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48563 break;
48564 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
48565 "Invalid use of SETCC_CARRY!");
48566 [[fallthrough]];
48567 case X86ISD::SETCC:
48568 // Set the condition code or opposite one if necessary.
48569 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48570 if (needOppositeCond)
48571 CC = X86::GetOppositeBranchCondition(CC);
48572 return SetCC.getOperand(1);
48573 case X86ISD::CMOV: {
48574 // Check whether the false/true values are canonical, i.e. 0 or 1.
48575 auto *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48576 auto *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48577 // Quit if true value is not a constant.
48578 if (!TVal)
48579 return SDValue();
48580 // Quit if false value is not a constant.
48581 if (!FVal) {
48582 SDValue Op = SetCC.getOperand(0);
48583 // Skip 'zext' or 'trunc' node.
48584 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48585 Op.getOpcode() == ISD::TRUNCATE)
48586 Op = Op.getOperand(0);
48587 // A special case for rdrand/rdseed, where 0 is set if false cond is
48588 // found.
48589 if ((Op.getOpcode() != X86ISD::RDRAND &&
48590 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48591 return SDValue();
48592 }
48593 // Quit if false value is not the constant 0 or 1.
48594 bool FValIsFalse = true;
48595 if (FVal && FVal->getZExtValue() != 0) {
48596 if (FVal->getZExtValue() != 1)
48597 return SDValue();
48598 // If FVal is 1, opposite cond is needed.
48599 needOppositeCond = !needOppositeCond;
48600 FValIsFalse = false;
48601 }
48602 // Quit if TVal is not the constant opposite of FVal.
48603 if (FValIsFalse && TVal->getZExtValue() != 1)
48604 return SDValue();
48605 if (!FValIsFalse && TVal->getZExtValue() != 0)
48606 return SDValue();
48607 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48608 if (needOppositeCond)
48609 CC = X86::GetOppositeBranchCondition(CC);
48610 return SetCC.getOperand(3);
48611 }
48612 }
48613
48614 return SDValue();
48615}
48616
48617/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48618/// Match:
48619/// (X86or (X86setcc) (X86setcc))
48620/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48621 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48622 X86::CondCode &CC1, SDValue &Flags,
48623 bool &isAnd) {
48624 if (Cond->getOpcode() == X86ISD::CMP) {
48625 if (!isNullConstant(Cond->getOperand(1)))
48626 return false;
48627
48628 Cond = Cond->getOperand(0);
48629 }
48630
48631 isAnd = false;
48632
48633 SDValue SetCC0, SetCC1;
48634 switch (Cond->getOpcode()) {
48635 default: return false;
48636 case ISD::AND:
48637 case X86ISD::AND:
48638 isAnd = true;
48639 [[fallthrough]];
48640 case ISD::OR:
48641 case X86ISD::OR:
48642 SetCC0 = Cond->getOperand(0);
48643 SetCC1 = Cond->getOperand(1);
48644 break;
48645 };
48646
48647 // Make sure we have SETCC nodes, using the same flags value.
48648 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48649 SetCC1.getOpcode() != X86ISD::SETCC ||
48650 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48651 return false;
48652
48653 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48654 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48655 Flags = SetCC0->getOperand(1);
48656 return true;
48657}
48658
48659// When legalizing carry, we create carries via add X, -1
48660// If that comes from an actual carry, via setcc, we use the
48661// carry directly.
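// For example (illustrative): if X is a setcc producing 0 or 1, then
// "add X, -1" generates a carry-out exactly when X is non-zero, so CF equals
// the original boolean and users such as ADC/SBB can consume the setcc's
// flags directly.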
48662 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48663 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48664 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48665 bool FoundAndLSB = false;
48666 SDValue Carry = EFLAGS.getOperand(0);
48667 while (Carry.getOpcode() == ISD::TRUNCATE ||
48668 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48669 (Carry.getOpcode() == ISD::AND &&
48670 isOneConstant(Carry.getOperand(1)))) {
48671 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48672 Carry = Carry.getOperand(0);
48673 }
48674 if (Carry.getOpcode() == X86ISD::SETCC ||
48675 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48676 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48677 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48678 SDValue CarryOp1 = Carry.getOperand(1);
48679 if (CarryCC == X86::COND_B)
48680 return CarryOp1;
48681 if (CarryCC == X86::COND_A) {
48682 // Try to convert COND_A into COND_B in an attempt to facilitate
48683 // materializing "setb reg".
48684 //
48685 // Do not flip "e > c", where "c" is a constant, because Cmp
48686 // instruction cannot take an immediate as its first operand.
48687 //
48688 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48689 CarryOp1.getNode()->hasOneUse() &&
48690 CarryOp1.getValueType().isInteger() &&
48691 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48692 SDValue SubCommute =
48693 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48694 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48695 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48696 }
48697 }
48698 // If this is a check of the z flag of an add with 1, switch to the
48699 // C flag.
48700 if (CarryCC == X86::COND_E &&
48701 CarryOp1.getOpcode() == X86ISD::ADD &&
48702 isOneConstant(CarryOp1.getOperand(1)))
48703 return CarryOp1;
48704 } else if (FoundAndLSB) {
48705 SDLoc DL(Carry);
48706 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48707 if (Carry.getOpcode() == ISD::SRL) {
48708 BitNo = Carry.getOperand(1);
48709 Carry = Carry.getOperand(0);
48710 }
48711 return getBT(Carry, BitNo, DL, DAG);
48712 }
48713 }
48714 }
48715
48716 return SDValue();
48717}
48718
48719/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48720/// to avoid the inversion.
48721 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48722 SelectionDAG &DAG,
48723 const X86Subtarget &Subtarget) {
48724 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48725 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48726 EFLAGS.getOpcode() != X86ISD::TESTP)
48727 return SDValue();
48728
48729 // PTEST/TESTP sets EFLAGS as:
48730 // TESTZ: ZF = (Op0 & Op1) == 0
48731 // TESTC: CF = (~Op0 & Op1) == 0
48732 // TESTNZC: ZF == 0 && CF == 0
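// Note (illustrative derivation): since ZF tests Op0 & Op1 while CF tests
// ~Op0 & Op1, peeling a NOT off Op0 simply exchanges the roles of ZF and CF,
// which is why the remapping below swaps testz and testc conditions
// (COND_E <-> COND_B, COND_NE <-> COND_AE).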
48733 MVT VT = EFLAGS.getSimpleValueType();
48734 SDValue Op0 = EFLAGS.getOperand(0);
48735 SDValue Op1 = EFLAGS.getOperand(1);
48736 MVT OpVT = Op0.getSimpleValueType();
48737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48738
48739 // TEST*(~X,Y) == TEST*(X,Y)
48740 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48741 X86::CondCode InvCC;
48742 switch (CC) {
48743 case X86::COND_B:
48744 // testc -> testz.
48745 InvCC = X86::COND_E;
48746 break;
48747 case X86::COND_AE:
48748 // !testc -> !testz.
48749 InvCC = X86::COND_NE;
48750 break;
48751 case X86::COND_E:
48752 // testz -> testc.
48753 InvCC = X86::COND_B;
48754 break;
48755 case X86::COND_NE:
48756 // !testz -> !testc.
48757 InvCC = X86::COND_AE;
48758 break;
48759 case X86::COND_A:
48760 case X86::COND_BE:
48761 // testnzc -> testnzc (no change).
48762 InvCC = CC;
48763 break;
48764 default:
48765 InvCC = X86::COND_INVALID;
48766 break;
48767 }
48768
48769 if (InvCC != X86::COND_INVALID) {
48770 CC = InvCC;
48771 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48772 DAG.getBitcast(OpVT, NotOp0), Op1);
48773 }
48774 }
48775
48776 if (CC == X86::COND_B || CC == X86::COND_AE) {
48777 // TESTC(X,~X) == TESTC(X,-1)
48778 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48779 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48780 SDLoc DL(EFLAGS);
48781 return DAG.getNode(
48782 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48783 DAG.getBitcast(OpVT,
48784 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48785 }
48786 }
48787 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48788 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48789 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48790 SDValue BC0 = peekThroughBitcasts(Op0);
48791 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48792 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48793 SDLoc DL(EFLAGS);
48794 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48795 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48796 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48797 }
48798 }
48799 }
48800
48801 if (CC == X86::COND_E || CC == X86::COND_NE) {
48802 // TESTZ(X,~Y) == TESTC(Y,X)
48803 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48804 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48805 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48806 DAG.getBitcast(OpVT, NotOp1), Op0);
48807 }
48808
48809 if (Op0 == Op1) {
48810 SDValue BC = peekThroughBitcasts(Op0);
48811 EVT BCVT = BC.getValueType();
48812
48813 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48814 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48815 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48816 DAG.getBitcast(OpVT, BC.getOperand(0)),
48817 DAG.getBitcast(OpVT, BC.getOperand(1)));
48818 }
48819
48820 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48821 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48822 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48823 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48824 DAG.getBitcast(OpVT, BC.getOperand(0)),
48825 DAG.getBitcast(OpVT, BC.getOperand(1)));
48826 }
48827
48828 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48829 // to more efficiently extract the sign bits and compare that.
48830 // TODO: Handle TESTC with comparison inversion.
48831 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48832 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48833 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48834 unsigned EltBits = BCVT.getScalarSizeInBits();
48835 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48836 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48837 APInt SignMask = APInt::getSignMask(EltBits);
48838 if (SDValue Res =
48839 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
49040 // For vXi16 cases we need to use pmovmskb and extract every other
48841 // sign bit.
48842 SDLoc DL(EFLAGS);
48843 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48844 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48845 MVT FloatVT =
48846 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48847 Res = DAG.getBitcast(FloatVT, Res);
48848 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48849 } else if (EltBits == 16) {
48850 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48851 Res = DAG.getBitcast(MovmskVT, Res);
48852 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48853 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48854 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48855 } else {
48856 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48857 }
48858 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48859 DAG.getConstant(0, DL, MVT::i32));
48860 }
48861 }
48862 }
48863 }
48864
48865 // TESTZ(-1,X) == TESTZ(X,X)
48866 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48867 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48868
48869 // TESTZ(X,-1) == TESTZ(X,X)
48870 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48871 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48872
48873 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48874 // TODO: Add COND_NE handling?
48875 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48876 SDValue Src0 = peekThroughBitcasts(Op0);
48877 SDValue Src1 = peekThroughBitcasts(Op1);
48878 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48879 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48880 peekThroughBitcasts(Src0.getOperand(1)), true);
48881 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48882 peekThroughBitcasts(Src1.getOperand(1)), true);
48883 if (Src0 && Src1) {
48884 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48885 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48886 DAG.getBitcast(OpVT2, Src0),
48887 DAG.getBitcast(OpVT2, Src1));
48888 }
48889 }
48890 }
48891 }
48892
48893 return SDValue();
48894}
48895
48896// Attempt to simplify the MOVMSK input based on the comparison type.
48897 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48898 SelectionDAG &DAG,
48899 const X86Subtarget &Subtarget) {
48900 // Handle eq/ne against zero (any_of).
48901 // Handle eq/ne against -1 (all_of).
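// These correspond to the usual intrinsics idioms (illustrative):
// any_of: _mm_movemask_epi8(v) != 0
// all_of: _mm_movemask_epi8(v) == 0xFFFF
// where the all-ones comparison constant is a mask with one bit per element.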
48902 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48903 return SDValue();
48904 if (EFLAGS.getValueType() != MVT::i32)
48905 return SDValue();
48906 unsigned CmpOpcode = EFLAGS.getOpcode();
48907 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48908 return SDValue();
48909 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48910 if (!CmpConstant)
48911 return SDValue();
48912 const APInt &CmpVal = CmpConstant->getAPIntValue();
48913
48914 SDValue CmpOp = EFLAGS.getOperand(0);
48915 unsigned CmpBits = CmpOp.getValueSizeInBits();
48916 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48917
48918 // Peek through any truncate.
48919 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48920 CmpOp = CmpOp.getOperand(0);
48921
48922 // Bail if we don't find a MOVMSK.
48923 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48924 return SDValue();
48925
48926 SDValue Vec = CmpOp.getOperand(0);
48927 MVT VecVT = Vec.getSimpleValueType();
48928 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48929 "Unexpected MOVMSK operand");
48930 unsigned NumElts = VecVT.getVectorNumElements();
48931 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48932
48933 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48934 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48935 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48936 if (!IsAnyOf && !IsAllOf)
48937 return SDValue();
48938
48939 // TODO: Check whether more combining cases should honour this constraint.
48940 // We use the number of uses of the CMP to decide whether to combine.
48941 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
48942 // below are restricted to the one-use case.
48943 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48944
48945 // See if we can peek through to a vector with a wider element type, if the
48946 // signbits extend down to all the sub-elements as well.
48947 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48948 // potential SimplifyDemandedBits/Elts cases.
48949 // If we looked through a truncate that discards bits, we can't do this
48950 // transform.
48951 // FIXME: We could do this transform for truncates that discarded bits by
48952 // inserting an AND mask between the new MOVMSK and the CMP.
48953 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48954 SDValue BC = peekThroughBitcasts(Vec);
48955 MVT BCVT = BC.getSimpleValueType();
48956 unsigned BCNumElts = BCVT.getVectorNumElements();
48957 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48958 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48959 BCNumEltBits > NumEltBits &&
48960 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48961 SDLoc DL(EFLAGS);
48962 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48963 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48964 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48965 DAG.getConstant(CmpMask, DL, MVT::i32));
48966 }
48967 }
48968
48969 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48970 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48971 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48972 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48973 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48974 SmallVector<SDValue> Ops;
48975 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48976 Ops.size() == 2) {
48977 SDLoc DL(EFLAGS);
48978 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48979 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48980 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48981 DAG.getBitcast(SubVT, Ops[0]),
48982 DAG.getBitcast(SubVT, Ops[1]));
48983 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48984 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48985 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48986 DAG.getConstant(CmpMask, DL, MVT::i32));
48987 }
48988 }
48989
48990 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48991 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48992 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48993 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48994 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48995 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48996 SDValue BC = peekThroughBitcasts(Vec);
48997 // Ensure MOVMSK was testing every signbit of BC.
48998 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48999 if (BC.getOpcode() == X86ISD::PCMPEQ) {
49000 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
49001 BC.getOperand(0), BC.getOperand(1));
49002 V = DAG.getBitcast(TestVT, V);
49003 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49004 }
49005 // Check for 256-bit split vector cases.
49006 if (BC.getOpcode() == ISD::AND &&
49007 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
49008 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
49009 SDValue LHS = BC.getOperand(0);
49010 SDValue RHS = BC.getOperand(1);
49011 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
49012 LHS.getOperand(0), LHS.getOperand(1));
49013 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49014 RHS.getOperand(0), RHS.getOperand(1));
49015 LHS = DAG.getBitcast(TestVT, LHS);
49016 RHS = DAG.getBitcast(TestVT, RHS);
49017 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49018 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49019 }
49020 }
49021 }
49022
49023 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49024 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49025 // sign bits prior to the comparison with zero unless we know that
49026 // the vXi16 splats the sign bit down to the lower i8 half.
49027 // TODO: Handle all_of patterns.
49028 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49029 SDValue VecOp0 = Vec.getOperand(0);
49030 SDValue VecOp1 = Vec.getOperand(1);
49031 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49032 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49033 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49034 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49035 SDLoc DL(EFLAGS);
49036 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49037 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49038 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49039 if (!SignExt0) {
49040 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49041 DAG.getConstant(0xAAAA, DL, MVT::i16));
49042 }
49043 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49044 DAG.getConstant(0, DL, MVT::i16));
49045 }
49046 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49047 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49048 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49049 (IsAnyOf || (SignExt0 && SignExt1))) {
49050 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49051 SDLoc DL(EFLAGS);
49052 SDValue Result = peekThroughBitcasts(Src);
49053 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49054 Result.getValueType().getVectorNumElements() <= NumElts) {
49055 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49056 Result.getOperand(0), Result.getOperand(1));
49057 V = DAG.getBitcast(MVT::v4i64, V);
49058 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49059 }
49060 Result = DAG.getBitcast(MVT::v32i8, Result);
49061 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49062 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49063 if (!SignExt0 || !SignExt1) {
49064 assert(IsAnyOf &&
49065 "Only perform v16i16 signmasks for any_of patterns");
49066 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49067 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49068 }
49069 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49070 DAG.getConstant(CmpMask, DL, MVT::i32));
49071 }
49072 }
49073 }
49074
49075 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49076 // Since we peek through a bitcast, we need to be careful if the base vector
49077 // type has smaller elements than the MOVMSK type. In that case, even if
49078 // all the elements are demanded by the shuffle mask, only the "high"
49079 // elements which have highbits that align with highbits in the MOVMSK vec
49080 // elements are actually demanded. Simplification of spurious operations
49081 // on the "low" elements takes place during other simplifications.
49082 //
49083 // For example:
49084 // in MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) all of the elements are
49085 // demanded, yet the result can still change because the shuffle swaps the
49086 // two i32 halves of each i64 element.
49087 // To address this, we check that we can scale the shuffle mask to MOVMSK
49088 // element width (this will ensure the "high" elements match). It's slightly
49089 // overly conservative, but fine for an edge-case fold.
49090 SmallVector<int, 32> ShuffleMask;
49091 SmallVector<SDValue, 2> ShuffleInputs;
49092 if (NumElts <= CmpBits &&
49093 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49094 ShuffleMask, DAG) &&
49095 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49096 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49097 canScaleShuffleElements(ShuffleMask, NumElts)) {
49098 SDLoc DL(EFLAGS);
49099 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49100 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49101 Result =
49102 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49103 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49104 }
49105
49106 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49107 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49108 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49109 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49110 // iff every element is referenced.
49111 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49112 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49113 (NumEltBits == 32 || NumEltBits == 64)) {
49114 SDLoc DL(EFLAGS);
49115 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49116 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49117 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49118 SDValue LHS = Vec;
49119 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49120 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49121 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49122 DAG.getBitcast(FloatVT, LHS),
49123 DAG.getBitcast(FloatVT, RHS));
49124 }
49125
49126 return SDValue();
49127}
49128
49129/// Optimize an EFLAGS definition used according to the condition code \p CC
49130/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49131/// uses of chain values.
49132 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49133 SelectionDAG &DAG,
49134 const X86Subtarget &Subtarget) {
49135 if (CC == X86::COND_B)
49136 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49137 return Flags;
49138
49139 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49140 return R;
49141
49142 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49143 return R;
49144
49145 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49146 return R;
49147
49148 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49149 return R;
49150
49151 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49152}
49153
49154/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49155 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49156 TargetLowering::DAGCombinerInfo &DCI,
49157 const X86Subtarget &Subtarget) {
49158 SDLoc DL(N);
49159 EVT VT = N->getValueType(0);
49160 SDValue FalseOp = N->getOperand(0);
49161 SDValue TrueOp = N->getOperand(1);
49162 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49163 SDValue Cond = N->getOperand(3);
49164
49165 // cmov X, X, ?, ? --> X
49166 if (TrueOp == FalseOp)
49167 return TrueOp;
49168
49169 // Try to simplify the EFLAGS and condition code operands.
49170 // We can't always do this as FCMOV only supports a subset of X86 cond.
49171 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49172 if (!(FalseOp.getValueType() == MVT::f80 ||
49173 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49174 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49175 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49176 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49177 Flags};
49178 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49179 }
49180 }
49181
49182 // If this is a select between two integer constants, try to do some
49183 // optimizations. Note that the operands are ordered the opposite of SELECT
49184 // operands.
49185 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49186 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49187 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49188 // larger than FalseC (the false value).
49189 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49190 CC = X86::GetOppositeBranchCondition(CC);
49191 std::swap(TrueC, FalseC);
49192 std::swap(TrueOp, FalseOp);
49193 }
49194
49195 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49196 // This is efficient for any integer data type (including i8/i16) and
49197 // shift amount.
49198 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49199 Cond = getSETCC(CC, Cond, DL, DAG);
49200
49201 // Zero extend the condition if needed.
49202 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49203
49204 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49205 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49206 DAG.getConstant(ShAmt, DL, MVT::i8));
49207 return Cond;
49208 }
49209
49210 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
49211 // for any integer data type, including i8/i16.
49212 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49213 Cond = getSETCC(CC, Cond, DL, DAG);
49214
49215 // Zero extend the condition if needed.
49216 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49217 FalseC->getValueType(0), Cond);
49218 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49219 SDValue(FalseC, 0));
49220 return Cond;
49221 }
49222
49223 // Optimize cases that will turn into an LEA instruction. This requires
49224 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49225 if (VT == MVT::i32 || VT == MVT::i64) {
49226 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49227 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49228 "Implicit constant truncation");
49229
49230 bool isFastMultiplier = false;
49231 if (Diff.ult(10)) {
49232 switch (Diff.getZExtValue()) {
49233 default: break;
49234 case 1: // result = add base, cond
49235 case 2: // result = lea base( , cond*2)
49236 case 3: // result = lea base(cond, cond*2)
49237 case 4: // result = lea base( , cond*4)
49238 case 5: // result = lea base(cond, cond*4)
49239 case 8: // result = lea base( , cond*8)
49240 case 9: // result = lea base(cond, cond*8)
49241 isFastMultiplier = true;
49242 break;
49243 }
49244 }
49245
49246 if (isFastMultiplier) {
49247 Cond = getSETCC(CC, Cond, DL, DAG);
49248 // Zero extend the condition if needed.
49249 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49250 Cond);
49251 // Scale the condition by the difference.
49252 if (Diff != 1)
49253 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49254 DAG.getConstant(Diff, DL, Cond.getValueType()));
49255
49256 // Add the base if non-zero.
49257 if (FalseC->getAPIntValue() != 0)
49258 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49259 SDValue(FalseC, 0));
49260 return Cond;
49261 }
49262 }
49263 }
49264 }
49265
49266 // Handle these cases:
49267 // (select (x != c), e, c) -> (select (x != c), e, x),
49268 // (select (x == c), c, e) -> (select (x == c), x, e)
49269 // where c is an integer constant, and the "select" is the combination
49270 // of CMOV and CMP.
49271 //
49272 // The rationale for this change is that a conditional-move from a constant
49273 // needs two instructions, whereas a conditional-move from a register needs
49274 // only one instruction.
49275 //
49276 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49277 // some instruction-combining opportunities. This opt needs to be
49278 // postponed as late as possible.
49279 //
49280 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49281 // the DCI.xxxx conditions are provided to postpone the optimization as
49282 // late as possible.
49283
49284 ConstantSDNode *CmpAgainst = nullptr;
49285 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49286 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49287 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49288
49289 if (CC == X86::COND_NE &&
49290 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49291 CC = X86::COND_E;
49292 std::swap(TrueOp, FalseOp);
49293 }
49294
49295 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49296 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49297 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49298 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49299 }
49300 }
49301 }
49302
49303 // Transform:
49304 //
49305 // (cmov 1 T (uge T 2))
49306 //
49307 // to:
49308 //
49309 // (adc T 0 (sub T 1))
49310 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49311 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49312 SDValue Cond0 = Cond.getOperand(0);
49313 if (Cond0.getOpcode() == ISD::TRUNCATE)
49314 Cond0 = Cond0.getOperand(0);
49315 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49316 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49317 EVT CondVT = Cond->getValueType(0);
49318 // Subtract 1 and generate a carry.
49319 SDValue NewSub =
49320 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49321 DAG.getConstant(1, DL, CondVT));
49322 SDValue EFLAGS(NewSub.getNode(), 1);
49323 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49324 DAG.getConstant(0, DL, VT), EFLAGS);
49325 }
49326 }
49327
49328 // Fold and/or of setcc's to double CMOV:
49329 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49330 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49331 //
49332 // This combine lets us generate:
49333 // cmovcc1 (jcc1 if we don't have CMOV)
49334 // cmovcc2 (same)
49335 // instead of:
49336 // setcc1
49337 // setcc2
49338 // and/or
49339 // cmovne (jne if we don't have CMOV)
49340 // When we can't use the CMOV instruction, it might increase branch
49341 // mispredicts.
49342 // When we can use CMOV, or when there is no mispredict, this improves
49343 // throughput and reduces register pressure.
49344 //
49345 if (CC == X86::COND_NE) {
49346 SDValue Flags;
49347 X86::CondCode CC0, CC1;
49348 bool isAndSetCC;
49349 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49350 if (isAndSetCC) {
49351 std::swap(FalseOp, TrueOp);
49352 CC0 = X86::GetOppositeBranchCondition(CC0);
49353 CC1 = X86::GetOppositeBranchCondition(CC1);
49354 }
49355
49356 SDValue LOps[] = {FalseOp, TrueOp,
49357 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49358 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49359 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49360 Flags};
49361 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49362 return CMOV;
49363 }
49364 }
49365
49366 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49367 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49368 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49369 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49370 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49371 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49372 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49373 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49374 SDValue Add = TrueOp;
49375 SDValue Const = FalseOp;
49376 // Canonicalize the condition code for easier matching and output.
49377 if (CC == X86::COND_E)
49378 std::swap(Add, Const);
49379
49380 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49381 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49382 Add.getResNo() == 0 && Add.hasOneUse() &&
49383 Add.getOperand(1) == Cond.getOperand(0)) {
49384 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49385 Add.getOperand(1));
49386 }
49387
49388 // We might have replaced the constant in the cmov with the LHS of the
49389 // compare. If so change it to the RHS of the compare.
49390 if (Const == Cond.getOperand(0))
49391 Const = Cond.getOperand(1);
49392
49393 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49394 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49395 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49396 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49397 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49398 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49399 // This should constant fold.
49400 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49401 SDValue CMov =
49402 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49403 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49404 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49405 }
49406 }
49407
49408 return SDValue();
49409}
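// A rough sketch of the arithmetic behind the constant-select folds above
// (illustrative only; Cond is the zero-extended setcc result, so it is 0 or 1):
//   Cond ? 8 : 0    ==  Cond << 3
//   Cond ? c+1 : c  ==  Cond + c
//   Cond ? T : F    ==  F + (T - F) * Cond    (T - F a fast LEA multiplier)
static_assert((1 << 3) == 8 && (0 << 3) == 0, "pow2/0 select as a shift");
static_assert(10 + (19 - 10) * 1 == 19 && 10 + (19 - 10) * 0 == 10,
              "select between constants as base + diff * cond");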
49410
49411/// Different mul shrinking modes.
49412 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49413
49414 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49415 EVT VT = N->getOperand(0).getValueType();
49416 if (VT.getScalarSizeInBits() != 32)
49417 return false;
49418
49419 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49420 unsigned SignBits[2] = {1, 1};
49421 bool IsPositive[2] = {false, false};
49422 for (unsigned i = 0; i < 2; i++) {
49423 SDValue Opd = N->getOperand(i);
49424
49425 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49426 IsPositive[i] = DAG.SignBitIsZero(Opd);
49427 }
49428
49429 bool AllPositive = IsPositive[0] && IsPositive[1];
49430 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49431 // When ranges are from -128 ~ 127, use MULS8 mode.
49432 if (MinSignBits >= 25)
49433 Mode = ShrinkMode::MULS8;
49434 // When ranges are from 0 ~ 255, use MULU8 mode.
49435 else if (AllPositive && MinSignBits >= 24)
49436 Mode = ShrinkMode::MULU8;
49437 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49438 else if (MinSignBits >= 17)
49439 Mode = ShrinkMode::MULS16;
49440 // When ranges are from 0 ~ 65535, use MULU16 mode.
49441 else if (AllPositive && MinSignBits >= 16)
49442 Mode = ShrinkMode::MULU16;
49443 else
49444 return false;
49445 return true;
49446}
49447
49448/// When the operands of vector mul are extended from smaller size values,
49449 /// like i8 and i16, the type of mul may be shrunk to generate more
49450/// efficient code. Two typical patterns are handled:
49451/// Pattern1:
49452/// %2 = sext/zext <N x i8> %1 to <N x i32>
49453/// %4 = sext/zext <N x i8> %3 to <N x i32>
49454 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49455/// %5 = mul <N x i32> %2, %4
49456///
49457/// Pattern2:
49458/// %2 = zext/sext <N x i16> %1 to <N x i32>
49459/// %4 = zext/sext <N x i16> %3 to <N x i32>
49460/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49461/// %5 = mul <N x i32> %2, %4
49462///
49463/// There are four mul shrinking modes:
49464/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49465 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49466/// generate pmullw+sext32 for it (MULS8 mode).
49467/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49468/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49469/// generate pmullw+zext32 for it (MULU8 mode).
49470/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49471/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49472/// generate pmullw+pmulhw for it (MULS16 mode).
49473/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49474/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49475/// generate pmullw+pmulhuw for it (MULU16 mode).
49476 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49477 const X86Subtarget &Subtarget) {
49478 // Check for legality.
49479 // pmullw/pmulhw require SSE2 (they are not available with SSE1 alone).
49480 if (!Subtarget.hasSSE2())
49481 return SDValue();
49482
49483 // Check for profitability
49484 // pmulld is supported since SSE41. It is better to use pmulld
49485 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49486 // the expansion.
49487 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49488 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49489 return SDValue();
49490
49491 ShrinkMode Mode;
49492 if (!canReduceVMulWidth(N, DAG, Mode))
49493 return SDValue();
49494
49495 SDValue N0 = N->getOperand(0);
49496 SDValue N1 = N->getOperand(1);
49497 EVT VT = N->getOperand(0).getValueType();
49498 unsigned NumElts = VT.getVectorNumElements();
49499 if ((NumElts % 2) != 0)
49500 return SDValue();
49501
49502 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49503
49504 // Shrink the operands of mul.
49505 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49506 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49507
49508 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49509 // lower part is needed.
49510 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49511 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49512 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49513 : ISD::SIGN_EXTEND,
49514 DL, VT, MulLo);
49515
49516 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49517 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49518 // the higher part is also needed.
49519 SDValue MulHi =
49520 DAG.getNode((Mode == ShrinkMode::MULS16) ? ISD::MULHS : ISD::MULHU, DL,
49521 ReducedVT, NewN0, NewN1);
49522
49523 // Repack the lower part and higher part result of mul into a wider
49524 // result.
49525 // Generate shuffle functioning as punpcklwd.
49526 SmallVector<int, 16> ShuffleMask(NumElts);
49527 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49528 ShuffleMask[2 * i] = i;
49529 ShuffleMask[2 * i + 1] = i + NumElts;
49530 }
49531 SDValue ResLo =
49532 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49533 ResLo = DAG.getBitcast(ResVT, ResLo);
49534 // Generate shuffle functioning as punpckhwd.
49535 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49536 ShuffleMask[2 * i] = i + NumElts / 2;
49537 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49538 }
49539 SDValue ResHi =
49540 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49541 ResHi = DAG.getBitcast(ResVT, ResHi);
49542 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49543}
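// Rough illustration of the MULU16/MULS16 repack above (illustrative only):
// each widened i32 product is rebuilt from its pmullw / pmulh(u)w halves as
// lo16 | (hi16 << 16). For example 50000 * 3 == 150000 == 0x000249F0, where
// pmullw produces 0x49F0 and pmulhuw produces 0x0002 for that lane:
static_assert(((0x0002u << 16) | 0x49F0u) == 150000u,
              "i32 product == (hi16 << 16) | lo16");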
49544
49545 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49546 EVT VT, const SDLoc &DL) {
49547
49548 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49549 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49550 DAG.getConstant(Mult, DL, VT));
49551 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49552 DAG.getConstant(Shift, DL, MVT::i8));
49553 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49554 N->getOperand(0));
49555 return Result;
49556 };
49557
49558 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49559 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49560 DAG.getConstant(Mul1, DL, VT));
49561 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49562 DAG.getConstant(Mul2, DL, VT));
49563 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49564 N->getOperand(0));
49565 return Result;
49566 };
49567
49568 switch (MulAmt) {
49569 default:
49570 break;
49571 case 11:
49572 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49573 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49574 case 21:
49575 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49576 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49577 case 41:
49578 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49579 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49580 case 22:
49581 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49582 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49583 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49584 case 19:
49585 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49586 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49587 case 37:
49588 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49589 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49590 case 73:
49591 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49592 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49593 case 13:
49594 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49595 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49596 case 23:
49597 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49598 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49599 case 26:
49600 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49601 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49602 case 28:
49603 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49604 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49605 case 29:
49606 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49607 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49608 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49609 }
49610
49611 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49612 // followed by a single LEA.
49613 // First check if this is a sum of two powers of 2 because that's easy. Then
49614 // count the trailing zeros up to the first set bit.
49615 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49616 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49617 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49618 if (ScaleShift >= 1 && ScaleShift < 4) {
49619 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49620 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49621 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49622 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49623 DAG.getConstant(ScaleShift, DL, MVT::i8));
49624 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49625 }
49626 }
49627
49628 return SDValue();
49629}
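// The decompositions above are plain integer identities; a few spot checks
// (illustrative only, using x == 7):
static_assert((((7 * 5) << 1) + 7) == 7 * 11, "x*11 == ((x*5) << 1) + x");
static_assert((((7 * 9) << 3) + 7) == 7 * 73, "x*73 == ((x*9) << 3) + x");
static_assert(((7 * 9) * 3 + 7 + 7) == 7 * 29, "x*29 == (x*9)*3 + x + x");
// And the "power of 2 plus 2/4/8" trick, e.g. 20 == 16 + 4:
static_assert(((7 << 4) + (7 << 2)) == 7 * 20, "x*20 == (x<<4) + (x<<2)");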
49630
49631 // If the upper 17 bits of one element are zero and the upper bits of the
49632 // other element are all zero/sign bits, then we can use PMADDWD, which is
49633 // always at least as quick as PMULLD, except on KNL.
49634 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49635 SelectionDAG &DAG,
49636 const X86Subtarget &Subtarget) {
49637 if (!Subtarget.hasSSE2())
49638 return SDValue();
49639
49640 if (Subtarget.isPMADDWDSlow())
49641 return SDValue();
49642
49643 EVT VT = N->getValueType(0);
49644
49645 // Only support vXi32 vectors.
49646 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49647 return SDValue();
49648
49649 // Make sure the type is legal or can split/widen to a legal type.
49650 // With AVX512 but without BWI, we would need to split v32i16.
49651 unsigned NumElts = VT.getVectorNumElements();
49652 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49653 return SDValue();
49654
49655 // With AVX512 but without BWI, we would need to split v32i16.
49656 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49657 return SDValue();
49658
49659 SDValue N0 = N->getOperand(0);
49660 SDValue N1 = N->getOperand(1);
49661
49662 // If we are zero/sign extending two steps without SSE4.1, it's better to
49663 // reduce the vmul width instead.
49664 if (!Subtarget.hasSSE41() &&
49665 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49666 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49667 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49668 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49669 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49670 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49671 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49672 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49673 return SDValue();
49674
49675 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49676 // the vmul width instead.
49677 if (!Subtarget.hasSSE41() &&
49678 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49679 N0.getOperand(0).getValueSizeInBits() > 128) &&
49680 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49681 N1.getOperand(0).getValueSizeInBits() > 128))
49682 return SDValue();
49683
49684 // Sign bits must extend down to the lowest i16.
49685 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49686 DAG.ComputeMaxSignificantBits(N0) > 16)
49687 return SDValue();
49688
49689 // At least one of the elements must be zero in the upper 17 bits, or can be
49690 // safely made zero without altering the final result.
49691 auto GetZeroableOp = [&](SDValue Op) {
49692 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49693 if (DAG.MaskedValueIsZero(Op, Mask17))
49694 return Op;
49695 // Mask off upper 16-bits of sign-extended constants.
49696 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49697 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49698 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49699 SDValue Src = Op.getOperand(0);
49700 // Convert sext(vXi16) to zext(vXi16).
49701 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49702 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49703 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49704 // which will expand the extension.
49705 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49706 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49707 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49708 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49709 }
49710 }
49711 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49712 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49713 N->isOnlyUserOf(Op.getNode())) {
49714 SDValue Src = Op.getOperand(0);
49715 if (Src.getScalarValueSizeInBits() == 16)
49716 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49717 }
49718 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49719 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49720 N->isOnlyUserOf(Op.getNode())) {
49721 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49722 Op.getOperand(1));
49723 }
49724 return SDValue();
49725 };
49726 SDValue ZeroN0 = GetZeroableOp(N0);
49727 SDValue ZeroN1 = GetZeroableOp(N1);
49728 if (!ZeroN0 && !ZeroN1)
49729 return SDValue();
49730 N0 = ZeroN0 ? ZeroN0 : N0;
49731 N1 = ZeroN1 ? ZeroN1 : N1;
49732
49733 // Use SplitOpsAndApply to handle AVX splitting.
49734 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49735 ArrayRef<SDValue> Ops) {
49736 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49737 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49738 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49739 DAG.getBitcast(OpVT, Ops[0]),
49740 DAG.getBitcast(OpVT, Ops[1]));
49741 };
49742 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49743}
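// Rough sketch of why the PMADDWD fold is legal (illustrative only): both
// operands fit in 16 significant bits and at least one has its upper 17 bits
// known zero. Viewing each i32 lane as a pair of i16 halves, that operand's
// lanes look like (lo16, 0), so VPMADDWD computes
//   lo16(a) * lo16(b)  +  0 * hi16(b)  ==  a * b
// per lane. The 17-bit (not 16-bit) requirement keeps lo16 non-negative as a
// signed i16. Spot check for lanes a = 300, b = -70 (hi16(b) == -1):
static_assert(300 * -70 + 0 * -1 == -21000, "pmaddwd pair sum == i32 product");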
49744
49745 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49746 const X86Subtarget &Subtarget) {
49747 if (!Subtarget.hasSSE2())
49748 return SDValue();
49749
49750 EVT VT = N->getValueType(0);
49751
49752 // Only support vXi64 vectors.
49753 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49754 VT.getVectorNumElements() < 2 ||
49755 !isPowerOf2_32(VT.getVectorNumElements()))
49756 return SDValue();
49757
49758 SDValue N0 = N->getOperand(0);
49759 SDValue N1 = N->getOperand(1);
49760
49761 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49762 // 32 bits. We can lower with this if the sign bits stretch that far.
49763 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49764 DAG.ComputeNumSignBits(N1) > 32) {
49765 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49766 ArrayRef<SDValue> Ops) {
49767 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49768 };
49769 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49770 /*CheckBWI*/ false);
49771 }
49772
49773 // If the upper bits are zero we can use a single pmuludq.
49774 APInt Mask = APInt::getHighBitsSet(64, 32);
49775 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49776 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49777 ArrayRef<SDValue> Ops) {
49778 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49779 };
49780 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49781 /*CheckBWI*/ false);
49782 }
49783
49784 return SDValue();
49785}
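// Rough sketch (illustrative only): PMULUDQ multiplies the low 32 bits of each
// i64 lane as unsigned and produces the full 64-bit product, so once the upper
// 32 bits of both operands are known zero it is exactly the i64 multiply:
static_assert(0xFFFFFFFFull * 2ull == 0x1FFFFFFFEull,
              "64-bit product of two zero-extended 32-bit values");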
49786
49787 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49788 TargetLowering::DAGCombinerInfo &DCI,
49789 const X86Subtarget &Subtarget) {
49790 EVT VT = N->getValueType(0);
49791 SDLoc DL(N);
49792
49793 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49794 return V;
49795
49796 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49797 return V;
49798
49799 if (DCI.isBeforeLegalize() && VT.isVector())
49800 return reduceVMULWidth(N, DL, DAG, Subtarget);
49801
49802 if (VT != MVT::i64 && VT != MVT::i32 &&
49803 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49804 return SDValue();
49805
49806 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49807 if (!Known1.isConstant())
49808 return SDValue();
49809
49810 const APInt &C = Known1.getConstant();
49811 if (C.isZero())
49812 return DAG.getConstant(0, DL, VT);
49813
49814 if (C.isAllOnes())
49815 return DAG.getNegative(N->getOperand(0), DL, VT);
49816
49817 if (isPowerOf2_64(C.getZExtValue()))
49818 return SDValue();
49819
49820 // Optimize a single multiply with constant into two operations in order to
49821 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49822 if (!MulConstantOptimization)
49823 return SDValue();
49824
49825 // An imul is usually smaller than the alternative sequence.
49826 if (DAG.getMachineFunction().getFunction().hasMinSize())
49827 return SDValue();
49828
49829 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49830 return SDValue();
49831
49832 int64_t SignMulAmt = C.getSExtValue();
49833 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49834 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49835
49836 SDValue NewMul = SDValue();
49837 if (VT == MVT::i64 || VT == MVT::i32) {
49838 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49839 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49840 DAG.getConstant(AbsMulAmt, DL, VT));
49841 if (SignMulAmt < 0)
49842 NewMul = DAG.getNegative(NewMul, DL, VT);
49843
49844 return NewMul;
49845 }
49846
49847 uint64_t MulAmt1 = 0;
49848 uint64_t MulAmt2 = 0;
49849 if ((AbsMulAmt % 9) == 0) {
49850 MulAmt1 = 9;
49851 MulAmt2 = AbsMulAmt / 9;
49852 } else if ((AbsMulAmt % 5) == 0) {
49853 MulAmt1 = 5;
49854 MulAmt2 = AbsMulAmt / 5;
49855 } else if ((AbsMulAmt % 3) == 0) {
49856 MulAmt1 = 3;
49857 MulAmt2 = AbsMulAmt / 3;
49858 }
49859
49860 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49861 if (MulAmt2 &&
49862 (isPowerOf2_64(MulAmt2) ||
49863 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49864
49865 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49866 N->user_begin()->getOpcode() == ISD::ADD))
49867 // If the second multiplier is a power of 2, issue it first. We want the multiply
49868 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49869 // use is an add. Only do this for positive multiply amounts since the
49870 // negate would prevent it from being used as an address mode anyway.
49871 std::swap(MulAmt1, MulAmt2);
49872
49873 if (isPowerOf2_64(MulAmt1))
49874 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49875 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49876 else
49877 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49878 DAG.getConstant(MulAmt1, DL, VT));
49879
49880 if (isPowerOf2_64(MulAmt2))
49881 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49882 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49883 else
49884 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49885 DAG.getConstant(MulAmt2, DL, VT));
49886
49887 // Negate the result.
49888 if (SignMulAmt < 0)
49889 NewMul = DAG.getNegative(NewMul, DL, VT);
49890 } else if (!Subtarget.slowLEA())
49891 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49892 }
49893 if (!NewMul) {
49894 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49895 if (isPowerOf2_64(AbsMulAmt - 1)) {
49896 // (mul x, 2^N + 1) => (add (shl x, N), x)
49897 NewMul = DAG.getNode(
49898 ISD::ADD, DL, VT, N->getOperand(0),
49899 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49900 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49901 if (SignMulAmt < 0)
49902 NewMul = DAG.getNegative(NewMul, DL, VT);
49903 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49904 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49905 NewMul =
49906 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49907 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49908 // To negate, reverse the operands of the subtract.
49909 if (SignMulAmt < 0)
49910 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49911 else
49912 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49913 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49914 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49915 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49916 NewMul =
49917 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49918 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49919 NewMul = DAG.getNode(
49920 ISD::ADD, DL, VT, NewMul,
49921 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49922 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49923 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49924 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49925 NewMul =
49926 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49927 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49928 NewMul = DAG.getNode(
49929 ISD::SUB, DL, VT, NewMul,
49930 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49931 } else if (SignMulAmt >= 0 && VT.isVector() &&
49932 Subtarget.fastImmVectorShift()) {
49933 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49934 uint64_t ShiftAmt1;
49935 std::optional<unsigned> Opc;
49936 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49937 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49938 Opc = ISD::ADD;
49939 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49940 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49941 Opc = ISD::SUB;
49942 }
49943
49944 if (Opc) {
49945 SDValue Shift1 =
49946 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49947 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49948 SDValue Shift2 =
49949 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49950 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49951 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49952 }
49953 }
49954 }
49955
49956 return NewMul;
49957}
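// Spot checks for the shift/add decompositions above (illustrative only, with
// x == 7):
static_assert(((7 << 4) + 7) == 7 * 17, "x*17 == (x<<4) + x");
static_assert(((7 << 4) - 7) == 7 * 15, "x*15 == (x<<4) - x");
static_assert(((7 << 4) + (7 + 7)) == 7 * 18, "x*18 == (x<<4) + (x+x)");
static_assert(((7 << 5) - (7 + 7)) == 7 * 30, "x*30 == (x<<5) - (x+x)");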
49958
49959// Try to form a MULHU or MULHS node by looking for
49960// (srl (mul ext, ext), 16)
49961// TODO: This is X86 specific because we want to be able to handle wide types
49962// before type legalization. But we can only do it if the vector will be
49963// legalized via widening/splitting. Type legalization can't handle promotion
49964// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49965// combiner.
49966 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49967 const SDLoc &DL,
49968 const X86Subtarget &Subtarget) {
49969 using namespace SDPatternMatch;
49970 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49971 "SRL or SRA node is required here!");
49972
49973 if (!Subtarget.hasSSE2())
49974 return SDValue();
49975
49976 // Input type should be at least vXi32.
49977 EVT VT = N->getValueType(0);
49978 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49979 return SDValue();
49980
49981 // The operation must be a multiply shifted right by 16.
49982 SDValue LHS, RHS;
49983 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49984 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49985 return SDValue();
49986
49987 unsigned ExtOpc = LHS.getOpcode();
49988 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49989 RHS.getOpcode() != ExtOpc)
49990 return SDValue();
49991
49992 // Peek through the extends.
49993 LHS = LHS.getOperand(0);
49994 RHS = RHS.getOperand(0);
49995
49996 // Ensure the input types match.
49997 EVT MulVT = LHS.getValueType();
49998 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49999 return SDValue();
50000
50001 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
50002 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
50003
50004 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50005 return DAG.getNode(ExtOpc, DL, VT, Mulh);
50006}
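// Rough sketch (illustrative only): for i16 inputs the matched pattern
//   srl/sra (mul (ext x), (ext y)), 16
// is the high half of the widened 16x16->32 product, i.e. MULHU/MULHS:
static_assert(((0xFFFFu * 0xFFFFu) >> 16) == 0xFFFEu,
              "high 16 bits of the widened product == MULHU result");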
50007
50008 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
50009 const X86Subtarget &Subtarget) {
50010 using namespace llvm::SDPatternMatch;
50011 SDValue N0 = N->getOperand(0);
50012 SDValue N1 = N->getOperand(1);
50013 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50014 EVT VT = N0.getValueType();
50015 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50016 SDLoc DL(N);
50017
50018 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50019 // with out-of-bounds clamping.
50020 if (N0.getOpcode() == ISD::VSELECT &&
50021 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50022 SDValue Cond = N0.getOperand(0);
50023 SDValue N00 = N0.getOperand(1);
50024 SDValue N01 = N0.getOperand(2);
50025 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50026 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50027 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50028 m_SpecificCondCode(ISD::SETULT)))) {
50029 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50030 }
50031 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50032 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50033 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50034 m_SpecificCondCode(ISD::SETUGE)))) {
50035 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50036 }
50037 }
50038
50039 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50040 // since the result of setcc_c is all zero's or all ones.
50041 if (VT.isInteger() && !VT.isVector() &&
50042 N1C && N0.getOpcode() == ISD::AND &&
50043 N0.getOperand(1).getOpcode() == ISD::Constant) {
50044 SDValue N00 = N0.getOperand(0);
50045 APInt Mask = N0.getConstantOperandAPInt(1);
50046 Mask <<= N1C->getAPIntValue();
50047 bool MaskOK = false;
50048 // We can handle cases concerning bit-widening nodes containing setcc_c if
50049 // we carefully interrogate the mask to make sure we are semantics
50050 // preserving.
50051 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50052 // of the underlying setcc_c operation if the setcc_c was zero extended.
50053 // Consider the following example:
50054 // zext(setcc_c) -> i32 0x0000FFFF
50055 // c1 -> i32 0x0000FFFF
50056 // c2 -> i32 0x00000001
50057 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50058 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50059 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50060 MaskOK = true;
50061 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50062 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50063 MaskOK = true;
50064 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50065 N00.getOpcode() == ISD::ANY_EXTEND) &&
50066 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50067 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50068 }
50069 if (MaskOK && Mask != 0)
50070 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50071 }
50072
50073 return SDValue();
50074}
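// A brief worked note on the VSHLV fold above (illustrative only): for a v8i32
// lane with shift amount 37, the matched vselect(icmp_ult(37, 32), x, 0)
// already yields 0, and the AVX2 variable shift VPSLLVD also yields 0 for any
// per-lane count >= 32, so dropping the select preserves the result for both
// in-range and out-of-range amounts.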
50075
50076 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50077 const X86Subtarget &Subtarget) {
50078 using namespace llvm::SDPatternMatch;
50079 SDValue N0 = N->getOperand(0);
50080 SDValue N1 = N->getOperand(1);
50081 EVT VT = N0.getValueType();
50082 unsigned Size = VT.getSizeInBits();
50083 SDLoc DL(N);
50084
50085 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50086 return V;
50087
50088 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50089 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50090 SDValue ShrAmtVal;
50091 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50092 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50093 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50094 }
50095
50096 // fold (SRA (SHL X, ShlConst), SraConst)
50097 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50098 // or (sext_in_reg X)
50099 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50100 // depending on relation between SraConst and ShlConst.
50101 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50102 // us to do the sext_in_reg from the corresponding bit.
50103
50104 // sexts on X86 are MOVs (movsx). The MOVs have the same code size as the
50105 // SHIFTs above (only a SHIFT by 1 has smaller code size).
50106 // However, the MOVs have 2 advantages over a SHIFT:
50107 // 1. MOVs can write to a register that differs from the source.
50108 // 2. MOVs accept memory operands.
50109
50110 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50111 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50112 N0.getOperand(1).getOpcode() != ISD::Constant)
50113 return SDValue();
50114
50115 SDValue N00 = N0.getOperand(0);
50116 SDValue N01 = N0.getOperand(1);
50117 APInt ShlConst = N01->getAsAPIntVal();
50118 APInt SraConst = N1->getAsAPIntVal();
50119 EVT CVT = N1.getValueType();
50120
50121 if (CVT != N01.getValueType())
50122 return SDValue();
50123 if (SraConst.isNegative())
50124 return SDValue();
50125
50126 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50127 unsigned ShiftSize = SVT.getSizeInBits();
50128 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50129 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50130 continue;
50131 SDValue NN =
50132 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50133 if (SraConst.eq(ShlConst))
50134 return NN;
50135 if (SraConst.ult(ShlConst))
50136 return DAG.getNode(ISD::SHL, DL, VT, NN,
50137 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50138 return DAG.getNode(ISD::SRA, DL, VT, NN,
50139 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50140 }
50141 return SDValue();
50142}
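// Worked example of the (SRA (SHL X, C1), C2) fold above (illustrative only),
// for i32 with ShlConst == 24 so that Size - ShlConst == 8:
//   C2 == 24 : (sra (shl x, 24), 24) -> (sext_in_reg x, i8)
//   C2 == 26 : (sra (shl x, 24), 26) -> (sra (sext_in_reg x, i8), 2)
//   C2 == 20 : (sra (shl x, 24), 20) -> (shl (sext_in_reg x, i8), 4)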
50143
50144 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50145 TargetLowering::DAGCombinerInfo &DCI,
50146 const X86Subtarget &Subtarget) {
50147 using namespace llvm::SDPatternMatch;
50148 SDValue N0 = N->getOperand(0);
50149 SDValue N1 = N->getOperand(1);
50150 EVT VT = N0.getValueType();
50151 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50152 SDLoc DL(N);
50153
50154 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50155 return V;
50156
50157 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50158 // with out-of-bounds clamping.
50159 if (N0.getOpcode() == ISD::VSELECT &&
50160 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50161 SDValue Cond = N0.getOperand(0);
50162 SDValue N00 = N0.getOperand(1);
50163 SDValue N01 = N0.getOperand(2);
50164 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50165 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50166 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50167 m_SpecificCondCode(ISD::SETULT)))) {
50168 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50169 }
50170 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50171 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50172 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50173 m_SpecificCondCode(ISD::SETUGE)))) {
50174 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50175 }
50176 }
50177
50178 // Only do this on the last DAG combine as it can interfere with other
50179 // combines.
50180 if (!DCI.isAfterLegalizeDAG())
50181 return SDValue();
50182
50183 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50184 // TODO: This is a generic DAG combine that became an x86-only combine to
50185 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50186 // and-not ('andn').
50187 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50188 return SDValue();
50189
50190 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50191 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50192 if (!ShiftC || !AndC)
50193 return SDValue();
50194
50195 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50196 // transform should reduce code size. It may also enable secondary transforms
50197 // from improved known-bits analysis or instruction selection.
50198 APInt MaskVal = AndC->getAPIntValue();
50199
50200 // If this can be matched by a zero extend, don't optimize.
50201 if (MaskVal.isMask()) {
50202 unsigned TO = MaskVal.countr_one();
50203 if (TO >= 8 && isPowerOf2_32(TO))
50204 return SDValue();
50205 }
50206
50207 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50208 unsigned OldMaskSize = MaskVal.getSignificantBits();
50209 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50210 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50211 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50212 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50213 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50214 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50215 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50216 }
50217 return SDValue();
50218}
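// The mask/shift reordering above is the plain identity
//   (x & C1) >> C2  ==  (x >> C2) & (C1 >> C2)
// and pays off when C1 >> C2 fits an 8-bit (or 32-bit) immediate while C1 does
// not. Spot check (illustrative only):
static_assert(((0xAB12u & 0xFF00u) >> 8) == ((0xAB12u >> 8) & 0xFFu),
              "srl (and x, 0xFF00), 8 == and (srl x, 8), 0xFF");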
50219
50220 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50221 const X86Subtarget &Subtarget) {
50222 unsigned Opcode = N->getOpcode();
50223 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50224
50225 SDLoc DL(N);
50226 EVT VT = N->getValueType(0);
50227 SDValue N0 = N->getOperand(0);
50228 SDValue N1 = N->getOperand(1);
50229 EVT SrcVT = N0.getValueType();
50230
50231 SDValue BC0 =
50232 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50233 SDValue BC1 =
50234 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50235
50236 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50237 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50238 // truncation trees that help us avoid lane crossing shuffles.
50239 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50240 // TODO: We don't handle vXf64 shuffles yet.
50241 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50242 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50243 SmallVector<SDValue> ShuffleOps;
50244 SmallVector<int> ShuffleMask, ScaledMask;
50245 SDValue Vec = peekThroughBitcasts(BCSrc);
50246 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50247 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50248 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50249 // shuffle to a v4X64 width - we can probably relax this in the future.
50250 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50251 ShuffleOps[0].getValueType().is256BitVector() &&
50252 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50253 SDValue Lo, Hi;
50254 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50255 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50256 Lo = DAG.getBitcast(SrcVT, Lo);
50257 Hi = DAG.getBitcast(SrcVT, Hi);
50258 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50259 Res = DAG.getBitcast(ShufVT, Res);
50260 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50261 return DAG.getBitcast(VT, Res);
50262 }
50263 }
50264 }
50265 }
50266
50267 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50268 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50269 // If either/both ops are a shuffle that can scale to v2x64,
50270 // then see if we can perform this as a v4x32 post shuffle.
50271 SmallVector<SDValue> Ops0, Ops1;
50272 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50273 bool IsShuf0 =
50274 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50275 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50276 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50277 bool IsShuf1 =
50278 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50279 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50280 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50281 if (IsShuf0 || IsShuf1) {
50282 if (!IsShuf0) {
50283 Ops0.assign({BC0});
50284 ScaledMask0.assign({0, 1});
50285 }
50286 if (!IsShuf1) {
50287 Ops1.assign({BC1});
50288 ScaledMask1.assign({0, 1});
50289 }
50290
50291 SDValue LHS, RHS;
50292 int PostShuffle[4] = {-1, -1, -1, -1};
50293 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50294 if (M < 0)
50295 return true;
50296 Idx = M % 2;
50297 SDValue Src = Ops[M / 2];
50298 if (!LHS || LHS == Src) {
50299 LHS = Src;
50300 return true;
50301 }
50302 if (!RHS || RHS == Src) {
50303 Idx += 2;
50304 RHS = Src;
50305 return true;
50306 }
50307 return false;
50308 };
50309 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50310 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50311 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50312 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50313 LHS = DAG.getBitcast(SrcVT, LHS);
50314 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50315 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50316 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50317 Res = DAG.getBitcast(ShufVT, Res);
50318 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50319 return DAG.getBitcast(VT, Res);
50320 }
50321 }
50322 }
50323
50324 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50325 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50326 SmallVector<int> Mask0, Mask1;
50327 SmallVector<SDValue> Ops0, Ops1;
50328 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50329 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50330 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50331 !Ops0.empty() && !Ops1.empty() &&
50332 all_of(Ops0,
50333 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50334 all_of(Ops1,
50335 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50336 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50337 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50338 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50339 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50340 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50341 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50342 if ((Op00 == Op11) && (Op01 == Op10)) {
50343 std::swap(Op10, Op11);
50344 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50345 }
50346 if ((Op00 == Op10) && (Op01 == Op11)) {
50347 const int Map[4] = {0, 2, 1, 3};
50348 SmallVector<int, 4> ShuffleMask(
50349 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50350 Map[ScaledMask1[1]]});
50351 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50352 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50353 DAG.getBitcast(SrcVT, Op01));
50354 Res = DAG.getBitcast(ShufVT, Res);
50355 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50356 return DAG.getBitcast(VT, Res);
50357 }
50358 }
50359 }
50360
50361 return SDValue();
50362}
50363
50364 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50365 TargetLowering::DAGCombinerInfo &DCI,
50366 const X86Subtarget &Subtarget) {
50367 unsigned Opcode = N->getOpcode();
50368 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50369 "Unexpected pack opcode");
50370
50371 EVT VT = N->getValueType(0);
50372 SDValue N0 = N->getOperand(0);
50373 SDValue N1 = N->getOperand(1);
50374 unsigned NumDstElts = VT.getVectorNumElements();
50375 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50376 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50377 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50378 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50379 "Unexpected PACKSS/PACKUS input type");
50380
50381 bool IsSigned = (X86ISD::PACKSS == Opcode);
50382
50383 // Constant Folding.
50384 APInt UndefElts0, UndefElts1;
50385 SmallVector<APInt, 32> EltBits0, EltBits1;
50386 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50387 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50388 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50389 /*AllowWholeUndefs*/ true,
50390 /*AllowPartialUndefs*/ true) &&
50391 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50392 /*AllowWholeUndefs*/ true,
50393 /*AllowPartialUndefs*/ true)) {
50394 unsigned NumLanes = VT.getSizeInBits() / 128;
50395 unsigned NumSrcElts = NumDstElts / 2;
50396 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50397 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50398
50399 APInt Undefs(NumDstElts, 0);
50400 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50401 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50402 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50403 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50404 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50405 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50406
50407 if (UndefElts[SrcIdx]) {
50408 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50409 continue;
50410 }
50411
50412 APInt &Val = EltBits[SrcIdx];
50413 if (IsSigned) {
50414 // PACKSS: Truncate signed value with signed saturation.
50415 // Source values less than dst minint are saturated to minint.
50416 // Source values greater than dst maxint are saturated to maxint.
50417 Val = Val.truncSSat(DstBitsPerElt);
50418 } else {
50419 // PACKUS: Truncate signed value with unsigned saturation.
50420 // Source values less than zero are saturated to zero.
50421 // Source values greater than dst maxuint are saturated to maxuint.
50422 // NOTE: This is different from APInt::truncUSat.
50423 if (Val.isIntN(DstBitsPerElt))
50424 Val = Val.trunc(DstBitsPerElt);
50425 else if (Val.isNegative())
50426 Val = APInt::getZero(DstBitsPerElt);
50427 else
50428 Val = APInt::getAllOnes(DstBitsPerElt);
50429 }
50430 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50431 }
50432 }
50433
50434 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50435 }
50436
50437 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50438 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50439 return V;
50440
50441 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50442 // Currently limit this to allsignbits cases only.
50443 if (IsSigned &&
50444 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50445 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50446 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50447 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50448 if (Not0 && Not1) {
50449 SDLoc DL(N);
50450 MVT SrcVT = N0.getSimpleValueType();
50451 SDValue Pack =
50452 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50453 DAG.getBitcast(SrcVT, Not1));
50454 return DAG.getNOT(DL, Pack, VT);
50455 }
50456 }
50457
50458 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50459 // truncate to create a larger truncate.
50460 if (Subtarget.hasAVX512() &&
50461 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50462 N0.getOperand(0).getValueType() == MVT::v8i32) {
50463 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50464 (!IsSigned &&
50465 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50466 if (Subtarget.hasVLX())
50467 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50468
50469 // Widen input to v16i32 so we can truncate that.
50470 SDLoc dl(N);
50471 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50472 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50473 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50474 }
50475 }
50476
50477 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50478 if (VT.is128BitVector()) {
50479 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50480 SDValue Src0, Src1;
50481 if (N0.getOpcode() == ExtOpc &&
50482 N0.getOperand(0).getValueType().is64BitVector() &&
50483 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50484 Src0 = N0.getOperand(0);
50485 }
50486 if (N1.getOpcode() == ExtOpc &&
50487 N1.getOperand(0).getValueType().is64BitVector() &&
50488 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50489 Src1 = N1.getOperand(0);
50490 }
50491 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50492 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50493 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50494 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50495 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50496 }
50497
50498 // Try again with pack(*_extend_vector_inreg, undef).
50499 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50500 : ISD::ZERO_EXTEND_VECTOR_INREG;
50501 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50502 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50503 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50504 DAG);
50505 }
50506
50507 // Attempt to combine as shuffle.
50508 SDValue Op(N, 0);
50509 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50510 return Res;
50511
50512 return SDValue();
50513}
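// Rough reference for the PACKSS/PACKUS constant folding above (illustrative
// only), packing i16 source values down to i8:
//   PACKSSWB (signed saturation):    300 -> 127,  -300 -> -128,  5 -> 5
//   PACKUSWB (unsigned saturation):  300 -> 255,    -5 -> 0,     5 -> 5
// APInt::truncUSat would instead treat -5 (0xFFFB) as a large unsigned value
// and saturate it to 255, which is why the code above handles the negative
// case explicitly.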
50514
50515 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50516 TargetLowering::DAGCombinerInfo &DCI,
50517 const X86Subtarget &Subtarget) {
50518 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50519 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50520 "Unexpected horizontal add/sub opcode");
50521
50522 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50523 MVT VT = N->getSimpleValueType(0);
50524 SDValue LHS = N->getOperand(0);
50525 SDValue RHS = N->getOperand(1);
50526
50527 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50528 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50529 LHS.getOpcode() == RHS.getOpcode() &&
50530 LHS.getValueType() == RHS.getValueType() &&
50531 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50532 SDValue LHS0 = LHS.getOperand(0);
50533 SDValue LHS1 = LHS.getOperand(1);
50534 SDValue RHS0 = RHS.getOperand(0);
50535 SDValue RHS1 = RHS.getOperand(1);
50536 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50537 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50538 SDLoc DL(N);
50539 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50540 LHS0.isUndef() ? LHS1 : LHS0,
50541 RHS0.isUndef() ? RHS1 : RHS0);
50542 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50543 Res = DAG.getBitcast(ShufVT, Res);
50544 SDValue NewLHS =
50545 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50546 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50547 SDValue NewRHS =
50548 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50549 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50550 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50551 DAG.getBitcast(VT, NewRHS));
50552 }
50553 }
50554 }
50555
50556 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50557 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50558 return V;
50559
50560 return SDValue();
50561}
50562
50563 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50564 TargetLowering::DAGCombinerInfo &DCI,
50565 const X86Subtarget &Subtarget) {
50566 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50567 X86ISD::VSRL == N->getOpcode()) &&
50568 "Unexpected shift opcode");
50569 EVT VT = N->getValueType(0);
50570 SDValue N0 = N->getOperand(0);
50571 SDValue N1 = N->getOperand(1);
50572
50573 // Shift zero -> zero.
50574 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50575 return DAG.getConstant(0, SDLoc(N), VT);
50576
50577 // Detect constant shift amounts.
50578 APInt UndefElts;
50579 SmallVector<APInt, 32> EltBits;
50580 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50581 /*AllowWholeUndefs*/ true,
50582 /*AllowPartialUndefs*/ false)) {
50583 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50584 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50585 EltBits[0].getZExtValue(), DAG);
50586 }
50587
50588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50589 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50590 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50591 return SDValue(N, 0);
50592
50593 return SDValue();
50594}
50595
50596 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50597 TargetLowering::DAGCombinerInfo &DCI,
50598 const X86Subtarget &Subtarget) {
50599 unsigned Opcode = N->getOpcode();
50600 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50601 X86ISD::VSRLI == Opcode) &&
50602 "Unexpected shift opcode");
50603 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50604 EVT VT = N->getValueType(0);
50605 SDValue N0 = N->getOperand(0);
50606 SDValue N1 = N->getOperand(1);
50607 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50608 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50609 "Unexpected value type");
50610 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50611
50612 // (shift undef, X) -> 0
50613 if (N0.isUndef())
50614 return DAG.getConstant(0, SDLoc(N), VT);
50615
50616 // Out of range logical bit shifts are guaranteed to be zero.
50617 // Out of range arithmetic bit shifts splat the sign bit.
50618 unsigned ShiftVal = N->getConstantOperandVal(1);
50619 if (ShiftVal >= NumBitsPerElt) {
50620 if (LogicalShift)
50621 return DAG.getConstant(0, SDLoc(N), VT);
50622 ShiftVal = NumBitsPerElt - 1;
50623 }
50624
50625 // (shift X, 0) -> X
50626 if (!ShiftVal)
50627 return N0;
50628
50629 // (shift 0, C) -> 0
50630 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50631 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50632 // result are all zeros, not undef.
50633 return DAG.getConstant(0, SDLoc(N), VT);
50634
50635 // (VSRAI -1, C) -> -1
50636 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50637 // N0 is all ones or undef. We guarantee that the bits shifted into the
50638 // result are all ones, not undef.
50639 return DAG.getAllOnesConstant(SDLoc(N), VT);
50640
50641 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50642 unsigned NewShiftVal = Amt0 + Amt1;
50643 if (NewShiftVal >= NumBitsPerElt) {
50644 // Out of range logical bit shifts are guaranteed to be zero.
50645 // Out of range arithmetic bit shifts splat the sign bit.
50646 if (LogicalShift)
50647 return DAG.getConstant(0, SDLoc(N), VT);
50648 NewShiftVal = NumBitsPerElt - 1;
50649 }
50650 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50651 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50652 };
50653
50654 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50655 if (Opcode == N0.getOpcode())
50656 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50657
50658 // (shl (add X, X), C) -> (shl X, (C + 1))
50659 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50660 N0.getOperand(0) == N0.getOperand(1))
50661 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50662
50663 // We can decode 'whole byte' logical bit shifts as shuffles.
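  // E.g. (illustrative) a v4i32 VSRLI by 16 moves whole bytes within each
  // element and pulls in zero bytes, so it can be modelled as a byte shuffle
  // and merged with surrounding shuffles.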
50664 if (LogicalShift && (ShiftVal % 8) == 0) {
50665 SDValue Op(N, 0);
50666 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50667 return Res;
50668 }
50669
50670 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50671 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50672 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50673 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50674 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50675 N0.getOpcode() == X86ISD::PSHUFD &&
50676 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50677 N0->hasOneUse()) {
50678    SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50679    if (BC.getOpcode() == X86ISD::VSHLI &&
50680 BC.getScalarValueSizeInBits() == 64 &&
50681 BC.getConstantOperandVal(1) == 63) {
50682 SDLoc DL(N);
50683 SDValue Src = BC.getOperand(0);
50684 Src = DAG.getBitcast(VT, Src);
50685 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50686 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50687 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50688 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50689 return Src;
50690 }
50691 }
50692
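  // Constant fold the shift when the operand is a build vector of constants,
  // forcing any undef elements to zero so the folded result is fully defined.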
50693 auto TryConstantFold = [&](SDValue V) {
50694 APInt UndefElts;
50695 SmallVector<APInt, 32> EltBits;
50696 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50697 /*AllowWholeUndefs*/ true,
50698 /*AllowPartialUndefs*/ true))
50699 return SDValue();
50700 assert(EltBits.size() == VT.getVectorNumElements() &&
50701 "Unexpected shift value type");
50702 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50703 // created an undef input due to no input bits being demanded, but user
50704 // still expects 0 in other bits.
50705 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50706 APInt &Elt = EltBits[i];
50707 if (UndefElts[i])
50708 Elt = 0;
50709 else if (X86ISD::VSHLI == Opcode)
50710 Elt <<= ShiftVal;
50711 else if (X86ISD::VSRAI == Opcode)
50712 Elt.ashrInPlace(ShiftVal);
50713 else
50714 Elt.lshrInPlace(ShiftVal);
50715 }
50716 // Reset undef elements since they were zeroed above.
50717 UndefElts = 0;
50718 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50719 };
50720
50721 // Constant Folding.
50722 if (N->isOnlyUserOf(N0.getNode())) {
50723 if (SDValue C = TryConstantFold(N0))
50724 return C;
50725
50726 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50727 // Don't break NOT patterns.
50728    SDValue BC = peekThroughOneUseBitcasts(N0);
50729    if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50730        BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50731        !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50732      if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50733 SDLoc DL(N);
50734 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50735 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50736 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50737 }
50738 }
50739 }
50740
50741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50742 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50743 DCI))
50744 return SDValue(N, 0);
50745
50746 return SDValue();
50747}
50748
50749static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50750                                   TargetLowering::DAGCombinerInfo &DCI,
50751                                   const X86Subtarget &Subtarget) {
50752 EVT VT = N->getValueType(0);
50753 unsigned Opcode = N->getOpcode();
50754 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50755 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50756 Opcode == ISD::INSERT_VECTOR_ELT) &&
50757 "Unexpected vector insertion");
50758
50759 SDValue Vec = N->getOperand(0);
50760 SDValue Scl = N->getOperand(1);
50761 SDValue Idx = N->getOperand(2);
50762
50763 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50764 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50765 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50766
50767 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50768 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50769 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50770 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50771 APInt::getAllOnes(NumBitsPerElt), DCI))
50772 return SDValue(N, 0);
50773 }
50774
50775 // Attempt to combine insertion patterns to a shuffle.
50776 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50777 SDValue Op(N, 0);
50778 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50779 return Res;
50780 }
50781
50782 return SDValue();
50783}
50784
50785/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50786/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50787/// OR -> CMPNEQSS.
50788static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50789                                   TargetLowering::DAGCombinerInfo &DCI,
50790                                   const X86Subtarget &Subtarget) {
50791 unsigned opcode;
50792
50793 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50794 // we're requiring SSE2 for both.
50795 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50796 SDValue N0 = N->getOperand(0);
50797 SDValue N1 = N->getOperand(1);
50798 SDValue CMP0 = N0.getOperand(1);
50799 SDValue CMP1 = N1.getOperand(1);
50800 SDLoc DL(N);
50801
50802 // The SETCCs should both refer to the same CMP.
50803 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50804 return SDValue();
50805
50806 SDValue CMP00 = CMP0->getOperand(0);
50807 SDValue CMP01 = CMP0->getOperand(1);
50808 EVT VT = CMP00.getValueType();
50809
50810 if (VT == MVT::f32 || VT == MVT::f64 ||
50811 (VT == MVT::f16 && Subtarget.hasFP16())) {
50812 bool ExpectingFlags = false;
50813 // Check for any users that want flags:
50814 for (const SDNode *U : N->users()) {
50815 if (ExpectingFlags)
50816 break;
50817
50818 switch (U->getOpcode()) {
50819 default:
50820 case ISD::BR_CC:
50821 case ISD::BRCOND:
50822 case ISD::SELECT:
50823 ExpectingFlags = true;
50824 break;
50825 case ISD::CopyToReg:
50826 case ISD::SIGN_EXTEND:
50827 case ISD::ZERO_EXTEND:
50828 case ISD::ANY_EXTEND:
50829 break;
50830 }
50831 }
50832
50833 if (!ExpectingFlags) {
50834 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50835 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50836
50837 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50838 X86::CondCode tmp = cc0;
50839 cc0 = cc1;
50840 cc1 = tmp;
50841 }
50842
50843 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50844 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50845 // FIXME: need symbolic constants for these magic numbers.
50846 // See X86ATTInstPrinter.cpp:printSSECC().
50847 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50848 if (Subtarget.hasAVX512()) {
50849 SDValue FSetCC =
50850 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50851 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50852 // Need to fill with zeros to ensure the bitcast will produce zeroes
50853 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50854 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50855 DAG.getConstant(0, DL, MVT::v16i1),
50856 FSetCC, DAG.getVectorIdxConstant(0, DL));
50857 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50858 N->getSimpleValueType(0));
50859 }
50860 SDValue OnesOrZeroesF =
50861 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50862 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50863
50864 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50865 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50866
50867 if (is64BitFP && !Subtarget.is64Bit()) {
50868 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50869 // 64-bit integer, since that's not a legal type. Since
50870 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50871 // bits, but can do this little dance to extract the lowest 32 bits
50872 // and work with those going forward.
50873 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50874 MVT::v2f64, OnesOrZeroesF);
50875 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50876 OnesOrZeroesF =
50877 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50878 DAG.getVectorIdxConstant(0, DL));
50879 IntVT = MVT::i32;
50880 }
50881
50882 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50883 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50884 DAG.getConstant(1, DL, IntVT));
50885 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50886 ANDed);
50887 return OneBitOfTruth;
50888 }
50889 }
50890 }
50891 }
50892 return SDValue();
50893}
50894
50895/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50896static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50897                                      SelectionDAG &DAG) {
50898 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50899
50900 MVT VT = N->getSimpleValueType(0);
50901 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50902 return SDValue();
50903
50904 SDValue X, Y;
50905 SDValue N0 = N->getOperand(0);
50906 SDValue N1 = N->getOperand(1);
50907
50908 if (SDValue Not = IsNOT(N0, DAG)) {
50909 X = Not;
50910 Y = N1;
50911 } else if (SDValue Not = IsNOT(N1, DAG)) {
50912 X = Not;
50913 Y = N0;
50914 } else
50915 return SDValue();
50916
50917 X = DAG.getBitcast(VT, X);
50918 Y = DAG.getBitcast(VT, Y);
50919 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50920}
50921
50922/// Try to fold:
50923/// and (vector_shuffle<Z,...,Z>
50924/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50925/// ->
50926/// andnp (vector_shuffle<Z,...,Z>
50927/// (insert_vector_elt undef, X, Z), undef), Y
50928static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50929                                    const X86Subtarget &Subtarget) {
50930 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50931
50932 EVT VT = N->getValueType(0);
50933 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50934 // value and require extra moves.
50935 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50936 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50937 return SDValue();
50938
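  // If V is a splat shuffle of a vector element that was inserted as NOT(X),
  // return the same splat shuffle of X (i.e. the NOT stripped from the
  // inserted scalar); otherwise return an empty SDValue.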
50939 auto GetNot = [&DAG](SDValue V) {
50940    auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50941    // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50942 // end-users are ISD::AND including cases
50943 // (and(extract_vector_element(SVN), Y)).
50944 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50945 !SVN->getOperand(1).isUndef()) {
50946 return SDValue();
50947 }
50948 SDValue IVEN = SVN->getOperand(0);
50949 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50950 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50951 return SDValue();
50952 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50953 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50954 return SDValue();
50955 SDValue Src = IVEN.getOperand(1);
50956 if (SDValue Not = IsNOT(Src, DAG)) {
50957 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50958 SDValue NotIVEN =
50959          DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50960                      IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50961 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50962 SVN->getOperand(1), SVN->getMask());
50963 }
50964 return SDValue();
50965 };
50966
50967 SDValue X, Y;
50968 SDValue N0 = N->getOperand(0);
50969 SDValue N1 = N->getOperand(1);
50970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50971
50972 if (SDValue Not = GetNot(N0)) {
50973 X = Not;
50974 Y = N1;
50975 } else if (SDValue Not = GetNot(N1)) {
50976 X = Not;
50977 Y = N0;
50978 } else
50979 return SDValue();
50980
50981 X = DAG.getBitcast(VT, X);
50982 Y = DAG.getBitcast(VT, Y);
50983 SDLoc DL(N);
50984
50985 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50986 // AVX2.
50987 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50988      TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50989    SDValue LoX, HiX;
50990 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50991 SDValue LoY, HiY;
50992 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50993 EVT SplitVT = LoX.getValueType();
50994 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50995 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50996 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50997 }
50998
50999 if (TLI.isTypeLegal(VT))
51000 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
51001
51002 return SDValue();
51003}
51004
51005// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
51006// logical operations, like in the example below.
51007// or (and (truncate x, truncate y)),
51008// (xor (truncate z, build_vector (constants)))
51009// Given a target type \p VT, we generate
51010// or (and x, y), (xor z, zext(build_vector (constants)))
51011// given that x, y and z are of type \p VT. We can do so if each operand is
51012// either a truncate from VT, a vector of constants, something that can be
51013// recursively promoted, or an existing extension we can extend further.
51014static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51015                                     SelectionDAG &DAG,
51016 const X86Subtarget &Subtarget,
51017 unsigned Depth) {
51018 // Limit recursion to avoid excessive compile times.
51019  if (Depth >= SelectionDAG::MaxRecursionDepth)
51020    return SDValue();
51021
51022 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51023 return SDValue();
51024
51025 SDValue N0 = N.getOperand(0);
51026 SDValue N1 = N.getOperand(1);
51027
51028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51029 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51030 return SDValue();
51031
51032 if (SDValue NN0 =
51033 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51034 N0 = NN0;
51035 else {
51036 // The left side has to be a 'trunc'.
51037 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51038 N0.getOperand(0).getValueType() == VT;
51039 if (LHSTrunc)
51040 N0 = N0.getOperand(0);
51041 else
51042 return SDValue();
51043 }
51044
51045 if (SDValue NN1 =
51046 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51047 N1 = NN1;
51048 else {
51049 // The right side has to be a 'trunc', a (foldable) constant or an
51050 // existing extension we can extend further.
51051 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51052 N1.getOperand(0).getValueType() == VT;
51053 if (RHSTrunc)
51054 N1 = N1.getOperand(0);
51055 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51056 Subtarget.hasInt256() && N1.hasOneUse())
51057 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51058 else if (SDValue Cst =
51059                 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51060      N1 = Cst;
51061 else
51062 return SDValue();
51063 }
51064
51065 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51066}
51067
51068// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51069// register. In most cases we actually compare or select YMM-sized registers
51070// and mixing the two types creates horrible code. This method optimizes
51071// some of the transition sequences.
51072// Even with AVX-512 this is still useful for removing casts around logical
51073// operations on vXi1 mask types.
51074static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51075                                     SelectionDAG &DAG,
51076 const X86Subtarget &Subtarget) {
51077 EVT VT = N.getValueType();
51078 assert(VT.isVector() && "Expected vector type");
51079 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51080 N.getOpcode() == ISD::ZERO_EXTEND ||
51081 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51082
51083 SDValue Narrow = N.getOperand(0);
51084 EVT NarrowVT = Narrow.getValueType();
51085
51086 // Generate the wide operation.
51087 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51088 if (!Op)
51089 return SDValue();
51090 switch (N.getOpcode()) {
51091 default: llvm_unreachable("Unexpected opcode");
51092 case ISD::ANY_EXTEND:
51093 return Op;
51094 case ISD::ZERO_EXTEND:
51095 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51096 case ISD::SIGN_EXTEND:
51097 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51098 Op, DAG.getValueType(NarrowVT));
51099 }
51100}
51101
51102static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51103 unsigned FPOpcode;
51104 switch (Opcode) {
51105 // clang-format off
51106 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51107 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51108 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51109 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51110 // clang-format on
51111 }
51112 return FPOpcode;
51113}
51114
51115/// If both input operands of a logic op are being cast from floating-point
51116/// types or FP compares, try to convert this into a floating-point logic node
51117/// to avoid unnecessary moves from SSE to integer registers.
51118static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51119 SDValue N0, SDValue N1,
51120 SelectionDAG &DAG,
51121                                        TargetLowering::DAGCombinerInfo &DCI,
51122                                        const X86Subtarget &Subtarget) {
51123 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51124 "Unexpected bit opcode");
51125
51126 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51127 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51128 return SDValue();
51129
51130 SDValue N00 = N0.getOperand(0);
51131 SDValue N10 = N1.getOperand(0);
51132 EVT N00Type = N00.getValueType();
51133 EVT N10Type = N10.getValueType();
51134
51135 // Ensure that both types are the same and are legal scalar fp types.
51136 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51137 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51138 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51139 return SDValue();
51140
51141 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51142 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51143 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51144 return DAG.getBitcast(VT, FPLogic);
51145 }
51146
51147 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51148 !N1.hasOneUse())
51149 return SDValue();
51150
51151 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51152 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51153
51154 // The vector ISA for FP predicates is incomplete before AVX, so converting
51155 // COMIS* to CMPS* may not be a win before AVX.
51156 if (!Subtarget.hasAVX() &&
51157 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51158 return SDValue();
51159
51160 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51161 // and vector logic:
51162 // logic (setcc N00, N01), (setcc N10, N11) -->
51163 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51164 unsigned NumElts = 128 / N00Type.getSizeInBits();
51165 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51166 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51167 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51168 SDValue N01 = N0.getOperand(1);
51169 SDValue N11 = N1.getOperand(1);
51170 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51171 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51172 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51173 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51174 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51175 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51176 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51177 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51178}
51179
51180// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51181// to reduce XMM->GPR traffic.
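// Both MOVMSK results have the same bit width here (same vector size and
// element size), so applying the bit op before the sign-mask extraction is
// equivalent and keeps the logic in the vector domain.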
51182static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51183 SDValue N1, SelectionDAG &DAG) {
51184 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51185 "Unexpected bit opcode");
51186
51187 // Both operands must be single use MOVMSK.
51188 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51189 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51190 return SDValue();
51191
51192 SDValue Vec0 = N0.getOperand(0);
51193 SDValue Vec1 = N1.getOperand(0);
51194 EVT VecVT0 = Vec0.getValueType();
51195 EVT VecVT1 = Vec1.getValueType();
51196
51197 // Both MOVMSK operands must be from vectors of the same size and same element
51198  // size, but it's OK for an fp/int difference.
51199 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51200 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51201 return SDValue();
51202
51203 unsigned VecOpc =
51204      VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51205  SDValue Result =
51206 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51207 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51208}
51209
51210// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51211// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51212// handles in InstCombine.
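// E.g. (illustrative) (xor (VSRLI X, 4), (VSRLI Y, 4)) becomes
// (VSRLI (xor X, Y), 4); this is only valid because both sides use the same
// shift opcode and the same shift amount.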
51213static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51214 SDValue N0, SDValue N1,
51215 SelectionDAG &DAG) {
51216 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51217 "Unexpected bit opcode");
51218
51219 // Both operands must be single use.
51220 if (!N0.hasOneUse() || !N1.hasOneUse())
51221 return SDValue();
51222
51223 // Search for matching shifts.
51224  SDValue BC0 = peekThroughOneUseBitcasts(N0);
51225  SDValue BC1 = peekThroughOneUseBitcasts(N1);
51226
51227 unsigned BCOpc = BC0.getOpcode();
51228 EVT BCVT = BC0.getValueType();
51229 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51230 return SDValue();
51231
51232 switch (BCOpc) {
51233 case X86ISD::VSHLI:
51234 case X86ISD::VSRLI:
51235 case X86ISD::VSRAI: {
51236 if (BC0.getOperand(1) != BC1.getOperand(1))
51237 return SDValue();
51238 SDValue BitOp =
51239 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51240 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51241 return DAG.getBitcast(VT, Shift);
51242 }
51243 }
51244
51245 return SDValue();
51246}
51247
51248// Attempt to fold:
51249// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51250// TODO: Handle PACKUS as well.
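// E.g. (illustrative) (or (PACKSS X, Z), (PACKSS Y, W)) becomes
// (PACKSS (or X, Y), (or Z, W)); the all-signbits check below guarantees the
// signed saturation in PACKSS cannot change any result element.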
51251static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51252 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51253 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51254 "Unexpected bit opcode");
51255
51256 // Both operands must be single use.
51257 if (!N0.hasOneUse() || !N1.hasOneUse())
51258 return SDValue();
51259
51260 // Search for matching packs.
51261  N0 = peekThroughOneUseBitcasts(N0);
51262  N1 = peekThroughOneUseBitcasts(N1);
51263
51264 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51265 return SDValue();
51266
51267 MVT DstVT = N0.getSimpleValueType();
51268 if (DstVT != N1.getSimpleValueType())
51269 return SDValue();
51270
51271 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51272 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51273
51274 // Limit to allsignbits packing.
51275 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51276 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51277 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51278 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51279 return SDValue();
51280
51281 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51282 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51283 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51284}
51285
51286/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51287/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51288/// with a shift-right to eliminate loading the vector constant mask value.
51289static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51290                                     SelectionDAG &DAG,
51291 const X86Subtarget &Subtarget) {
51292 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51293 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51294 EVT VT = Op0.getValueType();
51295 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51296 return SDValue();
51297
51298 // Try to convert an "is positive" signbit masking operation into arithmetic
51299 // shift and "andn". This saves a materialization of a -1 vector constant.
51300 // The "is negative" variant should be handled more generally because it only
51301 // requires "and" rather than "andn":
51302 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51303 //
51304 // This is limited to the original type to avoid producing even more bitcasts.
51305 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51306 // will be profitable.
51307 if (N->getValueType(0) == VT &&
51308 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51309 SDValue X, Y;
51310 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51311 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51312 X = Op1.getOperand(0);
51313 Y = Op0;
51314 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51315 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51316 X = Op0.getOperand(0);
51317 Y = Op1;
51318 }
51319 if (X && Y) {
51320 SDValue Sra =
51321          getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51322                                     VT.getScalarSizeInBits() - 1, DAG);
51323 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51324 }
51325 }
51326
51327 APInt SplatVal;
51328 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51329 return SDValue();
51330
51331 // Don't prevent creation of ANDN.
51332 if (isBitwiseNot(Op0))
51333 return SDValue();
51334
51335 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51336 return SDValue();
51337
51338 unsigned EltBitWidth = VT.getScalarSizeInBits();
51339 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51340 return SDValue();
51341
51342 unsigned ShiftVal = SplatVal.countr_one();
51343 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51344 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51345 return DAG.getBitcast(N->getValueType(0), Shift);
51346}
51347
51348// Get the index node from the lowered DAG of a GEP IR instruction with one
51349// indexing dimension.
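// I.e. if the load address has the form (add (shl Index, Scale), Base) this
// returns Index, otherwise an empty SDValue.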
51350static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51351  if (Ld->isIndexed())
51352 return SDValue();
51353
51354 SDValue Base = Ld->getBasePtr();
51355 if (Base.getOpcode() != ISD::ADD)
51356 return SDValue();
51357
51358 SDValue ShiftedIndex = Base.getOperand(0);
51359 if (ShiftedIndex.getOpcode() != ISD::SHL)
51360 return SDValue();
51361
51362 return ShiftedIndex.getOperand(0);
51363}
51364
51365static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51366 return Subtarget.hasBMI2() &&
51367 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51368}
51369
51370/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51371/// This undoes the inverse fold performed in InstCombine
51372static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51373                                            SelectionDAG &DAG) {
51374 using namespace llvm::SDPatternMatch;
51375 MVT VT = N->getSimpleValueType(0);
51376 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51377 return SDValue();
51378
51379 SDValue X, Y, Z;
51380 if (sd_match(N, m_And(m_Value(X),
51381 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51382 // Don't fold if Y or Z are constants to prevent infinite loops.
51383    if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51384        !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51385      return DAG.getNode(
51386 ISD::AND, DL, VT, X,
51387 DAG.getNOT(
51388 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51389 }
51390
51391 return SDValue();
51392}
51393
51394// This function recognizes cases where the X86 bzhi instruction can replace an
51395// 'and-load' sequence.
51396// In case of loading integer value from an array of constants which is defined
51397// as follows:
51398//
51399// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51400//
51401// then applying a bitwise and on the result with another input.
51402// It's equivalent to performing bzhi (zero high bits) on the input, with the
51403// same index of the load.
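// E.g. (illustrative) 'x & array[idx]' with array[i] == (1u << i) - 1 is
// rewritten below (for the i32 case) as 'x & (0xFFFFFFFF >> (32 - idx))',
// which instruction selection then matches to a single BZHI.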
51404static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51405                                    const X86Subtarget &Subtarget) {
51406 MVT VT = Node->getSimpleValueType(0);
51407 SDLoc dl(Node);
51408
51409 // Check if subtarget has BZHI instruction for the node's type
51410 if (!hasBZHI(Subtarget, VT))
51411 return SDValue();
51412
51413 // Try matching the pattern for both operands.
51414 for (unsigned i = 0; i < 2; i++) {
51415 // continue if the operand is not a load instruction
51416 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51417 if (!Ld)
51418 continue;
51419 const Value *MemOp = Ld->getMemOperand()->getValue();
51420 if (!MemOp)
51421 continue;
51422 // Get the Node which indexes into the array.
51423    SDValue Index = getIndexFromUnindexedLoad(Ld);
51424    if (!Index)
51425 continue;
51426
51427 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51428 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51429 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51430 Constant *Init = GV->getInitializer();
51431 Type *Ty = Init->getType();
51432          if (!Ty->isArrayTy() ||
51433              !Ty->getArrayElementType()->isIntegerTy() ||
51434 Ty->getArrayElementType()->getScalarSizeInBits() !=
51435 VT.getSizeInBits() ||
51436 Ty->getArrayNumElements() >
51437 Ty->getArrayElementType()->getScalarSizeInBits())
51438 continue;
51439
51440 // Check if the array's constant elements are suitable to our case.
51441 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51442 bool ConstantsMatch = true;
51443 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51444 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51445 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51446 ConstantsMatch = false;
51447 break;
51448 }
51449 }
51450 if (!ConstantsMatch)
51451 continue;
51452
51453 // Do the transformation (For 32-bit type):
51454 // -> (and (load arr[idx]), inp)
51455 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51456 // that will be replaced with one bzhi instruction.
51457 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51458 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51459
51460 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51461 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51462 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51463
51464 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51465 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51466 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51467 }
51468 }
51469 }
51470 }
51471 return SDValue();
51472}
51473
51474// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51475// where C is a mask containing the same number of bits as the setcc and
51476// where the setcc will freely zero the upper bits of the k-register. We can
51477// replace the undef in the concat with 0s and remove the AND. This mainly
51478// helps with v2i1/v4i1 setcc being cast to scalar.
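// E.g. (illustrative) (and (i8 bitcast (v8i1 concat (v2i1 setcc), undef, ...)), 3)
// only demands the two setcc bits, so the undef subvectors can be replaced by
// zeroes and the AND dropped entirely.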
51479static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51480                                             const X86Subtarget &Subtarget) {
51481 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51482
51483 EVT VT = N->getValueType(0);
51484
51485 // Make sure this is an AND with constant. We will check the value of the
51486 // constant later.
51487 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51488 if (!C1)
51489 return SDValue();
51490
51491 // This is implied by the ConstantSDNode.
51492 assert(!VT.isVector() && "Expected scalar VT!");
51493
51494 SDValue Src = N->getOperand(0);
51495 if (!Src.hasOneUse())
51496 return SDValue();
51497
51498 // (Optionally) peek through any_extend().
51499 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51500 if (!Src.getOperand(0).hasOneUse())
51501 return SDValue();
51502 Src = Src.getOperand(0);
51503 }
51504
51505 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51506 return SDValue();
51507
51508 Src = Src.getOperand(0);
51509 EVT SrcVT = Src.getValueType();
51510
51511 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51512 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51513 !TLI.isTypeLegal(SrcVT))
51514 return SDValue();
51515
51516 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51517 return SDValue();
51518
51519 // We only care about the first subvector of the concat, we expect the
51520 // other subvectors to be ignored due to the AND if we make the change.
51521 SDValue SubVec = Src.getOperand(0);
51522 EVT SubVecVT = SubVec.getValueType();
51523
51524 // The RHS of the AND should be a mask with as many bits as SubVec.
51525 if (!TLI.isTypeLegal(SubVecVT) ||
51526 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51527 return SDValue();
51528
51529 // First subvector should be a setcc with a legal result type or a
51530 // AND containing at least one setcc with a legal result type.
51531 auto IsLegalSetCC = [&](SDValue V) {
51532 if (V.getOpcode() != ISD::SETCC)
51533 return false;
51534 EVT SetccVT = V.getOperand(0).getValueType();
51535 if (!TLI.isTypeLegal(SetccVT) ||
51536 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51537 return false;
51538 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51539 return false;
51540 return true;
51541 };
51542 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51543 (IsLegalSetCC(SubVec.getOperand(0)) ||
51544 IsLegalSetCC(SubVec.getOperand(1))))))
51545 return SDValue();
51546
51547 // We passed all the checks. Rebuild the concat_vectors with zeroes
51548 // and cast it back to VT.
51549 SDLoc dl(N);
51550 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51551 DAG.getConstant(0, dl, SubVecVT));
51552 Ops[0] = SubVec;
51553 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51554 Ops);
51555 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51556 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51557}
51558
51559static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51560                                SDValue OpMustEq, SDValue Op, unsigned Depth) {
51561 // We don't want to go crazy with the recursion here. This isn't a super
51562 // important optimization.
51563 static constexpr unsigned kMaxDepth = 2;
51564
51565 // Only do this re-ordering if op has one use.
51566 if (!Op.hasOneUse())
51567 return SDValue();
51568
51569 SDLoc DL(Op);
51570  // If we hit another associative op, recurse further.
51571 if (Op.getOpcode() == Opc) {
51572 // Done recursing.
51573 if (Depth++ >= kMaxDepth)
51574 return SDValue();
51575
51576 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51577 if (SDValue R =
51578 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51579 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51580 Op.getOperand(1 - OpIdx));
51581
51582 } else if (Op.getOpcode() == ISD::SUB) {
51583 if (Opc == ISD::AND) {
51584 // BLSI: (and x, (sub 0, x))
51585 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51586 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51587 }
51588 // Opc must be ISD::AND or ISD::XOR
51589 // BLSR: (and x, (sub x, 1))
51590 // BLSMSK: (xor x, (sub x, 1))
51591 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51592 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51593
51594 } else if (Op.getOpcode() == ISD::ADD) {
51595 // Opc must be ISD::AND or ISD::XOR
51596 // BLSR: (and x, (add x, -1))
51597 // BLSMSK: (xor x, (add x, -1))
51598 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51599 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51600 }
51601 return SDValue();
51602}
51603
51604static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51605                                 const X86Subtarget &Subtarget) {
51606 EVT VT = N->getValueType(0);
51607 // Make sure this node is a candidate for BMI instructions.
51608 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51609 (VT != MVT::i32 && VT != MVT::i64))
51610 return SDValue();
51611
51612 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51613
51614 // Try and match LHS and RHS.
51615 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51616 if (SDValue OpMatch =
51617 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51618 N->getOperand(1 - OpIdx), 0))
51619 return OpMatch;
51620 return SDValue();
51621}
51622
51623/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51624static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51625                                       SelectionDAG &DAG,
51626 const X86Subtarget &Subtarget) {
51627 using namespace llvm::SDPatternMatch;
51628
51629 EVT VT = And->getValueType(0);
51630 // Make sure this node is a candidate for BMI instructions.
51631 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51632 return SDValue();
51633
51634 SDValue X;
51635 SDValue Y;
51636  if (!sd_match(And,
51637                m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51638                      m_Value(Y))))
51639 return SDValue();
51640
51641 SDValue BLSMSK =
51642 DAG.getNode(ISD::XOR, DL, VT, X,
51643 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51644 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51645 return AndN;
51646}
51647
51648static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51649                                        SelectionDAG &DAG,
51650                                        TargetLowering::DAGCombinerInfo &DCI,
51651                                        const X86Subtarget &ST) {
51652 // cmp(setcc(cc, X), 0)
51653 // brcond ne
51654 // ->
51655 // X
51656 // brcond cc
51657
51658 // sub(setcc(cc, X), 1)
51659 // brcond ne
51660 // ->
51661 // X
51662 // brcond ~cc
51663 //
51664 // if only flag has users
51665
51666 SDValue SetCC = N->getOperand(0);
51667
51668 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51669 return SDValue();
51670
51671 // Check the only user of flag is `brcond ne`.
51672 SDNode *BrCond = *Flag->user_begin();
51673 if (BrCond->getOpcode() != X86ISD::BRCOND)
51674 return SDValue();
51675 unsigned CondNo = 2;
51676 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51677      X86::COND_NE)
51678    return SDValue();
51679
51680 SDValue X = SetCC.getOperand(1);
51681  // sub has two results while X only has one. DAG combine assumes the value
51682 // type matches.
51683 if (N->getOpcode() == X86ISD::SUB)
51684 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51685
51686 SDValue CCN = SetCC.getOperand(0);
51687 X86::CondCode CC =
51688 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51689  X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51690  // Update CC for the consumer of the flag.
51691 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51692 // checking if the second condition evaluates to true. When comparing the
51693  // result with 1, we are checking if the second condition evaluates to false.
51694  SmallVector<SDValue> Ops(BrCond->op_values());
51695  if (isNullConstant(N->getOperand(1)))
51696 Ops[CondNo] = CCN;
51697 else if (isOneConstant(N->getOperand(1)))
51698 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51699 else
51700 llvm_unreachable("expect constant 0 or 1");
51701
51702 SDValue NewBrCond =
51703 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51704 // Avoid self-assign error b/c CC1 can be `e/ne`.
51705 if (BrCond != NewBrCond.getNode())
51706 DCI.CombineTo(BrCond, NewBrCond);
51707 return X;
51708}
51709
51710static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51711                                        TargetLowering::DAGCombinerInfo &DCI,
51712                                        const X86Subtarget &ST) {
51713 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51714 // ->
51715 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51716
51717 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51718 // ->
51719 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51720 //
51721 // where cflags is determined by cc1.
51722
51723 if (!ST.hasCCMP())
51724 return SDValue();
51725
51726 SDValue SetCC0 = N->getOperand(0);
51727 SDValue SetCC1 = N->getOperand(1);
51728 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51729 SetCC1.getOpcode() != X86ISD::SETCC)
51730 return SDValue();
51731
51732 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51733 SDValue Op = V.getOperand(1);
51734 unsigned Opc = Op.getOpcode();
51735 if (Opc == X86ISD::SUB)
51736 return X86ISD::CCMP;
51737 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51738 return X86ISD::CTEST;
51739 return 0U;
51740 };
51741
51742 unsigned NewOpc = 0;
51743
51744 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51745 // appear on the right.
51746 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51747 std::swap(SetCC0, SetCC1);
51748 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51749 return SDValue();
51750 }
51751
51752 X86::CondCode CC0 =
51753 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51754 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51755 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51756 return SDValue();
51757
51758 bool IsOR = N->getOpcode() == ISD::OR;
51759
51760 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51761  // evaluates to true. So we need to use the inverse of CC0 as SrcCC when the
51762  // logic operator is OR. Similar for CC1.
51763 SDValue SrcCC =
51764      IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51765                                   SDLoc(SetCC0.getOperand(0)), MVT::i8)
51766 : SetCC0.getOperand(0);
51767 SDValue CC1N = SetCC1.getOperand(0);
51768 X86::CondCode CC1 =
51769 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51770  X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51771  X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51772 SDLoc DL(N);
51773 SDValue CFlags = DAG.getTargetConstant(
51774 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51775 SDValue Sub = SetCC1.getOperand(1);
51776
51777 // Replace any uses of the old flag produced by SUB/CMP with the new one
51778 // produced by CCMP/CTEST.
51779 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51780 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51781 {Sub.getOperand(0), Sub.getOperand(1),
51782 CFlags, SrcCC, SetCC0.getOperand(1)})
51783 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51784 {Sub.getOperand(0), Sub.getOperand(0),
51785 CFlags, SrcCC, SetCC0.getOperand(1)});
51786
51787 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51788}
51789
51790static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51791                          TargetLowering::DAGCombinerInfo &DCI,
51792                          const X86Subtarget &Subtarget) {
51793 using namespace SDPatternMatch;
51794
51795 SDValue N0 = N->getOperand(0);
51796 SDValue N1 = N->getOperand(1);
51797 EVT VT = N->getValueType(0);
51798 SDLoc dl(N);
51799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51800
51801 // If this is SSE1 only convert to FAND to avoid scalarization.
51802 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51803 return DAG.getBitcast(MVT::v4i32,
51804 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51805 DAG.getBitcast(MVT::v4f32, N0),
51806 DAG.getBitcast(MVT::v4f32, N1)));
51807 }
51808
51809 // Use a 32-bit and+zext if upper bits known zero.
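  // E.g. (illustrative) if the top 32 bits of either operand of an i64 AND are
  // known zero, the AND can be done as a 32-bit op on the truncated operands;
  // the implicit zero-extension of 32-bit ops makes the ZERO_EXTEND free.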
51810 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51811 APInt HiMask = APInt::getHighBitsSet(64, 32);
51812 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51813 DAG.MaskedValueIsZero(N0, HiMask)) {
51814 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51815 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51816 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51817 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51818 }
51819 }
51820
51821 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51822 // TODO: Support multiple SrcOps.
51823 if (VT == MVT::i1) {
51824    SmallVector<SDValue, 2> SrcOps;
51825    SmallVector<APInt, 2> SrcPartials;
51826 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51827 SrcOps.size() == 1) {
51828 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51829 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51830 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51831 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51832 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51833 if (Mask) {
51834 assert(SrcPartials[0].getBitWidth() == NumElts &&
51835 "Unexpected partial reduction mask");
51836 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51837 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51838 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51839 }
51840 }
51841 }
51842
51843 // InstCombine converts:
51844 // `(-x << C0) & C1`
51845 // to
51846 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51847 // This saves an IR instruction but on x86 the neg/shift version is preferable
51848 // so undo the transform.
51849
51850 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51851 // TODO: We don't actually need a splat for this, we just need the checks to
51852 // hold for each element.
51853 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51854 /*AllowTruncation*/ false);
51855 ConstantSDNode *N01C =
51856 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51857 /*AllowTruncation*/ false);
51858 if (N1C && N01C) {
51859 const APInt &MulC = N01C->getAPIntValue();
51860 const APInt &AndC = N1C->getAPIntValue();
51861 APInt MulCLowBit = MulC & (-MulC);
51862 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51863 (MulCLowBit + MulC).isPowerOf2()) {
51864 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51865 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51866 assert(MulCLowBitLog != -1 &&
51867 "Isolated lowbit is somehow not a power of 2!");
51868 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51869 DAG.getConstant(MulCLowBitLog, dl, VT));
51870 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51871 }
51872 }
51873 }
51874
51875 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51876 return SetCC;
51877
51878 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51879 return V;
51880
51881 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51882 return R;
51883
51884 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51885 return R;
51886
51887 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51888 return R;
51889
51890 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51891 DAG, DCI, Subtarget))
51892 return FPLogic;
51893
51894 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51895 return R;
51896
51897 if (DCI.isBeforeLegalizeOps())
51898 return SDValue();
51899
51900 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51901 return R;
51902
51903  if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51904 return R;
51905
51906 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51907 return ShiftRight;
51908
51909 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51910 return R;
51911
51912 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51913 return R;
51914
51915 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51916 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51917 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51918 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51919 unsigned Opc0 = N0.getOpcode();
51920 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51921        getTargetConstantFromNode(N0.getOperand(1)) &&
51922        DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51923 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51924 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51925 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51926 }
51927 }
51928
51929 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51930 // to make use of predicated selects.
51931 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51932 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51933 SDValue X, Y;
51934 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51935 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51936 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51937 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51938        sd_match(N, m_And(m_Value(X),
51939                          m_OneUse(m_SExt(m_AllOf(
51940                              m_Value(Y), m_SpecificVT(CondVT),
51941 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51942 return DAG.getSelect(dl, VT, Y, X,
51943 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51944 }
51945 }
51946
51947 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51948  // to avoid a slow variable shift (moving the shift amount into ECX etc.).
51949 if (isOneConstant(N1) && N0->hasOneUse()) {
51950 SDValue Src = N0;
51951 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51952 Src.getOpcode() == ISD::TRUNCATE) &&
51953 Src.getOperand(0)->hasOneUse())
51954 Src = Src.getOperand(0);
51955 bool ContainsNOT = false;
51956 X86::CondCode X86CC = X86::COND_B;
51957 // Peek through AND(NOT(SRL(X,Y)),1).
51958 if (isBitwiseNot(Src)) {
51959 Src = Src.getOperand(0);
51960 X86CC = X86::COND_AE;
51961 ContainsNOT = true;
51962 }
51963 if (Src.getOpcode() == ISD::SRL &&
51964 !isa<ConstantSDNode>(Src.getOperand(1))) {
51965 SDValue BitNo = Src.getOperand(1);
51966 Src = Src.getOperand(0);
51967 // Peek through AND(SRL(NOT(X),Y),1).
51968 if (isBitwiseNot(Src)) {
51969 Src = Src.getOperand(0);
51970 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51971 ContainsNOT = true;
51972 }
51973 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51974 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51975 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51976 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51977 }
51978 }
51979
51980 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51981 // Attempt to recursively combine a bitmask AND with shuffles.
51982 SDValue Op(N, 0);
51983 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51984 return Res;
51985
51986 // If either operand is a constant mask, then only the elements that aren't
51987 // zero are actually demanded by the other operand.
51988 auto GetDemandedMasks = [&](SDValue Op) {
51989 APInt UndefElts;
51990 SmallVector<APInt> EltBits;
51991 int NumElts = VT.getVectorNumElements();
51992 int EltSizeInBits = VT.getScalarSizeInBits();
51993 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51994 APInt DemandedElts = APInt::getAllOnes(NumElts);
51995 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51996 EltBits)) {
51997 DemandedBits.clearAllBits();
51998 DemandedElts.clearAllBits();
51999 for (int I = 0; I != NumElts; ++I) {
52000 if (UndefElts[I]) {
52001 // We can't assume an undef src element gives an undef dst - the
52002 // other src might be zero.
52003 DemandedBits.setAllBits();
52004 DemandedElts.setBit(I);
52005 } else if (!EltBits[I].isZero()) {
52006 DemandedBits |= EltBits[I];
52007 DemandedElts.setBit(I);
52008 }
52009 }
52010 }
52011 return std::make_pair(DemandedBits, DemandedElts);
52012 };
52013 APInt Bits0, Elts0;
52014 APInt Bits1, Elts1;
52015 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52016 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52017
52018 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52019 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52020 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52021 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52022 if (N->getOpcode() != ISD::DELETED_NODE)
52023 DCI.AddToWorklist(N);
52024 return SDValue(N, 0);
52025 }
52026
52027 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52028 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52029 if (NewN0 || NewN1)
52030 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52031 NewN1 ? NewN1 : N1);
52032 }
52033
52034 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52035 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52036      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52037      isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52038 SDValue BitMask = N1;
52039 SDValue SrcVec = N0.getOperand(0);
52040 EVT SrcVecVT = SrcVec.getValueType();
52041
52042 // Check that the constant bitmask masks whole bytes.
52043 APInt UndefElts;
52044 SmallVector<APInt, 64> EltBits;
52045 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52046 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52047 llvm::all_of(EltBits, [](const APInt &M) {
52048 return M.isZero() || M.isAllOnes();
52049 })) {
52050 unsigned NumElts = SrcVecVT.getVectorNumElements();
52051 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52052 unsigned Idx = N0.getConstantOperandVal(1);
52053
52054 // Create a root shuffle mask from the byte mask and the extracted index.
52055 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52056 for (unsigned i = 0; i != Scale; ++i) {
52057 if (UndefElts[i])
52058 continue;
52059 int VecIdx = Scale * Idx + i;
52060 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52061 }
52062
52063    if (SDValue Shuffle = combineX86ShufflesRecursively(
52064            {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52065 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52066 /*AllowVariableCrossLaneMask=*/true,
52067 /*AllowVariablePerLaneMask=*/true,
52068 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52069 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52070 N0.getOperand(1));
52071 }
52072 }
52073
52074 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52075 return R;
52076
52077 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52078 return R;
52079
52080 return SDValue();
52081}
52082
52083// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52084static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52085                                     SelectionDAG &DAG,
52086 const X86Subtarget &Subtarget) {
52087 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52088
52089 MVT VT = N->getSimpleValueType(0);
52090 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52091 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52092 return SDValue();
52093
52094 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52095 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52096 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52097 return SDValue();
52098
52099 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52100 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52101 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52102 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52103 return SDValue();
52104
52105 // Attempt to extract constant byte masks.
52106 APInt UndefElts0, UndefElts1;
52107 SmallVector<APInt, 32> EltBits0, EltBits1;
52108 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52109 /*AllowWholeUndefs*/ false,
52110 /*AllowPartialUndefs*/ false))
52111 return SDValue();
52112 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52113 /*AllowWholeUndefs*/ false,
52114 /*AllowPartialUndefs*/ false))
52115 return SDValue();
52116
52117 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52118 // TODO - add UNDEF elts support.
52119 if (UndefElts0[i] || UndefElts1[i])
52120 return SDValue();
52121 if (EltBits0[i] != ~EltBits1[i])
52122 return SDValue();
52123 }
52124
52125 if (useVPTERNLOG(Subtarget, VT)) {
52126 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52127 // VPTERNLOG is only available as vXi32/64-bit types.
52128 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52129 MVT OpVT =
52130 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52131 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52132 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52133 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52134 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52135 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52136 DAG, Subtarget);
52137 return DAG.getBitcast(VT, Res);
52138 }
52139
52140 SDValue X = N->getOperand(0);
52141 SDValue Y =
52142 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52143 DAG.getBitcast(VT, N1.getOperand(0)));
52144 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52145}
52146
52147// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52148// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52149// Waiting for ANDNP combine allows other combines to happen that prevent
52150// matching.
52151static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52152 using namespace SDPatternMatch;
52153 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52154 m_And(m_Deferred(Mask), m_Value(Y))));
52155}
52156
52157// Try to fold:
52158// (or (and (m, y), (pandn m, x)))
52159// into:
52160// (vselect m, x, y)
52161// As a special case, try to fold:
52162// (or (and (m, (sub 0, x)), (pandn m, x)))
52163// into:
52164// (sub (xor X, M), M)
52165static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52166                                            SelectionDAG &DAG,
52167 const X86Subtarget &Subtarget) {
52168 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52169
52170 EVT VT = N->getValueType(0);
52171 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52172 (VT.is256BitVector() && Subtarget.hasInt256())))
52173 return SDValue();
52174
52175 SDValue X, Y, Mask;
52176 if (!matchLogicBlend(N, X, Y, Mask))
52177 return SDValue();
52178
52179 // Validate that X, Y, and Mask are bitcasts, and see through them.
52180 Mask = peekThroughBitcasts(Mask);
52181  X = peekThroughBitcasts(X);
52182  Y = peekThroughBitcasts(Y);
52183
52184 EVT MaskVT = Mask.getValueType();
52185 unsigned EltBits = MaskVT.getScalarSizeInBits();
52186
52187 // TODO: Attempt to handle floating point cases as well?
52188 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52189 return SDValue();
52190
52191 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52192 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52193 DAG, Subtarget))
52194 return Res;
52195
52196 // PBLENDVB is only available on SSE 4.1.
52197 if (!Subtarget.hasSSE41())
52198 return SDValue();
52199
52200 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52201 if (Subtarget.hasVLX())
52202 return SDValue();
52203
52204 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52205
52206 X = DAG.getBitcast(BlendVT, X);
52207 Y = DAG.getBitcast(BlendVT, Y);
52208 Mask = DAG.getBitcast(BlendVT, Mask);
52209 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52210 return DAG.getBitcast(VT, Mask);
52211}
52212
52213// Helper function for combineOrCmpEqZeroToCtlzSrl
52214// Transforms:
52215// seteq(cmp x, 0)
52216// into:
52217// srl(ctlz x), log2(bitsize(x))
52218// Input pattern is checked by caller.
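// For example, with a 32-bit x, ctlz(x) == 32 exactly when x == 0, so shifting
// the ctlz result right by log2(32) == 5 yields the seteq(x, 0) value.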
52219 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52220 SDValue Cmp = Op.getOperand(1);
52221 EVT VT = Cmp.getOperand(0).getValueType();
52222 unsigned Log2b = Log2_32(VT.getSizeInBits());
52223 SDLoc dl(Op);
52224 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52225 // The result of the shift is true or false, and on X86, the 32-bit
52226 // encoding of shr and lzcnt is more desirable.
52227 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52228 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52229 DAG.getConstant(Log2b, dl, MVT::i8));
52230 return Scc;
52231}
52232
52233// Try to transform:
52234// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52235// into:
52236 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52237// Will also attempt to match more generic cases, eg:
52238// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52239// Only applies if the target supports the FastLZCNT feature.
52240 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52241 TargetLowering::DAGCombinerInfo &DCI,
52242 const X86Subtarget &Subtarget) {
52243 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52244 return SDValue();
52245
52246 auto isORCandidate = [](SDValue N) {
52247 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52248 };
52249
52250 // Check the zero extend is extending to 32-bit or more. The code generated by
52251 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52252 // instructions to clear the upper bits.
52253 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52254 !isORCandidate(N->getOperand(0)))
52255 return SDValue();
52256
52257 // Check the node matches: setcc(eq, cmp 0)
52258 auto isSetCCCandidate = [](SDValue N) {
52259 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52260 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52261 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52262 isNullConstant(N->getOperand(1).getOperand(1)) &&
52263 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52264 };
52265
52266 SDNode *OR = N->getOperand(0).getNode();
52267 SDValue LHS = OR->getOperand(0);
52268 SDValue RHS = OR->getOperand(1);
52269
52270 // Save nodes matching or(or, setcc(eq, cmp 0)).
52271 SmallVector<SDNode *, 2> ORNodes;
52272 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52273 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52274 ORNodes.push_back(OR);
52275 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52276 LHS = OR->getOperand(0);
52277 RHS = OR->getOperand(1);
52278 }
52279
52280 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52281 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52282 !isORCandidate(SDValue(OR, 0)))
52283 return SDValue();
52284
52285 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52286 // to
52287 // or(srl(ctlz),srl(ctlz)).
52288 // The dag combiner can then fold it into:
52289 // srl(or(ctlz, ctlz)).
52290 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52291 SDValue Ret, NewRHS;
52292 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52293 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52294
52295 if (!Ret)
52296 return SDValue();
52297
52298 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52299 while (!ORNodes.empty()) {
52300 OR = ORNodes.pop_back_val();
52301 LHS = OR->getOperand(0);
52302 RHS = OR->getOperand(1);
52303 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52304 if (RHS->getOpcode() == ISD::OR)
52305 std::swap(LHS, RHS);
52306 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52307 if (!NewRHS)
52308 return SDValue();
52309 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52310 }
52311
52312 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52313}
52314
52315/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52316/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52317/// with CMP+{ADC, SBB}.
52318/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
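/// For example, unsigned 'x + (a < b)' becomes 'cmp a, b; adc x, 0' instead of
/// cmp+setb+movzx+add, and 'x - ((y >> 3) & 1)' becomes 'bt y, 3; sbb x, 0'.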
52319static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52320 SDValue X, SDValue Y,
52321 SelectionDAG &DAG,
52322 bool ZeroSecondOpOnly = false) {
52323 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52324 return SDValue();
52325
52326 // Look through a one-use zext.
52327 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52328 Y = Y.getOperand(0);
52329
52330 X86::CondCode CC;
52331 SDValue EFLAGS;
52332 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52333 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52334 EFLAGS = Y.getOperand(1);
52335 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52336 Y.hasOneUse()) {
52337 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52338 }
52339
52340 if (!EFLAGS)
52341 return SDValue();
52342
52343 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52344 // the general case below.
52345 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52346 if (ConstantX && !ZeroSecondOpOnly) {
52347 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52348 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52349 // This is a complicated way to get -1 or 0 from the carry flag:
52350 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52351 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52352 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52353 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52354 EFLAGS);
52355 }
52356
52357 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52358 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52359 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52360 EFLAGS.getValueType().isInteger() &&
52361 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52362 // Swap the operands of a SUB, and we have the same pattern as above.
52363 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52364 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52365 SDValue NewSub = DAG.getNode(
52366 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52367 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52368 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52369 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52370 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52371 NewEFLAGS);
52372 }
52373 }
52374 }
52375
52376 if (CC == X86::COND_B) {
52377 // X + SETB Z --> adc X, 0
52378 // X - SETB Z --> sbb X, 0
52379 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52380 DAG.getVTList(VT, MVT::i32), X,
52381 DAG.getConstant(0, DL, VT), EFLAGS);
52382 }
52383
52384 if (ZeroSecondOpOnly)
52385 return SDValue();
52386
52387 if (CC == X86::COND_A) {
52388 // Try to convert COND_A into COND_B in an attempt to facilitate
52389 // materializing "setb reg".
52390 //
52391 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52392 // cannot take an immediate as its first operand.
52393 //
52394 // If EFLAGS is from a CMP that compares the same operands as the earlier
52395 // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
52396 // SBB/ADC without creating a flipped SUB.
52397 if (EFLAGS.getOpcode() == X86ISD::CMP &&
52398 EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
52399 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52400 DAG.getVTList(VT, MVT::i32), X,
52401 DAG.getConstant(0, DL, VT), EFLAGS);
52402 }
52403
52404 if (EFLAGS.getOpcode() == X86ISD::SUB &&
52405 EFLAGS.getValueType().isInteger() &&
52406 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52407 // Only create NewSub if we know one of the folds will succeed to avoid
52408 // introducing a temporary node that may persist and affect one-use checks
52409 // below.
52410 if (EFLAGS.getNode()->hasOneUse()) {
52411 SDValue NewSub = DAG.getNode(
52412 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52413 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52414 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52415 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52416 DAG.getVTList(VT, MVT::i32), X,
52417 DAG.getConstant(0, DL, VT), NewEFLAGS);
52418 }
52419
52420 if (IsSub && X == EFLAGS.getValue(0)) {
52421 SDValue NewSub = DAG.getNode(
52422 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52423 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52424 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52425 return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
52426 EFLAGS.getOperand(0), EFLAGS.getOperand(1),
52427 NewEFLAGS);
52428 }
52429 }
52430 }
52431
52432 if (CC == X86::COND_AE) {
52433 // X + SETAE --> sbb X, -1
52434 // X - SETAE --> adc X, -1
52435 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52436 DAG.getVTList(VT, MVT::i32), X,
52437 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52438 }
52439
52440 if (CC == X86::COND_BE) {
52441 // X + SETBE --> sbb X, -1
52442 // X - SETBE --> adc X, -1
52443 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52444 // materializing "setae reg".
52445 //
52446 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52447 // cannot take an immediate as its first operand.
52448 //
52449 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52450 EFLAGS.getValueType().isInteger() &&
52451 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52452 SDValue NewSub =
52453 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52454 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52455 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52456 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52457 DAG.getVTList(VT, MVT::i32), X,
52458 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52459 }
52460 }
52461
52462 if (CC != X86::COND_E && CC != X86::COND_NE)
52463 return SDValue();
52464
52465 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52466 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52467 !EFLAGS.getOperand(0).getValueType().isInteger())
52468 return SDValue();
52469
52470 SDValue Z = EFLAGS.getOperand(0);
52471 EVT ZVT = Z.getValueType();
52472
52473 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52474 // the general case below.
52475 if (ConstantX) {
52476 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52477 // fake operands:
52478 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52479 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52480 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52481 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52482 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52483 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52484 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52485 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52486 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52487 SDValue(Neg.getNode(), 1));
52488 }
52489
52490 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52491 // with fake operands:
52492 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52493 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52494 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52495 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52496 SDValue One = DAG.getConstant(1, DL, ZVT);
52497 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52498 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52499 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52500 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52501 Cmp1.getValue(1));
52502 }
52503 }
52504
52505 // (cmp Z, 1) sets the carry flag if Z is 0.
52506 SDValue One = DAG.getConstant(1, DL, ZVT);
52507 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52508 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52509
52510 // Add the flags type for ADC/SBB nodes.
52511 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52512
52513 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52514 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52515 if (CC == X86::COND_NE)
52516 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52517 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52518
52519 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52520 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52521 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52522 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52523}
52524
52525/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52526/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52527/// with CMP+{ADC, SBB}.
52528 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52529 SelectionDAG &DAG) {
52530 bool IsSub = N->getOpcode() == ISD::SUB;
52531 SDValue X = N->getOperand(0);
52532 SDValue Y = N->getOperand(1);
52533 EVT VT = N->getValueType(0);
52534
52535 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52536 return ADCOrSBB;
52537
52538 // Commute and try again (negate the result for subtracts).
52539 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52540 if (IsSub)
52541 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52542 return ADCOrSBB;
52543 }
52544
52545 return SDValue();
52546}
52547
52548static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52549 SDValue N0, SDValue N1,
52550 SelectionDAG &DAG) {
52551 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52552
52553 // Delegate to combineAddOrSubToADCOrSBB if we have:
52554 //
52555 // (xor/or (zero_extend (setcc)) imm)
52556 //
52557 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52558 // equivalent to a SUB/ADD, respectively.
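// This works because zext(setcc) is 0 or 1: with an odd imm, the xor only
// toggles bit 0, so (zext(setcc) ^ imm) == imm - zext(setcc); with an even
// imm, (zext(setcc) | imm) == imm + zext(setcc).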
52559 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52560 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52561 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52562 bool IsSub = Opc == ISD::XOR;
52563 bool N1COdd = N1C->getZExtValue() & 1;
52564 if (IsSub ? N1COdd : !N1COdd)
52565 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52566 return R;
52567 }
52568 }
52569
52570 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
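// (X & CstPow2) can only be 0 or CstPow2, so "not equal to 0" is the same
// predicate as "equal to CstPow2", which lets us drop the extra 'not'.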
52571 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52572 N0.getOperand(0).getOpcode() == ISD::AND &&
52573 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52574 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52575 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52576 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52577 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52578 N0.getOperand(0).getOperand(1));
52579 }
52580
52581 return SDValue();
52582}
52583
52584 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52585 TargetLowering::DAGCombinerInfo &DCI,
52586 const X86Subtarget &Subtarget) {
52587 SDValue N0 = N->getOperand(0);
52588 SDValue N1 = N->getOperand(1);
52589 EVT VT = N->getValueType(0);
52590 SDLoc dl(N);
52591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52592
52593 // If this is SSE1 only convert to FOR to avoid scalarization.
52594 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52595 return DAG.getBitcast(MVT::v4i32,
52596 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52597 DAG.getBitcast(MVT::v4f32, N0),
52598 DAG.getBitcast(MVT::v4f32, N1)));
52599 }
52600
52601 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52602 // TODO: Support multiple SrcOps.
52603 if (VT == MVT::i1) {
52604 SmallVector<SDValue, 2> SrcOps;
52605 SmallVector<APInt, 2> SrcPartials;
52606 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52607 SrcOps.size() == 1) {
52608 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52609 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52610 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52611 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52612 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52613 if (Mask) {
52614 assert(SrcPartials[0].getBitWidth() == NumElts &&
52615 "Unexpected partial reduction mask");
52616 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52617 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52618 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52619 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52620 }
52621 }
52622 }
52623
52624 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52625 return SetCC;
52626
52627 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52628 return R;
52629
52630 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52631 return R;
52632
52633 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52634 return R;
52635
52636 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52637 DAG, DCI, Subtarget))
52638 return FPLogic;
52639
52640 if (DCI.isBeforeLegalizeOps())
52641 return SDValue();
52642
52643 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52644 return R;
52645
52646 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52647 return R;
52648
52649 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52650 return R;
52651
52652 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52653 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
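// Both forms produce 0 or -1, so OR'ing with C yields either C or -1, and
// (zext (not SetCC)) * (C + 1) - 1 gives the same two values; e.g. for C == 3:
// SetCC ? -1 : 3 == (SetCC ? 0 : 1) * 4 - 1.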
52654 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52655 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52656 uint64_t Val = CN->getZExtValue();
52657 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52658 Val == 8) {
52659 SDValue NotCond;
52660 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52661 N0.getOperand(1).hasOneUse()) {
52662 X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52663 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52664 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52665 } else if (N0.getOpcode() == ISD::SUB &&
52666 isNullConstant(N0.getOperand(0))) {
52667 SDValue Cond = N0.getOperand(1);
52668 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52669 Cond = Cond.getOperand(0);
52670 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52671 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52672 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52673 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52674 }
52675 }
52676
52677 if (NotCond) {
52678 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52679 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52680 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52681 return R;
52682 }
52683 }
52684 }
52685 }
52686
52687 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52688 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52689 // iff the upper elements of the non-shifted arg are zero.
52690 // KUNPCK requires 16+ bool vector elements.
52691 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52692 unsigned NumElts = VT.getVectorNumElements();
52693 unsigned HalfElts = NumElts / 2;
52694 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52695 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52696 N1.getConstantOperandAPInt(1) == HalfElts &&
52697 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52698 return DAG.getNode(
52699 ISD::CONCAT_VECTORS, dl, VT,
52700 extractSubVector(N0, 0, DAG, dl, HalfElts),
52701 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52702 }
52703 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52704 N0.getConstantOperandAPInt(1) == HalfElts &&
52705 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52706 return DAG.getNode(
52707 ISD::CONCAT_VECTORS, dl, VT,
52708 extractSubVector(N1, 0, DAG, dl, HalfElts),
52709 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52710 }
52711 }
52712
52713 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52714 // Attempt to recursively combine an OR of shuffles.
52715 SDValue Op(N, 0);
52716 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52717 return Res;
52718
52719 // If either operand is a constant mask, then only the elements that aren't
52720 // allones are actually demanded by the other operand.
52721 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52722 APInt UndefElts;
52723 SmallVector<APInt> EltBits;
52724 int NumElts = VT.getVectorNumElements();
52725 int EltSizeInBits = VT.getScalarSizeInBits();
52726 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52727 return false;
52728
52729 APInt DemandedElts = APInt::getZero(NumElts);
52730 for (int I = 0; I != NumElts; ++I)
52731 if (!EltBits[I].isAllOnes())
52732 DemandedElts.setBit(I);
52733
52734 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52735 };
52736 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52737 if (N->getOpcode() != ISD::DELETED_NODE)
52738 DCI.AddToWorklist(N);
52739 return SDValue(N, 0);
52740 }
52741 }
52742
52743 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52744 return R;
52745
52746 return SDValue();
52747}
52748
52749/// Try to turn tests against the signbit in the form of:
52750/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52751/// into:
52752/// SETGT(X, -1)
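/// For a 32-bit X this rewrites ((X >> 31) ^ 1), i.e. "X is non-negative",
/// as the single comparison X > -1.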
52753 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52754 SelectionDAG &DAG) {
52755 // This is only worth doing if the output type is i8 or i1.
52756 EVT ResultType = N->getValueType(0);
52757 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52758 return SDValue();
52759
52760 SDValue N0 = N->getOperand(0);
52761 SDValue N1 = N->getOperand(1);
52762
52763 // We should be performing an xor against a truncated shift.
52764 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52765 return SDValue();
52766
52767 // Make sure we are performing an xor against one.
52768 if (!isOneConstant(N1))
52769 return SDValue();
52770
52771 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52772 SDValue Shift = N0.getOperand(0);
52773 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52774 return SDValue();
52775
52776 // Make sure we are truncating from one of i16, i32 or i64.
52777 EVT ShiftTy = Shift.getValueType();
52778 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52779 return SDValue();
52780
52781 // Make sure the shift amount extracts the sign bit.
52782 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52783 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52784 return SDValue();
52785
52786 // Create a greater-than comparison against -1.
52787 // N.B. Using SETGE against 0 works but we want a canonical looking
52788 // comparison; using SETGT matches up with what TranslateX86CC does.
52789 SDValue ShiftOp = Shift.getOperand(0);
52790 EVT ShiftOpTy = ShiftOp.getValueType();
52791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52792 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52793 *DAG.getContext(), ResultType);
52794 SDValue Cond =
52795 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52796 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52797 if (SetCCResultType != ResultType)
52798 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52799 return Cond;
52800}
52801
52802/// Turn vector tests of the signbit in the form of:
52803/// xor (sra X, elt_size(X)-1), -1
52804/// into:
52805/// pcmpgt X, -1
52806///
52807/// This should be called before type legalization because the pattern may not
52808/// persist after that.
52809 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52810 const X86Subtarget &Subtarget) {
52811 EVT VT = N->getValueType(0);
52812 if (!VT.isSimple())
52813 return SDValue();
52814
52815 switch (VT.getSimpleVT().SimpleTy) {
52816 // clang-format off
52817 default: return SDValue();
52818 case MVT::v16i8:
52819 case MVT::v8i16:
52820 case MVT::v4i32:
52821 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52822 case MVT::v32i8:
52823 case MVT::v16i16:
52824 case MVT::v8i32:
52825 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52826 // clang-format on
52827 }
52828
52829 // There must be a shift right algebraic before the xor, and the xor must be a
52830 // 'not' operation.
52831 SDValue Shift = N->getOperand(0);
52832 SDValue Ones = N->getOperand(1);
52833 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52834 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52835 return SDValue();
52836
52837 // The shift should be smearing the sign bit across each vector element.
52838 auto *ShiftAmt =
52839 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52840 if (!ShiftAmt ||
52841 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52842 return SDValue();
52843
52844 // Create a greater-than comparison against -1. We don't use the more obvious
52845 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52846 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52847}
52848
52849/// Detect patterns of truncation with unsigned saturation:
52850///
52851/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52852/// Return the source value x to be truncated or SDValue() if the pattern was
52853/// not matched.
52854///
52855/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52856/// where C1 >= 0 and C2 is unsigned max of destination type.
52857///
52858/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52859/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52860///
52861/// These two patterns are equivalent to:
52862/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52863/// So return the smax(x, C1) value to be truncated or SDValue() if the
52864/// pattern was not matched.
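/// For example, for an i32 -> i8 truncation, pattern 1 is
/// (truncate (umin (x, 255))) and pattern 2 is
/// (truncate (smin (smax (x, C1), 255))) with C1 >= 0.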
52865 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52866 const SDLoc &DL) {
52867 using namespace llvm::SDPatternMatch;
52868 EVT InVT = In.getValueType();
52869
52870 // Saturation with truncation. We truncate from InVT to VT.
52871 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52872 "Unexpected types for truncate operation");
52873
52874 APInt C1, C2;
52875 SDValue UMin, SMin, SMax;
52876
52877 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52878 // the element size of the destination type.
52879 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52880 C2.isMask(VT.getScalarSizeInBits()))
52881 return UMin;
52882
52883 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52884 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52885 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52886 return SMin;
52887
52888 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52889 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52890 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52891 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52892
52893 return SDValue();
52894}
52895
52896/// Detect patterns of truncation with signed saturation:
52897/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52898/// signed_max_of_dest_type)) to dest_type)
52899/// or:
52900/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52901/// signed_min_of_dest_type)) to dest_type).
52902/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52903/// Return the source value to be truncated or SDValue() if the pattern was not
52904/// matched.
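/// For example, an i32 -> i8 signed-saturating truncation matches
/// (truncate (smin (smax (x, -128), 127))); with MatchPackUS the clamp
/// range is [0, 255] instead.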
52905static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52906 using namespace llvm::SDPatternMatch;
52907 unsigned NumDstBits = VT.getScalarSizeInBits();
52908 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52909 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52910
52911 APInt SignedMax, SignedMin;
52912 if (MatchPackUS) {
52913 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52914 SignedMin = APInt::getZero(NumSrcBits);
52915 } else {
52916 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52917 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52918 }
52919
52920 SDValue SMin, SMax;
52921 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52922 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52923 return SMax;
52924
52925 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52926 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52927 return SMin;
52928
52929 return SDValue();
52930}
52931
52932 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52933 SelectionDAG &DAG,
52934 const X86Subtarget &Subtarget) {
52935 if (!Subtarget.hasSSE2() || !VT.isVector())
52936 return SDValue();
52937
52938 EVT SVT = VT.getVectorElementType();
52939 EVT InVT = In.getValueType();
52940 EVT InSVT = InVT.getVectorElementType();
52941
52942 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52943 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52944 // and concatenate at the same time. Then we can use a final vpmovuswb to
52945 // clip to 0-255.
52946 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52947 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52948 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52949 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52950 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52951 DL, DAG, Subtarget);
52952 assert(Mid && "Failed to pack!");
52953 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52954 }
52955 }
52956
52957 // vXi32 truncate instructions are available with AVX512F.
52958 // vXi16 truncate instructions are only available with AVX512BW.
52959 // For 256-bit or smaller vectors, we require VLX.
52960 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52961 // If the result type is 256-bits or larger and we have disabled 512-bit
52962 // registers, we should go ahead and use the pack instructions if possible.
52963 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52964 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52965 (InVT.getSizeInBits() > 128) &&
52966 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52967 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52968
52969 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52970 isPowerOf2_32(VT.getVectorNumElements()) &&
52971 (SVT == MVT::i8 || SVT == MVT::i16) &&
52972 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52973 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52974 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52975 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52976 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52977 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52978 DAG, Subtarget);
52979 assert(Mid && "Failed to pack!");
52980 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52981 Subtarget);
52982 assert(V && "Failed to pack!");
52983 return V;
52984 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52985 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52986 Subtarget);
52987 }
52988 if (SDValue SSatVal = detectSSatPattern(In, VT))
52989 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52990 Subtarget);
52991 }
52992
52993 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52994 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52995 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52996 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52997 unsigned TruncOpc = 0;
52998 SDValue SatVal;
52999 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
53000 SatVal = SSatVal;
53001 TruncOpc = X86ISD::VTRUNCS;
53002 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
53003 SatVal = USatVal;
53004 TruncOpc = X86ISD::VTRUNCUS;
53005 }
53006 if (SatVal) {
53007 unsigned ResElts = VT.getVectorNumElements();
53008 // If the input type is less than 512 bits and we don't have VLX, we need
53009 // to widen to 512 bits.
53010 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
53011 unsigned NumConcats = 512 / InVT.getSizeInBits();
53012 ResElts *= NumConcats;
53013 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
53014 ConcatOps[0] = SatVal;
53015 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
53016 NumConcats * InVT.getVectorNumElements());
53017 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
53018 }
53019 // Widen the result if it's narrower than 128 bits.
53020 if (ResElts * SVT.getSizeInBits() < 128)
53021 ResElts = 128 / SVT.getSizeInBits();
53022 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
53023 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
53024 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
53025 DAG.getVectorIdxConstant(0, DL));
53026 }
53027 }
53028
53029 return SDValue();
53030}
53031
53032 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53033 SelectionDAG &DAG,
53034 TargetLowering::DAGCombinerInfo &DCI,
53035 const X86Subtarget &Subtarget) {
53036 auto *Ld = cast<LoadSDNode>(N);
53037 EVT RegVT = Ld->getValueType(0);
53038 SDValue Ptr = Ld->getBasePtr();
53039 SDValue Chain = Ld->getChain();
53040 ISD::LoadExtType Ext = Ld->getExtensionType();
53041
53042 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53043 return SDValue();
53044
53045 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53046 return SDValue();
53047
53048 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53049 if (!LdC)
53050 return SDValue();
53051
53052 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53053 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53054 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53055 if (Undefs[I])
53056 continue;
53057 if (UserUndefs[I] || Bits[I] != UserBits[I])
53058 return false;
53059 }
53060 return true;
53061 };
53062
53063 // Look through all other loads/broadcasts in the chain for another constant
53064 // pool entry.
53065 for (SDNode *User : Chain->users()) {
53066 auto *UserLd = dyn_cast<MemSDNode>(User);
53067 if (User != N && UserLd &&
53068 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53069 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53070 ISD::isNormalLoad(User)) &&
53071 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53072 User->getValueSizeInBits(0).getFixedValue() >
53073 RegVT.getFixedSizeInBits()) {
53074 EVT UserVT = User->getValueType(0);
53075 SDValue UserPtr = UserLd->getBasePtr();
53076 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53077
53078 // See if we are loading a constant that matches in the lower
53079 // bits of a longer constant (but from a different constant pool ptr).
53080 if (UserC && UserPtr != Ptr) {
53081 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53082 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53083 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53084 APInt Undefs, UserUndefs;
53085 SmallVector<APInt> Bits, UserBits;
53086 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53087 UserVT.getScalarSizeInBits());
53088 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53089 Bits) &&
53090 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53091 UserUndefs, UserBits)) {
53092 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53094 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53095 RegVT.getSizeInBits());
53096 Extract = DAG.getBitcast(RegVT, Extract);
53097 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53098 }
53099 }
53100 }
53101 }
53102 }
53103 }
53104
53105 return SDValue();
53106}
53107
53108 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53109 TargetLowering::DAGCombinerInfo &DCI,
53110 const X86Subtarget &Subtarget) {
53111 auto *Ld = cast<LoadSDNode>(N);
53112 EVT RegVT = Ld->getValueType(0);
53113 EVT MemVT = Ld->getMemoryVT();
53114 SDLoc dl(Ld);
53115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53116
53117 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53118 // into two 16-byte operations. Also split non-temporal aligned loads on
53119 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53120 ISD::LoadExtType Ext = Ld->getExtensionType();
53121 unsigned Fast;
53122 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53123 Ext == ISD::NON_EXTLOAD &&
53124 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53125 Ld->getAlign() >= Align(16)) ||
53126 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53127 *Ld->getMemOperand(), &Fast) &&
53128 !Fast))) {
53129 unsigned NumElems = RegVT.getVectorNumElements();
53130 if (NumElems < 2)
53131 return SDValue();
53132
53133 unsigned HalfOffset = 16;
53134 SDValue Ptr1 = Ld->getBasePtr();
53135 SDValue Ptr2 =
53136 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53137 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53138 NumElems / 2);
53139 SDValue Load1 =
53140 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53141 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53142 SDValue Load2 =
53143 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53144 Ld->getPointerInfo().getWithOffset(HalfOffset),
53145 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53146 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53147 Load1.getValue(1), Load2.getValue(1));
53148
53149 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53150 return DCI.CombineTo(N, NewVec, TF, true);
53151 }
53152
53153 // Bool vector load - attempt to cast to an integer, as we have good
53154 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53155 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53156 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53157 unsigned NumElts = RegVT.getVectorNumElements();
53158 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53159 if (TLI.isTypeLegal(IntVT)) {
53160 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53161 Ld->getPointerInfo(), Ld->getBaseAlign(),
53162 Ld->getMemOperand()->getFlags());
53163 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53164 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53165 }
53166 }
53167
53168 // If we also broadcast this vector to a wider type, then just extract the
53169 // lowest subvector.
53170 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53171 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53172 SDValue Ptr = Ld->getBasePtr();
53173 SDValue Chain = Ld->getChain();
53174 for (SDNode *User : Chain->users()) {
53175 auto *UserLd = dyn_cast<MemSDNode>(User);
53176 if (User != N && UserLd &&
53177 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53178 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53179 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53180 User->hasAnyUseOfValue(0) &&
53181 User->getValueSizeInBits(0).getFixedValue() >
53182 RegVT.getFixedSizeInBits()) {
53184 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53185 RegVT.getSizeInBits());
53186 Extract = DAG.getBitcast(RegVT, Extract);
53187 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53188 }
53189 }
53190 }
53191
53192 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53193 return V;
53194
53195 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53196 unsigned AddrSpace = Ld->getAddressSpace();
53197 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53198 AddrSpace == X86AS::PTR32_UPTR) {
53199 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53200 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53201 SDValue Cast =
53202 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53203 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53204 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53205 Ld->getMemOperand()->getFlags());
53206 }
53207 }
53208
53209 return SDValue();
53210}
53211
53212/// If V is a build vector of boolean constants and exactly one of those
53213/// constants is true, return the operand index of that true element.
53214/// Otherwise, return -1.
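/// For example, <i1 0, i1 0, i1 1, i1 0> returns 2, while <i1 1, i1 1, i1 0, i1 0>
/// and an all-zeros vector both return -1.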
53215static int getOneTrueElt(SDValue V) {
53216 // This needs to be a build vector of booleans.
53217 // TODO: Checking for the i1 type matches the IR definition for the mask,
53218 // but the mask check could be loosened to i8 or other types. That might
53219 // also require checking more than 'allOnesValue'; eg, the x86 HW
53220 // instructions only require that the MSB is set for each mask element.
53221 // The ISD::MSTORE comments/definition do not specify how the mask operand
53222 // is formatted.
53223 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53224 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53225 return -1;
53226
53227 int TrueIndex = -1;
53228 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53229 for (unsigned i = 0; i < NumElts; ++i) {
53230 const SDValue &Op = BV->getOperand(i);
53231 if (Op.isUndef())
53232 continue;
53233 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53234 if (!ConstNode)
53235 return -1;
53236 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53237 // If we already found a one, this is too many.
53238 if (TrueIndex >= 0)
53239 return -1;
53240 TrueIndex = i;
53241 }
53242 }
53243 return TrueIndex;
53244}
53245
53246/// Given a masked memory load/store operation, return true if it has one mask
53247/// bit set. If it has one mask bit set, then also return the memory address of
53248/// the scalar element to load/store, the vector index to insert/extract that
53249/// scalar element, and the alignment for the scalar memory access.
53250 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53251 SelectionDAG &DAG, SDValue &Addr,
53252 SDValue &Index, Align &Alignment,
53253 unsigned &Offset) {
53254 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53255 if (TrueMaskElt < 0)
53256 return false;
53257
53258 // Get the address of the one scalar element that is specified by the mask
53259 // using the appropriate offset from the base pointer.
53260 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53261 Offset = 0;
53262 Addr = MaskedOp->getBasePtr();
53263 if (TrueMaskElt != 0) {
53264 Offset = TrueMaskElt * EltVT.getStoreSize();
53265 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53266 SDLoc(MaskedOp));
53267 }
53268
53269 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53270 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53271 return true;
53272}
53273
53274/// If exactly one element of the mask is set for a non-extending masked load,
53275/// it is a scalar load and vector insert.
53276/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53277/// mask have already been optimized in IR, so we don't bother with those here.
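/// For example, a masked load of <4 x i32> with mask <0,0,1,0> becomes a
/// 4-byte scalar load at base+8 whose result is inserted into the pass-through
/// vector at index 2.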
53278static SDValue
53279 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53280 TargetLowering::DAGCombinerInfo &DCI,
53281 const X86Subtarget &Subtarget) {
53282 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53283 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53284 // However, some target hooks may need to be added to know when the transform
53285 // is profitable. Endianness would also have to be considered.
53286
53287 SDValue Addr, VecIndex;
53288 Align Alignment;
53289 unsigned Offset;
53290 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53291 return SDValue();
53292
53293 // Load the one scalar element that is specified by the mask using the
53294 // appropriate offset from the base pointer.
53295 SDLoc DL(ML);
53296 EVT VT = ML->getValueType(0);
53297 EVT EltVT = VT.getVectorElementType();
53298
53299 EVT CastVT = VT;
53300 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53301 EltVT = MVT::f64;
53302 CastVT = VT.changeVectorElementType(EltVT);
53303 }
53304
53305 SDValue Load =
53306 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53307 ML->getPointerInfo().getWithOffset(Offset),
53308 Alignment, ML->getMemOperand()->getFlags());
53309
53310 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53311
53312 // Insert the loaded element into the appropriate place in the vector.
53313 SDValue Insert =
53314 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53315 Insert = DAG.getBitcast(VT, Insert);
53316 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53317}
53318
53319static SDValue
53320 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53321 TargetLowering::DAGCombinerInfo &DCI) {
53322 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53323 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53324 return SDValue();
53325
53326 SDLoc DL(ML);
53327 EVT VT = ML->getValueType(0);
53328
53329 // If we are loading the first and last elements of a vector, it is safe and
53330 // always faster to load the whole vector. Replace the masked load with a
53331 // vector load and select.
53332 unsigned NumElts = VT.getVectorNumElements();
53333 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53334 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53335 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53336 if (LoadFirstElt && LoadLastElt) {
53337 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53338 ML->getMemOperand());
53339 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53340 ML->getPassThru());
53341 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53342 }
53343
53344 // Convert a masked load with a constant mask into a masked load and a select.
53345 // This allows the select operation to use a faster kind of select instruction
53346 // (for example, vblendvps -> vblendps).
53347
53348 // Don't try this if the pass-through operand is already undefined. That would
53349 // cause an infinite loop because that's what we're about to create.
53350 if (ML->getPassThru().isUndef())
53351 return SDValue();
53352
53353 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53354 return SDValue();
53355
53356 // The new masked load has an undef pass-through operand. The select uses the
53357 // original pass-through operand.
53358 SDValue NewML = DAG.getMaskedLoad(
53359 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53360 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53361 ML->getAddressingMode(), ML->getExtensionType());
53362 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53363 ML->getPassThru());
53364
53365 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53366}
53367
53368 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53369 TargetLowering::DAGCombinerInfo &DCI,
53370 const X86Subtarget &Subtarget) {
53371 auto *Mld = cast<MaskedLoadSDNode>(N);
53372
53373 // TODO: Expanding load with constant mask may be optimized as well.
53374 if (Mld->isExpandingLoad())
53375 return SDValue();
53376
53377 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53378 if (SDValue ScalarLoad =
53379 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53380 return ScalarLoad;
53381
53382 // TODO: Do some AVX512 subsets benefit from this transform?
53383 if (!Subtarget.hasAVX512())
53384 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53385 return Blend;
53386 }
53387
53388 // If the mask value has been legalized to a non-boolean vector, try to
53389 // simplify ops leading up to it. We only demand the MSB of each lane.
53390 SDValue Mask = Mld->getMask();
53391 if (Mask.getScalarValueSizeInBits() != 1) {
53392 EVT VT = Mld->getValueType(0);
53393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53394 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53395 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53396 if (N->getOpcode() != ISD::DELETED_NODE)
53397 DCI.AddToWorklist(N);
53398 return SDValue(N, 0);
53399 }
53400 if (SDValue NewMask =
53401 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53402 return DAG.getMaskedLoad(
53403 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53404 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53405 Mld->getAddressingMode(), Mld->getExtensionType());
53406 }
53407
53408 return SDValue();
53409}
53410
53411/// If exactly one element of the mask is set for a non-truncating masked store,
53412/// it is a vector extract and scalar store.
53413/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53414/// mask have already been optimized in IR, so we don't bother with those here.
53415 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53416 SelectionDAG &DAG,
53417 const X86Subtarget &Subtarget) {
53418 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53419 // However, some target hooks may need to be added to know when the transform
53420 // is profitable. Endianness would also have to be considered.
53421
53422 SDValue Addr, VecIndex;
53423 Align Alignment;
53424 unsigned Offset;
53425 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53426 return SDValue();
53427
53428 // Extract the one scalar element that is actually being stored.
53429 SDLoc DL(MS);
53430 SDValue Value = MS->getValue();
53431 EVT VT = Value.getValueType();
53432 EVT EltVT = VT.getVectorElementType();
53433 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53434 EltVT = MVT::f64;
53435 EVT CastVT = VT.changeVectorElementType(EltVT);
53436 Value = DAG.getBitcast(CastVT, Value);
53437 }
53438 SDValue Extract =
53439 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53440
53441 // Store that element at the appropriate offset from the base pointer.
53442 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53443 MS->getPointerInfo().getWithOffset(Offset),
53444 Alignment, MS->getMemOperand()->getFlags());
53445}
53446
53447 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53448 TargetLowering::DAGCombinerInfo &DCI,
53449 const X86Subtarget &Subtarget) {
53450 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53451 if (Mst->isCompressingStore())
53452 return SDValue();
53453
53454 EVT VT = Mst->getValue().getValueType();
53455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53456
53457 if (Mst->isTruncatingStore())
53458 return SDValue();
53459
53460 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53461 return ScalarStore;
53462
53463 // If the mask value has been legalized to a non-boolean vector, try to
53464 // simplify ops leading up to it. We only demand the MSB of each lane.
53465 SDValue Mask = Mst->getMask();
53466 if (Mask.getScalarValueSizeInBits() != 1) {
53467 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53468 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53469 if (N->getOpcode() != ISD::DELETED_NODE)
53470 DCI.AddToWorklist(N);
53471 return SDValue(N, 0);
53472 }
53473 if (SDValue NewMask =
53474 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53475 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53476 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53477 Mst->getMemoryVT(), Mst->getMemOperand(),
53478 Mst->getAddressingMode());
53479 }
53480
53481 SDValue Value = Mst->getValue();
53482 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53483 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53484 Mst->getMemoryVT())) {
53485 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53486 Mst->getBasePtr(), Mst->getOffset(), Mask,
53487 Mst->getMemoryVT(), Mst->getMemOperand(),
53488 Mst->getAddressingMode(), true);
53489 }
53490
53491 return SDValue();
53492}
53493
53494 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53495 TargetLowering::DAGCombinerInfo &DCI,
53496 const X86Subtarget &Subtarget) {
53497 StoreSDNode *St = cast<StoreSDNode>(N);
53498 EVT StVT = St->getMemoryVT();
53499 SDLoc dl(St);
53500 SDValue StoredVal = St->getValue();
53501 EVT VT = StoredVal.getValueType();
53502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53503
53504 // Convert a store of vXi1 into a store of iX and a bitcast.
53505 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53506 VT.getVectorElementType() == MVT::i1) {
53507
53508 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53509 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53510
53511 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53512 St->getPointerInfo(), St->getBaseAlign(),
53513 St->getMemOperand()->getFlags());
53514 }
53515
53516 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53517 // This will avoid a copy to k-register.
53518 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53519 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53520 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53521 SDValue Val = StoredVal.getOperand(0);
53522 // We must store zeros to the unused bits.
53523 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53524 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53525 St->getPointerInfo(), St->getBaseAlign(),
53526 St->getMemOperand()->getFlags());
53527 }
53528
53529 // Widen v2i1/v4i1 stores to v8i1.
53530 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53531 Subtarget.hasAVX512()) {
53532 unsigned NumConcats = 8 / VT.getVectorNumElements();
53533 // We must store zeros to the unused bits.
53534 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53535 Ops[0] = StoredVal;
53536 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53537 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53538 St->getPointerInfo(), St->getBaseAlign(),
53539 St->getMemOperand()->getFlags());
53540 }
53541
53542 // Turn vXi1 stores of constants into a scalar store.
53543 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53544 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53545 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53546 // If it's a v64i1 store without 64-bit support, we need two stores.
53547 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53548 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53549 StoredVal->ops().slice(0, 32));
53550 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53551 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53552 StoredVal->ops().slice(32, 32));
53553 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53554 
53555 SDValue Ptr0 = St->getBasePtr();
53556 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53557
53558 SDValue Ch0 =
53559 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53560 St->getBaseAlign(), St->getMemOperand()->getFlags());
53561 SDValue Ch1 = DAG.getStore(
53562 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53563 St->getBaseAlign(), St->getMemOperand()->getFlags());
53564 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53565 }
53566
53567 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53568 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53569 St->getPointerInfo(), St->getBaseAlign(),
53570 St->getMemOperand()->getFlags());
53571 }
53572
53573 // Convert scalar fabs/fneg load-store to integer equivalents.
53574 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53575 (StoredVal.getOpcode() == ISD::FABS ||
53576 StoredVal.getOpcode() == ISD::FNEG) &&
53577 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53578 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53579 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53580 if (TLI.isTypeLegal(IntVT)) {
53581 APInt SignMask = APInt::getSignMask(VT.getSizeInBits());
53582 unsigned SignOp = ISD::XOR;
53583 if (StoredVal.getOpcode() == ISD::FABS) {
53584 SignMask = ~SignMask;
53585 SignOp = ISD::AND;
53586 }
53587 SDValue LogicOp = DAG.getNode(
53588 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53589 DAG.getConstant(SignMask, dl, IntVT));
53590 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53591 St->getPointerInfo(), St->getBaseAlign(),
53592 St->getMemOperand()->getFlags());
53593 }
53594 }
53595
53596 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53597 // Sandy Bridge, perform two 16-byte stores.
53598 unsigned Fast;
53599 if (VT.is256BitVector() && StVT == VT &&
53600 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53601 *St->getMemOperand(), &Fast) &&
53602 !Fast) {
53603 unsigned NumElems = VT.getVectorNumElements();
53604 if (NumElems < 2)
53605 return SDValue();
53606
53607 return splitVectorStore(St, DAG);
53608 }
53609
53610 // Split under-aligned vector non-temporal stores.
53611 if (St->isNonTemporal() && StVT == VT &&
53612 St->getAlign().value() < VT.getStoreSize()) {
53613 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53614 // vectors or the legalizer can scalarize it to use MOVNTI.
53615 if (VT.is256BitVector() || VT.is512BitVector()) {
53616 unsigned NumElems = VT.getVectorNumElements();
53617 if (NumElems < 2)
53618 return SDValue();
53619 return splitVectorStore(St, DAG);
53620 }
53621
53622 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53623 // to use MOVNTI.
53624 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53625 MVT NTVT = Subtarget.hasSSE4A()
53626 ? MVT::v2f64
53627 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53628 return scalarizeVectorStore(St, NTVT, DAG);
53629 }
53630 }
53631
53632 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53633 // supported, but AVX512F is, by extending to v16i32 and truncating.
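// i.e. (store (v16i8 trunc X:v16i16)) is rewritten below as a truncating
// store of (v16i32 any_extend X), which AVX512F can emit as a single
// v16i32->v16i8 truncating store (illustrative sketch of the code below).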
53634 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53635 St->getValue().getOpcode() == ISD::TRUNCATE &&
53636 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53637 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53638 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53639 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53640 St->getValue().getOperand(0));
53641 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53642 MVT::v16i8, St->getMemOperand());
53643 }
53644
53645 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53646 if (!St->isTruncatingStore() &&
53647 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53648 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53649 StoredVal.hasOneUse() &&
53650 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53651 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53652 return EmitTruncSStore(IsSigned, St->getChain(),
53653 dl, StoredVal.getOperand(0), St->getBasePtr(),
53654 VT, St->getMemOperand(), DAG);
53655 }
53656
53657 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53658 if (!St->isTruncatingStore()) {
53659 auto IsExtractedElement = [](SDValue V) {
53660 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53661 V = V.getOperand(0);
53662 unsigned Opc = V.getOpcode();
53663 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53664 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53665 V.getOperand(0).hasOneUse())
53666 return V.getOperand(0);
53667 return SDValue();
53668 };
53669 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53670 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53671 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53672 SDValue Src = Trunc.getOperand(0);
53673 MVT DstVT = Trunc.getSimpleValueType();
53674 MVT SrcVT = Src.getSimpleValueType();
53675 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53676 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53677 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53678 if (NumTruncBits == VT.getSizeInBits() &&
53679 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53680 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53681 TruncVT, St->getMemOperand());
53682 }
53683 }
53684 }
53685 }
53686
53687 // Optimize trunc store (of multiple scalars) to shuffle and store.
53688 // First, pack all of the elements in one place. Next, store to memory
53689 // in fewer chunks.
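// For example, a truncating store whose value detectSSatPattern recognises
// as clamped to the signed range of the memory type is emitted below as a
// signed-saturating truncating store (sketch; bounds depend on the memory VT).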
53690 if (St->isTruncatingStore() && VT.isVector()) {
53691 if (TLI.isTruncStoreLegal(VT, StVT)) {
53692 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53693 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53694 dl, Val, St->getBasePtr(),
53695 St->getMemoryVT(), St->getMemOperand(), DAG);
53696 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53697 DAG, dl))
53698 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53699 dl, Val, St->getBasePtr(),
53700 St->getMemoryVT(), St->getMemOperand(), DAG);
53701 }
53702
53703 return SDValue();
53704 }
53705
53706 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53707 unsigned AddrSpace = St->getAddressSpace();
53708 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53709 AddrSpace == X86AS::PTR32_UPTR) {
53710 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53711 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53712 SDValue Cast =
53713 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53714 return DAG.getTruncStore(
53715 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53716 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53717 }
53718 }
53719
53720 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53721 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53722 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53723 Subtarget.hasCF() && St->isSimple()) {
53724 SDValue Cmov;
53725 if (StoredVal.getOpcode() == X86ISD::CMOV)
53726 Cmov = StoredVal;
53727 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53728 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53729 Cmov = StoredVal.getOperand(0);
53730 else
53731 return SDValue();
53732
53733 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53734 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53735 return SDValue();
53736
53737 bool InvertCC = false;
53738 SDValue V = SDValue(Ld, 0);
53739 if (V == Cmov.getOperand(1))
53740 InvertCC = true;
53741 else if (V != Cmov.getOperand(0))
53742 return SDValue();
53743
53744 SDVTList Tys = DAG.getVTList(MVT::Other);
53745 SDValue CC = Cmov.getOperand(2);
53746 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53747 if (InvertCC)
53748 CC = DAG.getTargetConstant(
53749 X86::GetOppositeBranchCondition(
53750 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53751 dl, MVT::i8);
53752 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53753 Cmov.getOperand(3)};
53754 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53755 St->getMemOperand());
53756 }
53757
53758 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53759 // the FP state in cases where an emms may be missing.
53760 // A preferable solution to the general problem is to figure out the right
53761 // places to insert EMMS. This qualifies as a quick hack.
53762
53763 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
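// e.g. on a 32-bit target with SSE2, (store (i64 load p), q) is re-emitted
// below as an f64 load/store pair so that a single 64-bit (movq-style) move
// is used instead of two 32-bit GPR moves (illustrative).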
53764 if (VT.getSizeInBits() != 64)
53765 return SDValue();
53766
53767 const Function &F = DAG.getMachineFunction().getFunction();
53768 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53769 bool F64IsLegal =
53770 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53771
53772 if (!F64IsLegal || Subtarget.is64Bit())
53773 return SDValue();
53774
53775 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53776 cast<LoadSDNode>(St->getValue())->isSimple() &&
53777 St->getChain().hasOneUse() && St->isSimple()) {
53778 auto *Ld = cast<LoadSDNode>(St->getValue());
53779
53780 if (!ISD::isNormalLoad(Ld))
53781 return SDValue();
53782
53783 // Avoid the transformation if there are multiple uses of the loaded value.
53784 if (!Ld->hasNUsesOfValue(1, 0))
53785 return SDValue();
53786
53787 SDLoc LdDL(Ld);
53788 SDLoc StDL(N);
53789
53790 // Remove any range metadata as we're converting to f64 load/store.
53791 Ld->getMemOperand()->clearRanges();
53792
53793 // Lower to a single movq load/store pair.
53794 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53795 Ld->getBasePtr(), Ld->getMemOperand());
53796
53797 // Make sure new load is placed in same chain order.
53798 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53799 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53800 St->getMemOperand());
53801 }
53802
53803 // This is similar to the above case, but here we handle a scalar 64-bit
53804 // integer store that is extracted from a vector on a 32-bit target.
53805 // If we have SSE2, then we can treat it like a floating-point double
53806 // to get past legalization. The execution dependencies fixup pass will
53807 // choose the optimal machine instruction for the store if this really is
53808 // an integer or v2f32 rather than an f64.
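// Sketch: (store (i64 extract_vector_elt X:v2i64, idx)) becomes
// (store (f64 extract_vector_elt (v2f64 bitcast X), idx)), which stays legal
// on 32-bit targets with SSE2.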
53809 if (VT == MVT::i64 &&
53810 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53811 SDValue OldExtract = St->getOperand(1);
53812 SDValue ExtOp0 = OldExtract.getOperand(0);
53813 unsigned VecSize = ExtOp0.getValueSizeInBits();
53814 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53815 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53816 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53817 BitCast, OldExtract.getOperand(1));
53818 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53819 St->getPointerInfo(), St->getBaseAlign(),
53820 St->getMemOperand()->getFlags());
53821 }
53822
53823 return SDValue();
53824}
53825
53826 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53827 TargetLowering::DAGCombinerInfo &DCI,
53828 const X86Subtarget &Subtarget) {
53829 auto *St = cast<MemIntrinsicSDNode>(N);
53830
53831 SDValue StoredVal = N->getOperand(1);
53832 MVT VT = StoredVal.getSimpleValueType();
53833 EVT MemVT = St->getMemoryVT();
53834
53835 // Figure out which elements we demand.
53836 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53837 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53838
53839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53840 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53841 if (N->getOpcode() != ISD::DELETED_NODE)
53842 DCI.AddToWorklist(N);
53843 return SDValue(N, 0);
53844 }
53845
53846 return SDValue();
53847}
53848
53849/// Return 'true' if this vector operation is "horizontal"
53850/// and return the operands for the horizontal operation in LHS and RHS. A
53851/// horizontal operation performs the binary operation on successive elements
53852/// of its first operand, then on successive elements of its second operand,
53853/// returning the resulting values in a vector. For example, if
53854/// A = < float a0, float a1, float a2, float a3 >
53855/// and
53856/// B = < float b0, float b1, float b2, float b3 >
53857/// then the result of doing a horizontal operation on A and B is
53858/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53859/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53860/// A horizontal-op B, for some already available A and B, and if so then LHS is
53861/// set to A, RHS to B, and the routine returns 'true'.
53862static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53863 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53864 bool IsCommutative,
53865 SmallVectorImpl<int> &PostShuffleMask,
53866 bool ForceHorizOp) {
53867 // If either operand is undef, bail out. The binop should be simplified.
53868 if (LHS.isUndef() || RHS.isUndef())
53869 return false;
53870
53871 // Look for the following pattern:
53872 // A = < float a0, float a1, float a2, float a3 >
53873 // B = < float b0, float b1, float b2, float b3 >
53874 // and
53875 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53876 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53877 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53878 // which is A horizontal-op B.
53879
53880 MVT VT = LHS.getSimpleValueType();
53881 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53882 "Unsupported vector type for horizontal add/sub");
53883 unsigned NumElts = VT.getVectorNumElements();
53884
53885 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53886 SmallVectorImpl<int> &ShuffleMask) {
53887 bool UseSubVector = false;
53888 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53889 Op.getOperand(0).getValueType().is256BitVector() &&
53890 llvm::isNullConstant(Op.getOperand(1))) {
53891 Op = Op.getOperand(0);
53892 UseSubVector = true;
53893 }
53894 SmallVector<SDValue, 2> SrcOps;
53895 SmallVector<int, 16> SrcMask, ScaledMask;
53896 SDValue BC = peekThroughBitcasts(Op);
53897 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53898 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53899 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53900 })) {
53901 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53902 if (!UseSubVector && SrcOps.size() <= 2 &&
53903 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53904 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53905 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53906 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53907 }
53908 if (UseSubVector && SrcOps.size() == 1 &&
53909 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53910 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53911 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53912 ShuffleMask.assign(Mask.begin(), Mask.end());
53913 }
53914 }
53915 };
53916
53917 // View LHS in the form
53918 // LHS = VECTOR_SHUFFLE A, B, LMask
53919 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53920 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53921 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53922 SDValue A, B;
53923 SmallVector<int, 16> LMask;
53924 GetShuffle(LHS, A, B, LMask);
53925
53926 // Likewise, view RHS in the form
53927 // RHS = VECTOR_SHUFFLE C, D, RMask
53928 SDValue C, D;
53929 SmallVector<int, 16> RMask;
53930 GetShuffle(RHS, C, D, RMask);
53931
53932 // At least one of the operands should be a vector shuffle.
53933 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53934 if (NumShuffles == 0)
53935 return false;
53936
53937 if (LMask.empty()) {
53938 A = LHS;
53939 for (unsigned i = 0; i != NumElts; ++i)
53940 LMask.push_back(i);
53941 }
53942
53943 if (RMask.empty()) {
53944 C = RHS;
53945 for (unsigned i = 0; i != NumElts; ++i)
53946 RMask.push_back(i);
53947 }
53948
53949 // If we have a unary mask, ensure the other op is set to null.
53950 if (isUndefOrInRange(LMask, 0, NumElts))
53951 B = SDValue();
53952 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53953 A = SDValue();
53954
53955 if (isUndefOrInRange(RMask, 0, NumElts))
53956 D = SDValue();
53957 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53958 C = SDValue();
53959
53960 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53961 // RHS operands and shuffle mask.
53962 if (A != C) {
53963 std::swap(C, D);
53964 ShuffleVectorSDNode::commuteMask(RMask);
53965 }
53966 // Check that the shuffles are both shuffling the same vectors.
53967 if (!(A == C && B == D))
53968 return false;
53969
53970 PostShuffleMask.clear();
53971 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53972
53973 // LHS and RHS are now:
53974 // LHS = shuffle A, B, LMask
53975 // RHS = shuffle A, B, RMask
53976 // Check that the masks correspond to performing a horizontal operation.
53977 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53978 // so we just repeat the inner loop if this is a 256-bit op.
53979 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53980 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53981 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53982 assert((NumEltsPer128BitChunk % 2 == 0) &&
53983 "Vector type should have an even number of elements in each lane");
53984 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53985 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53986 // Ignore undefined components.
53987 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53988 if (LIdx < 0 || RIdx < 0 ||
53989 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53990 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53991 continue;
53992
53993 // Check that successive odd/even elements are being operated on. If not,
53994 // this is not a horizontal operation.
53995 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53996 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53997 return false;
53998
53999 // Compute the post-shuffle mask index based on where the element
54000 // is stored in the HOP result, and where it needs to be moved to.
54001 int Base = LIdx & ~1u;
54002 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
54003 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
54004
54005 // The low half of the 128-bit result must choose from A.
54006 // The high half of the 128-bit result must choose from B,
54007 // unless B is undef. In that case, we are always choosing from A.
54008 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
54009 Index += NumEltsPer64BitChunk;
54010 PostShuffleMask[i + j] = Index;
54011 }
54012 }
54013
54014 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
54015 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
54016
54017 bool IsIdentityPostShuffle =
54018 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
54019 if (IsIdentityPostShuffle)
54020 PostShuffleMask.clear();
54021
54022 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
54023 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
54024 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
54025 return false;
54026
54027 // If the source nodes are already used in HorizOps then always accept this.
54028 // Shuffle folding should merge these back together.
54029 auto FoundHorizUser = [&](SDNode *User) {
54030 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
54031 };
54032 ForceHorizOp =
54033 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54034 llvm::any_of(NewRHS->users(), FoundHorizUser));
54035
54036 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54037 // shuffle the result.
54038 if (!ForceHorizOp &&
54039 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54040 (NumShuffles < 2 || !IsIdentityPostShuffle),
54041 DAG, Subtarget))
54042 return false;
54043
54044 LHS = DAG.getBitcast(VT, NewLHS);
54045 RHS = DAG.getBitcast(VT, NewRHS);
54046 return true;
54047}
54048
54049// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54050 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54051 const X86Subtarget &Subtarget) {
54052 EVT VT = N->getValueType(0);
54053 unsigned Opcode = N->getOpcode();
54054 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54055 SmallVector<int, 8> PostShuffleMask;
54056
54057 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54058 return N->hasOneUse() &&
54059 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54060 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54061 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54062 };
54063
54064 switch (Opcode) {
54065 case ISD::FADD:
54066 case ISD::FSUB:
54067 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54068 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54069 SDValue LHS = N->getOperand(0);
54070 SDValue RHS = N->getOperand(1);
54071 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54072 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54073 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54074 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54075 if (!PostShuffleMask.empty())
54076 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54077 DAG.getUNDEF(VT), PostShuffleMask);
54078 return HorizBinOp;
54079 }
54080 }
54081 break;
54082 case ISD::ADD:
54083 case ISD::SUB:
54084 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54085 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54086 SDValue LHS = N->getOperand(0);
54087 SDValue RHS = N->getOperand(1);
54088 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54089 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54090 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54091 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54092 ArrayRef<SDValue> Ops) {
54093 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54094 };
54095 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54096 {LHS, RHS}, HOpBuilder);
54097 if (!PostShuffleMask.empty())
54098 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54099 DAG.getUNDEF(VT), PostShuffleMask);
54100 return HorizBinOp;
54101 }
54102 }
54103 break;
54104 }
54105
54106 return SDValue();
54107}
54108
54109// Try to combine the following nodes
54110// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54111// <i32 -2147483648[float -0.000000e+00]> 0
54112// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54113// <(load 4 from constant-pool)> t0, t29
54114// [t30: v16i32 = bitcast t27]
54115// t6: v16i32 = xor t7, t27[t30]
54116// t11: v16f32 = bitcast t6
54117// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54118// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54119// t22: v16f32 = bitcast t7
54120// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54121// t24: v32f16 = bitcast t23
54122 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54123 const X86Subtarget &Subtarget) {
54124 EVT VT = N->getValueType(0);
54125 SDValue LHS = N->getOperand(0);
54126 SDValue RHS = N->getOperand(1);
54127 int CombineOpcode =
54128 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54129 auto combineConjugation = [&](SDValue &r) {
54130 if (LHS->getOpcode() == ISD::BITCAST) {
54131 SDValue XOR = LHS.getOperand(0);
54132 if (XOR->getOpcode() == ISD::XOR) {
54133 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54134 if (XORRHS.isConstant()) {
54135 APInt ConjugationInt32 = APInt(32, 0x80000000);
54136 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54137 if ((XORRHS.getBitWidth() == 32 &&
54138 XORRHS.getConstant() == ConjugationInt32) ||
54139 (XORRHS.getBitWidth() == 64 &&
54140 XORRHS.getConstant() == ConjugationInt64)) {
54141 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54142 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54143 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54144 r = DAG.getBitcast(VT, FCMulC);
54145 return true;
54146 }
54147 }
54148 }
54149 }
54150 return false;
54151 };
54152 SDValue Res;
54153 if (combineConjugation(Res))
54154 return Res;
54155 std::swap(LHS, RHS);
54156 if (combineConjugation(Res))
54157 return Res;
54158 return Res;
54159}
54160
54161// Try to combine the following nodes:
54162// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54163 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54164 const X86Subtarget &Subtarget) {
54165 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54166 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54167 Flags.hasAllowContract();
54168 };
54169
54170 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54171 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54172 Flags.hasNoSignedZeros();
54173 };
54174 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54175 APInt AI = APInt(32, 0x80008000);
54176 KnownBits Bits = DAG.computeKnownBits(Op);
54177 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54178 Bits.getConstant() == AI;
54179 };
54180
54181 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54182 !AllowContract(N->getFlags()))
54183 return SDValue();
54184
54185 EVT VT = N->getValueType(0);
54186 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54187 return SDValue();
54188
54189 SDValue LHS = N->getOperand(0);
54190 SDValue RHS = N->getOperand(1);
54191 bool IsConj;
54192 SDValue FAddOp1, MulOp0, MulOp1;
54193 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54194 &IsVectorAllNegativeZero,
54195 &HasNoSignedZero](SDValue N) -> bool {
54196 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54197 return false;
54198 SDValue Op0 = N.getOperand(0);
54199 unsigned Opcode = Op0.getOpcode();
54200 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54201 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54202 MulOp0 = Op0.getOperand(0);
54203 MulOp1 = Op0.getOperand(1);
54204 IsConj = Opcode == X86ISD::VFCMULC;
54205 return true;
54206 }
54207 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54208 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54209 HasNoSignedZero(Op0->getFlags())) ||
54210 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54211 MulOp0 = Op0.getOperand(0);
54212 MulOp1 = Op0.getOperand(1);
54213 IsConj = Opcode == X86ISD::VFCMADDC;
54214 return true;
54215 }
54216 }
54217 return false;
54218 };
54219
54220 if (GetCFmulFrom(LHS))
54221 FAddOp1 = RHS;
54222 else if (GetCFmulFrom(RHS))
54223 FAddOp1 = LHS;
54224 else
54225 return SDValue();
54226
54227 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54228 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54229 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54230 // FIXME: How do we handle when fast math flags of FADD are different from
54231 // CFMUL's?
54232 SDValue CFmul =
54233 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54234 return DAG.getBitcast(VT, CFmul);
54235}
54236
54237/// Do target-specific dag combines on floating-point adds/subs.
54238 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54239 const X86Subtarget &Subtarget) {
54240 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54241 return HOp;
54242
54243 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54244 return COp;
54245
54246 return SDValue();
54247}
54248
54249 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54250 const X86Subtarget &Subtarget) {
54251 EVT VT = N->getValueType(0);
54252 SDValue Src = N->getOperand(0);
54253 EVT SrcVT = Src.getValueType();
54254 SDLoc DL(N);
54255
54256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54257
54258 // Let legalize expand this if it isn't a legal type yet.
54259 if (!TLI.isTypeLegal(VT))
54260 return SDValue();
54261
54262 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54263 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54264 return SDValue();
54265
54266 if (SrcVT == MVT::v2f16) {
54267 SrcVT = MVT::v4f16;
54268 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54269 DAG.getUNDEF(MVT::v2f16));
54270 }
54271
54272 if (SrcVT == MVT::v4f16) {
54273 SrcVT = MVT::v8f16;
54274 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54275 DAG.getUNDEF(MVT::v4f16));
54276 } else if (SrcVT == MVT::v2f32) {
54277 SrcVT = MVT::v4f32;
54278 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54279 DAG.getUNDEF(MVT::v2f32));
54280 } else {
54281 return SDValue();
54282 }
54283
54284 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54285}
54286
54287// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54288// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54289// are able to avoid generating code with MOVABS and large constants in certain
54290// cases.
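// Worked example (illustrative): with C1 = 0xAB0000000000 and C2 = 40,
//   (i32 (trunc (srl (add X:i64, 0xAB0000000000), 40)))
// becomes
//   (and (add (i32 (trunc (srl X, 40))), 0xAB), 0xFFFFFF)
// where the final AND re-zeroes everything above the low 64 - C2 = 24 bits.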
54291 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54292 const SDLoc &DL) {
54293 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54294 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54295 if (!ValidSrlConst)
54296 return SDValue();
54297 unsigned SrlConstVal = *ValidSrlConst;
54298
54299 SDValue Op = N.getOperand(0);
54300 unsigned Opcode = Op.getOpcode();
54301 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54302 "Illegal truncation types");
54303
54304 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54305 !isa<ConstantSDNode>(Op.getOperand(1)))
54306 return SDValue();
54307 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54308
54309 if (SrlConstVal <= 32 ||
54310 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54311 return SDValue();
54312
54313 SDValue OpLhsSrl =
54314 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54315 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54316
54317 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54318 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54319 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54320
54321 if (Opcode == ISD::ADD) {
54322 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54323 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54324 }
54325 return NewOpNode;
54326}
54327
54328/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54329/// the codegen.
54330/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54331/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54332/// anything that is guaranteed to be transformed by DAGCombiner.
54333 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54334 const X86Subtarget &Subtarget,
54335 const SDLoc &DL) {
54336 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54337 SDValue Src = N->getOperand(0);
54338 unsigned SrcOpcode = Src.getOpcode();
54339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54340
54341 EVT VT = N->getValueType(0);
54342 EVT SrcVT = Src.getValueType();
54343
54344 auto IsFreeTruncation = [VT](SDValue Op) {
54345 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54346
54347 // See if this has been extended from a smaller/equal size to
54348 // the truncation size, allowing a truncation to combine with the extend.
54349 unsigned Opcode = Op.getOpcode();
54350 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54351 Opcode == ISD::ZERO_EXTEND) &&
54352 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54353 return true;
54354
54355 // See if this is a single use constant which can be constant folded.
54356 // NOTE: We don't peek through bitcasts here because there is currently
54357 // no support for constant folding truncate+bitcast+vector_of_constants. So
54358 // we'll just end up with a truncate on both operands which will
54359 // get turned back into (truncate (binop)) causing an infinite loop.
54360 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54361 };
54362
54363 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54364 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54365 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54366 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54367 };
54368
54369 // Don't combine if the operation has other uses.
54370 if (!Src.hasOneUse())
54371 return SDValue();
54372
54373 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54374 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54375
54376 if (!VT.isVector())
54377 return SDValue();
54378
54379 // In most cases it's only worth pre-truncating if we're only facing the cost
54380 // of one truncation.
54381 // i.e. if one of the inputs will constant fold or the input is repeated.
54382 switch (SrcOpcode) {
54383 case ISD::MUL:
54384 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54385 // better to truncate if we have the chance.
54386 if (SrcVT.getScalarType() == MVT::i64 &&
54387 TLI.isOperationLegal(SrcOpcode, VT) &&
54388 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54389 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54390 [[fallthrough]];
54391 case ISD::AND:
54392 case ISD::XOR:
54393 case ISD::OR:
54394 case ISD::ADD:
54395 case ISD::SUB: {
54396 SDValue Op0 = Src.getOperand(0);
54397 SDValue Op1 = Src.getOperand(1);
54398 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54399 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54400 return TruncateArithmetic(Op0, Op1);
54401 break;
54402 }
54403 }
54404
54405 return SDValue();
54406}
54407
54408// Try to form a MULHU or MULHS node by looking for
54409// (trunc (srl (mul ext, ext), >= 16))
54410// TODO: This is X86 specific because we want to be able to handle wide types
54411// before type legalization. But we can only do it if the vector will be
54412// legalized via widening/splitting. Type legalization can't handle promotion
54413// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54414// combiner.
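// e.g. (vXi16 trunc (srl (mul (zext vXi16 a), (zext vXi16 b)), 16)) is matched
// below and emitted as (mulhu a, b); a shift amount greater than 16 leaves a
// residual srl by (ShiftAmt - 16) on the result (sketch of the code below).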
54415static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54416 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54417 using namespace llvm::SDPatternMatch;
54418
54419 if (!Subtarget.hasSSE2())
54420 return SDValue();
54421
54422 // Only handle vXi16 types that are at least 128-bits unless they will be
54423 // widened.
54424 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54425 return SDValue();
54426
54427 // Input type should be at least vXi32.
54428 EVT InVT = Src.getValueType();
54429 if (InVT.getVectorElementType().getSizeInBits() < 32)
54430 return SDValue();
54431
54432 // First instruction should be a right shift (by at least 16) of a multiply.
54433 SDValue LHS, RHS;
54434 APInt ShiftAmt;
54435 if (!sd_match(Src,
54436 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54437 return SDValue();
54438
54439 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54440 return SDValue();
54441
54442 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54443
54444 // Count leading sign/zero bits on both inputs - if there are enough then
54445 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54446 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54447 // truncations may actually be free by peeking through to the ext source.
54448 auto IsSext = [&DAG](SDValue V) {
54449 return DAG.ComputeMaxSignificantBits(V) <= 16;
54450 };
54451 auto IsZext = [&DAG](SDValue V) {
54452 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54453 };
54454
54455 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54456 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54457 if (!IsSigned && !IsUnsigned)
54458 return SDValue();
54459
54460 // Check if both inputs are extensions, which will be removed by truncation.
54461 auto isOpTruncateFree = [](SDValue Op) {
54462 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54463 Op.getOpcode() == ISD::ZERO_EXTEND)
54464 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54465 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54466 };
54467 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54468
54469 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54470 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54471 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54472 // will have to split anyway.
54473 unsigned InSizeInBits = InVT.getSizeInBits();
54474 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54475 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54476 (InSizeInBits % 16) == 0) {
54477 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54478 InVT.getSizeInBits() / 16);
54479 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54480 DAG.getBitcast(BCVT, RHS));
54481 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54482 return DAG.getNode(ISD::SRL, DL, VT, Res,
54483 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54484 }
54485
54486 // Truncate back to source type.
54487 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54488 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54489
54490 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54491 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54492 return DAG.getNode(ISD::SRL, DL, VT, Res,
54493 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54494}
54495
54496// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54497// from one vector with signed bytes from another vector, adds together
54498// adjacent pairs of 16-bit products, and saturates the result before
54499// truncating to 16-bits.
54500//
54501// Which looks something like this:
54502// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54503// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54504 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54505 const X86Subtarget &Subtarget,
54506 const SDLoc &DL) {
54507 if (!VT.isVector() || !Subtarget.hasSSSE3())
54508 return SDValue();
54509
54510 unsigned NumElems = VT.getVectorNumElements();
54511 EVT ScalarVT = VT.getVectorElementType();
54512 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54513 return SDValue();
54514
54515 SDValue SSatVal = detectSSatPattern(In, VT);
54516 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54517 return SDValue();
54518
54519 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54520 // of multiplies from even/odd elements.
54521 SDValue N0 = SSatVal.getOperand(0);
54522 SDValue N1 = SSatVal.getOperand(1);
54523
54524 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54525 return SDValue();
54526
54527 SDValue N00 = N0.getOperand(0);
54528 SDValue N01 = N0.getOperand(1);
54529 SDValue N10 = N1.getOperand(0);
54530 SDValue N11 = N1.getOperand(1);
54531
54532 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54533 // Canonicalize zero_extend to LHS.
54534 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54535 std::swap(N00, N01);
54536 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54537 std::swap(N10, N11);
54538
54539 // Ensure we have a zero_extend and a sign_extend.
54540 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54541 N01.getOpcode() != ISD::SIGN_EXTEND ||
54542 N10.getOpcode() != ISD::ZERO_EXTEND ||
54543 N11.getOpcode() != ISD::SIGN_EXTEND)
54544 return SDValue();
54545
54546 // Peek through the extends.
54547 N00 = N00.getOperand(0);
54548 N01 = N01.getOperand(0);
54549 N10 = N10.getOperand(0);
54550 N11 = N11.getOperand(0);
54551
54552 // Ensure the extend is from vXi8.
54553 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54554 N01.getValueType().getVectorElementType() != MVT::i8 ||
54555 N10.getValueType().getVectorElementType() != MVT::i8 ||
54556 N11.getValueType().getVectorElementType() != MVT::i8)
54557 return SDValue();
54558
54559 // All inputs should be build_vectors.
54560 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54561 N01.getOpcode() != ISD::BUILD_VECTOR ||
54562 N10.getOpcode() != ISD::BUILD_VECTOR ||
54563 N11.getOpcode() != ISD::BUILD_VECTOR)
54564 return SDValue();
54565
54566 // N00/N10 are zero extended. N01/N11 are sign extended.
54567
54568 // For each element, we need to ensure we have an odd element from one vector
54569 // multiplied by the odd element of another vector and the even element from
54570 // one of the same vectors being multiplied by the even element from the
54571 // other vector. So we need to make sure for each element i, this operator
54572 // is being performed:
54573 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54574 SDValue ZExtIn, SExtIn;
54575 for (unsigned i = 0; i != NumElems; ++i) {
54576 SDValue N00Elt = N00.getOperand(i);
54577 SDValue N01Elt = N01.getOperand(i);
54578 SDValue N10Elt = N10.getOperand(i);
54579 SDValue N11Elt = N11.getOperand(i);
54580 // TODO: Be more tolerant to undefs.
54581 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54582 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54583 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54584 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54585 return SDValue();
54586 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54587 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54588 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54589 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54590 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54591 return SDValue();
54592 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54593 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54594 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54595 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54596 // Add is commutative so indices can be reordered.
54597 if (IdxN00 > IdxN10) {
54598 std::swap(IdxN00, IdxN10);
54599 std::swap(IdxN01, IdxN11);
54600 }
54601 // N0 indices must be the even element. N1 indices must be the next odd element.
54602 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54603 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54604 return SDValue();
54605 SDValue N00In = N00Elt.getOperand(0);
54606 SDValue N01In = N01Elt.getOperand(0);
54607 SDValue N10In = N10Elt.getOperand(0);
54608 SDValue N11In = N11Elt.getOperand(0);
54609 // First time we find an input capture it.
54610 if (!ZExtIn) {
54611 ZExtIn = N00In;
54612 SExtIn = N01In;
54613 }
54614 if (ZExtIn != N00In || SExtIn != N01In ||
54615 ZExtIn != N10In || SExtIn != N11In)
54616 return SDValue();
54617 }
54618
54619 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54620 EVT ExtVT = Ext.getValueType();
54621 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54622 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54623 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54624 DAG.getVectorIdxConstant(0, DL));
54625 }
54626 };
54627 ExtractVec(ZExtIn);
54628 ExtractVec(SExtIn);
54629
54630 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54631 ArrayRef<SDValue> Ops) {
54632 // Shrink by adding truncate nodes and let DAGCombine fold with the
54633 // sources.
54634 EVT InVT = Ops[0].getValueType();
54635 assert(InVT.getScalarType() == MVT::i8 &&
54636 "Unexpected scalar element type");
54637 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54638 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54639 InVT.getVectorNumElements() / 2);
54640 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54641 };
54642 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54643 PMADDBuilder);
54644}
54645
54646 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54647 const X86Subtarget &Subtarget) {
54648 EVT VT = N->getValueType(0);
54649 SDValue Src = N->getOperand(0);
54650 SDLoc DL(N);
54651
54652 // Attempt to pre-truncate inputs to arithmetic ops instead.
54653 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54654 return V;
54655
54656 // Try to detect PMADD
54657 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54658 return PMAdd;
54659
54660 // Try to combine truncation with signed/unsigned saturation.
54661 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54662 return Val;
54663
54664 // Try to combine PMULHUW/PMULHW for vXi16.
54665 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54666 return V;
54667
54668 // The bitcast source is a direct mmx result.
54669 // Detect bitcasts between i32 and x86mmx.
54670 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54671 SDValue BCSrc = Src.getOperand(0);
54672 if (BCSrc.getValueType() == MVT::x86mmx)
54673 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54674 }
54675
54676 return SDValue();
54677}
54678
54679 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54680 TargetLowering::DAGCombinerInfo &DCI) {
54681 EVT VT = N->getValueType(0);
54682 SDValue In = N->getOperand(0);
54683 SDLoc DL(N);
54684
54685 if (SDValue SSatVal = detectSSatPattern(In, VT))
54686 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54687 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54688 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54689
54690 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54691 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54692 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54693 return SDValue(N, 0);
54694
54695 return SDValue();
54696}
54697
54698/// Returns the negated value if the node \p N flips sign of FP value.
54699///
54700/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54701/// or FSUB(0, x)
54702/// AVX512F does not have FXOR, so FNEG is lowered as
54703/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54704/// In this case we go through all bitcasts.
54705/// This also recognizes splat of a negated value and returns the splat of that
54706/// value.
54707static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54708 if (N->getOpcode() == ISD::FNEG)
54709 return N->getOperand(0);
54710
54711 // Don't recurse exponentially.
54712 if (Depth > SelectionDAG::MaxRecursionDepth)
54713 return SDValue();
54714
54715 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54716
54717 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54718 EVT VT = Op->getValueType(0);
54719
54720 // Make sure the element size doesn't change.
54721 if (VT.getScalarSizeInBits() != ScalarSize)
54722 return SDValue();
54723
54724 unsigned Opc = Op.getOpcode();
54725 switch (Opc) {
54726 case ISD::VECTOR_SHUFFLE: {
54727 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54728 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54729 if (!Op.getOperand(1).isUndef())
54730 return SDValue();
54731 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54732 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54733 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54734 cast<ShuffleVectorSDNode>(Op)->getMask());
54735 break;
54736 }
54737 case ISD::INSERT_VECTOR_ELT: {
54738 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54739 // -V, INDEX).
54740 SDValue InsVector = Op.getOperand(0);
54741 SDValue InsVal = Op.getOperand(1);
54742 if (!InsVector.isUndef())
54743 return SDValue();
54744 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54745 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54746 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54747 NegInsVal, Op.getOperand(2));
54748 break;
54749 }
54750 case ISD::FSUB:
54751 case ISD::XOR:
54752 case X86ISD::FXOR: {
54753 SDValue Op1 = Op.getOperand(1);
54754 SDValue Op0 = Op.getOperand(0);
54755
54756 // For XOR and FXOR, we want to check if constant
54757 // bits of Op1 are sign bit masks. For FSUB, we
54758 // have to check if constant bits of Op0 are sign
54759 // bit masks and hence we swap the operands.
54760 if (Opc == ISD::FSUB)
54761 std::swap(Op0, Op1);
54762
54763 APInt UndefElts;
54764 SmallVector<APInt, 16> EltBits;
54765 // Extract constant bits and see if they are all
54766 // sign bit masks. Ignore the undef elements.
54767 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54768 /* AllowWholeUndefs */ true,
54769 /* AllowPartialUndefs */ false)) {
54770 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54771 if (!UndefElts[I] && !EltBits[I].isSignMask())
54772 return SDValue();
54773
54774 // Only allow bitcast from correctly-sized constant.
54775 Op0 = peekThroughBitcasts(Op0);
54776 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54777 return Op0;
54778 }
54779 break;
54780 } // case
54781 } // switch
54782
54783 return SDValue();
54784}
54785
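// Maps an FMA-family opcode to the opcode obtained when the multiplication,
// the accumulator and/or the whole result is negated. For example, negating
// the result of FMA(a, b, c) = a*b + c yields FNMSUB(a, b, c) = -(a*b) - c,
// as encoded in the NegRes mapping below.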
54786static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54787 bool NegRes) {
54788 if (NegMul) {
54789 switch (Opcode) {
54790 // clang-format off
54791 default: llvm_unreachable("Unexpected opcode");
54792 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54793 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54794 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54795 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54796 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54797 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54798 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54799 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54800 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54801 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54802 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54803 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54804 // clang-format on
54805 }
54806 }
54807
54808 if (NegAcc) {
54809 switch (Opcode) {
54810 // clang-format off
54811 default: llvm_unreachable("Unexpected opcode");
54812 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54813 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54814 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54815 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54816 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54817 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54818 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54819 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54820 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54821 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54822 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54823 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54824 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54825 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54826 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54827 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54828 // clang-format on
54829 }
54830 }
54831
54832 if (NegRes) {
54833 switch (Opcode) {
54834 // For accuracy reasons, we never combine fneg and fma under strict FP.
54835 // clang-format off
54836 default: llvm_unreachable("Unexpected opcode");
54837 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54838 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54839 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54840 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54841 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54842 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54843 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54844 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54845 // clang-format on
54846 }
54847 }
54848
54849 return Opcode;
54850}
54851
54852/// Do target-specific dag combines on floating point negations.
54853 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54854 TargetLowering::DAGCombinerInfo &DCI,
54855 const X86Subtarget &Subtarget) {
54856 EVT OrigVT = N->getValueType(0);
54857 SDValue Arg = isFNEG(DAG, N);
54858 if (!Arg)
54859 return SDValue();
54860
54861 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54862 EVT VT = Arg.getValueType();
54863 EVT SVT = VT.getScalarType();
54864 SDLoc DL(N);
54865
54866 // Let legalize expand this if it isn't a legal type yet.
54867 if (!TLI.isTypeLegal(VT))
54868 return SDValue();
54869
54870 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54871 // use of a constant by performing (-0 - A*B) instead.
54872 // FIXME: Check rounding control flags as well once it becomes available.
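// i.e. (fneg (fmul x, y)) is emitted as FNMSUB(x, y, 0) = -(x*y) - 0, so no
// sign-mask constant pool load is needed (sketch of the code directly below).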
54873 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54874 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54875 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54876 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54877 Arg.getOperand(1), Zero);
54878 return DAG.getBitcast(OrigVT, NewNode);
54879 }
54880
54881 bool CodeSize = DAG.shouldOptForSize();
54882 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54883 if (SDValue NegArg =
54884 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54885 return DAG.getBitcast(OrigVT, NegArg);
54886
54887 return SDValue();
54888}
54889
54890 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54891 bool LegalOperations,
54892 bool ForCodeSize,
54893 NegatibleCost &Cost,
54894 unsigned Depth) const {
54895 // fneg patterns are removable even if they have multiple uses.
54896 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54897 Cost = NegatibleCost::Cheaper;
54898 return DAG.getBitcast(Op.getValueType(), Arg);
54899 }
54900
54901 EVT VT = Op.getValueType();
54902 EVT SVT = VT.getScalarType();
54903 unsigned Opc = Op.getOpcode();
54904 SDNodeFlags Flags = Op.getNode()->getFlags();
54905 switch (Opc) {
54906 case ISD::FMA:
54907 case X86ISD::FMSUB:
54908 case X86ISD::FNMADD:
54909 case X86ISD::FNMSUB:
54910 case X86ISD::FMADD_RND:
54911 case X86ISD::FMSUB_RND:
54912 case X86ISD::FNMADD_RND:
54913 case X86ISD::FNMSUB_RND: {
54914 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54915 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54916 !isOperationLegal(ISD::FMA, VT))
54917 break;
54918
54919 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54920 // if it may have signed zeros.
54921 if (!Flags.hasNoSignedZeros())
54922 break;
54923
54924 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54925 // keep temporary nodes alive.
54926 std::list<HandleSDNode> Handles;
54927
54928 // This is always negatible for free but we might be able to remove some
54929 // extra operand negations as well.
54930 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54931 for (int i = 0; i != 3; ++i) {
54932 NewOps[i] = getCheaperNegatedExpression(
54933 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54934 if (!!NewOps[i])
54935 Handles.emplace_back(NewOps[i]);
54936 }
54937
54938 bool NegA = !!NewOps[0];
54939 bool NegB = !!NewOps[1];
54940 bool NegC = !!NewOps[2];
54941 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54942
54943 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54944 : NegatibleCost::Neutral;
54945
54946 // Fill in the non-negated ops with the original values.
54947 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54948 if (!NewOps[i])
54949 NewOps[i] = Op.getOperand(i);
54950 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54951 }
54952 case X86ISD::FRCP:
54953 if (SDValue NegOp0 =
54954 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54955 ForCodeSize, Cost, Depth + 1))
54956 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54957 break;
54958 }
54959
54960 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54961 ForCodeSize, Cost, Depth);
54962}
54963
54964 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54965 const X86Subtarget &Subtarget) {
54966 MVT VT = N->getSimpleValueType(0);
54967 // If we have integer vector types available, use the integer opcodes.
54968 if (!VT.isVector() || !Subtarget.hasSSE2())
54969 return SDValue();
54970
54971 SDLoc dl(N);
54972 MVT IntVT = VT.changeVectorElementTypeToInteger();
54973 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54974 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54975 unsigned IntOpcode;
54976 switch (N->getOpcode()) {
54977 // clang-format off
54978 default: llvm_unreachable("Unexpected FP logic op");
54979 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54980 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54981 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54982 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54983 // clang-format on
54984 }
54985 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54986 return DAG.getBitcast(VT, IntOp);
54987}
54988
54989/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54990 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54991 if (N->getOpcode() != ISD::XOR)
54992 return SDValue();
54993
54994 SDValue LHS = N->getOperand(0);
54995 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54996 return SDValue();
54997
54998 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54999 X86::CondCode(LHS->getConstantOperandVal(0)));
55000 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
55001}
55002
55003 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
55004 const X86Subtarget &Subtarget) {
55005 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
55006 "Invalid opcode for combining with CTLZ");
55007 if (Subtarget.hasFastLZCNT())
55008 return SDValue();
55009
55010 EVT VT = N->getValueType(0);
55011 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
55012 (VT != MVT::i64 || !Subtarget.is64Bit()))
55013 return SDValue();
55014
55015 SDValue N0 = N->getOperand(0);
55016 SDValue N1 = N->getOperand(1);
55017
55018 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
55019 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
55020 return SDValue();
55021
55022 SDValue OpCTLZ;
55023 SDValue OpSizeTM1;
55024
55025 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
55026 OpCTLZ = N1;
55027 OpSizeTM1 = N0;
55028 } else if (N->getOpcode() == ISD::SUB) {
55029 return SDValue();
55030 } else {
55031 OpCTLZ = N0;
55032 OpSizeTM1 = N1;
55033 }
55034
55035 if (!OpCTLZ.hasOneUse())
55036 return SDValue();
55037 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55038 if (!C)
55039 return SDValue();
55040
55041 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55042 return SDValue();
55043 EVT OpVT = VT;
55044 SDValue Op = OpCTLZ.getOperand(0);
55045 if (VT == MVT::i8) {
55046 // Zero extend to i32 since there is not an i8 bsr.
55047 OpVT = MVT::i32;
55048 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55049 }
55050
55051 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55052 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55053 if (VT == MVT::i8)
55054 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55055
55056 return Op;
55057}
55058
55059 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55060 TargetLowering::DAGCombinerInfo &DCI,
55061 const X86Subtarget &Subtarget) {
55062 SDValue N0 = N->getOperand(0);
55063 SDValue N1 = N->getOperand(1);
55064 EVT VT = N->getValueType(0);
55065 SDLoc DL(N);
55066
55067 // If this is SSE1 only convert to FXOR to avoid scalarization.
55068 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55069 return DAG.getBitcast(MVT::v4i32,
55070 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55071 DAG.getBitcast(MVT::v4f32, N0),
55072 DAG.getBitcast(MVT::v4f32, N1)));
55073 }
55074
55075 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55076 return Cmp;
55077
55078 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55079 return R;
55080
55081 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55082 return R;
55083
55084 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55085 return R;
55086
55087 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55088 DAG, DCI, Subtarget))
55089 return FPLogic;
55090
55091 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55092 return R;
55093
55094 if (DCI.isBeforeLegalizeOps())
55095 return SDValue();
55096
55097 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55098 return SetCC;
55099
55100 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55101 return R;
55102
55103 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55104 return RV;
55105
55106 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55108 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55109 N0.getOperand(0).getValueType().isVector() &&
55110 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55111 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55112 return DAG.getBitcast(
55113 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55114 }
55115
55116 // Handle AVX512 mask widening.
55117 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55118 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55119 VT.getVectorElementType() == MVT::i1 &&
55120 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55121 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55122 return DAG.getNode(
55123 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55124 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55125 N0.getOperand(2));
55126 }
55127
55128 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55129 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55130 // TODO: Under what circumstances could this be performed in DAGCombine?
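// For example (i8 zext'd to i32): xor (zext (xor X, 0x0F)), 0xF0 becomes
// xor (zext X), 0xFF, leaving a single constant to fold.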
55131 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55132 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55133 SDValue TruncExtSrc = N0.getOperand(0);
55134 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55135 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55136 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55137 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55138 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55139 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55140 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55141 }
55142 }
55143
55144 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55145 return R;
55146
55147 return combineFneg(N, DAG, DCI, Subtarget);
55148}
55149
55150 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55151 TargetLowering::DAGCombinerInfo &DCI,
55152 const X86Subtarget &Subtarget) {
55153 SDValue N0 = N->getOperand(0);
55154 EVT VT = N->getValueType(0);
55155
55156 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
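// Reversing the vXi1 elements reverses the bits of the scalar, e.g.
// bitreverse (i8 bitcast (v8i1 X)) == bitcast (shuffle X, <7,6,5,4,3,2,1,0>).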
55157 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55158 SDValue Src = N0.getOperand(0);
55159 EVT SrcVT = Src.getValueType();
55160 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55161 (DCI.isBeforeLegalize() ||
55162 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55163 Subtarget.hasSSSE3()) {
55164 unsigned NumElts = SrcVT.getVectorNumElements();
55165 SmallVector<int, 32> ReverseMask(NumElts);
55166 for (unsigned I = 0; I != NumElts; ++I)
55167 ReverseMask[I] = (NumElts - 1) - I;
55168 SDValue Rev =
55169 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55170 return DAG.getBitcast(VT, Rev);
55171 }
55172 }
55173
55174 return SDValue();
55175}
55176
55177// Various combines to try to convert to avgceilu.
55180 const X86Subtarget &Subtarget) {
55181 unsigned Opcode = N->getOpcode();
55182 SDValue N0 = N->getOperand(0);
55183 SDValue N1 = N->getOperand(1);
55184 EVT VT = N->getValueType(0);
55185 EVT SVT = VT.getScalarType();
55186 SDLoc DL(N);
55187
55188 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55189 // Only useful on vXi8 which doesn't have good SRA handling.
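// XORing with the sign mask (0x80 per byte) maps the signed range [-128,127]
// onto the unsigned range [0,255], so a rounding unsigned average followed by
// flipping the sign bits back yields the rounding signed average.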
55190 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55191 APInt SignBit = APInt::getSignMask(8);
55192 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55193 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55194 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55195 return DAG.getNode(ISD::XOR, DL, VT,
55196 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55197 }
55198
55199 return SDValue();
55200}
55201
55202 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55203 TargetLowering::DAGCombinerInfo &DCI,
55204 const X86Subtarget &Subtarget) {
55205 EVT VT = N->getValueType(0);
55206 unsigned NumBits = VT.getSizeInBits();
55207
55208 // TODO - Constant Folding.
55209
55210 // Simplify the inputs.
55211 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55212 APInt DemandedMask(APInt::getAllOnes(NumBits));
55213 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55214 return SDValue(N, 0);
55215
55216 return SDValue();
55217}
55218
55219 static bool isNullFPScalarOrVectorConst(SDValue V) {
55220 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55221}
55222
55223/// If a value is a scalar FP zero or a vector FP zero (potentially including
55224/// undefined elements), return a zero constant that may be used to fold away
55225/// that value. In the case of a vector, the returned constant will not contain
55226/// undefined elements even if the input parameter does. This makes it suitable
55227/// to be used as a replacement operand with operations (eg, bitwise-and) where
55228/// an undef should not propagate.
55229 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55230 const X86Subtarget &Subtarget) {
55231 if (!isNullFPScalarOrVectorConst(V))
55232 return SDValue();
55233
55234 if (V.getValueType().isVector())
55235 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55236
55237 return V;
55238}
55239
55240 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55241 const X86Subtarget &Subtarget) {
55242 SDValue N0 = N->getOperand(0);
55243 SDValue N1 = N->getOperand(1);
55244 EVT VT = N->getValueType(0);
55245 SDLoc DL(N);
55246
55247 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55248 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55249 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55250 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55251 return SDValue();
55252
55253 auto isAllOnesConstantFP = [](SDValue V) {
55254 if (V.getSimpleValueType().isVector())
55255 return ISD::isBuildVectorAllOnes(V.getNode());
55256 auto *C = dyn_cast<ConstantFPSDNode>(V);
55257 return C && C->getConstantFPValue()->isAllOnesValue();
55258 };
55259
55260 // fand (fxor X, -1), Y --> fandn X, Y
55261 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55262 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55263
55264 // fand X, (fxor Y, -1) --> fandn Y, X
55265 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55266 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55267
55268 return SDValue();
55269}
55270
55271/// Do target-specific dag combines on X86ISD::FAND nodes.
55272 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55273 const X86Subtarget &Subtarget) {
55274 // FAND(0.0, x) -> 0.0
55275 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55276 return V;
55277
55278 // FAND(x, 0.0) -> 0.0
55279 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55280 return V;
55281
55282 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55283 return V;
55284
55285 return lowerX86FPLogicOp(N, DAG, Subtarget);
55286}
55287
55288/// Do target-specific dag combines on X86ISD::FANDN nodes.
55289 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55290 const X86Subtarget &Subtarget) {
55291 // FANDN(0.0, x) -> x
55292 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55293 return N->getOperand(1);
55294
55295 // FANDN(x, 0.0) -> 0.0
55296 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55297 return V;
55298
55299 return lowerX86FPLogicOp(N, DAG, Subtarget);
55300}
55301
55302/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55303 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55304 TargetLowering::DAGCombinerInfo &DCI,
55305 const X86Subtarget &Subtarget) {
55306 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55307
55308 // F[X]OR(0.0, x) -> x
55309 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55310 return N->getOperand(1);
55311
55312 // F[X]OR(x, 0.0) -> x
55313 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55314 return N->getOperand(0);
55315
55316 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55317 return NewVal;
55318
55319 return lowerX86FPLogicOp(N, DAG, Subtarget);
55320}
55321
55322/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55323 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55324 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55325
55326 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55327 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55328 !DAG.getTarget().Options.NoSignedZerosFPMath)
55329 return SDValue();
55330
55331 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55332 // into FMINC and FMAXC, which are Commutative operations.
55333 unsigned NewOp = 0;
55334 switch (N->getOpcode()) {
55335 default: llvm_unreachable("unknown opcode");
55336 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55337 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55338 }
55339
55340 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55341 N->getOperand(0), N->getOperand(1));
55342}
55343
55344 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55345 const X86Subtarget &Subtarget) {
55346 EVT VT = N->getValueType(0);
55347 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55348 return SDValue();
55349
55350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55351
55352 auto IsMinMaxLegal = [&](EVT VT) {
55353 if (!TLI.isTypeLegal(VT))
55354 return false;
55355 return VT.getScalarType() != MVT::f16 ||
55356 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55357 };
55358
55359 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55360 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55361 (Subtarget.hasFP16() && VT == MVT::f16) ||
55362 (VT.isVector() && IsMinMaxLegal(VT))))
55363 return SDValue();
55364
55365 SDValue Op0 = N->getOperand(0);
55366 SDValue Op1 = N->getOperand(1);
55367 SDLoc DL(N);
55368 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55369
55370 // If we don't have to respect NaN inputs, this is a direct translation to x86
55371 // min/max instructions.
55372 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55373 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55374
55375 // If one of the operands is known non-NaN use the native min/max instructions
55376 // with the non-NaN input as second operand.
55377 if (DAG.isKnownNeverNaN(Op1))
55378 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55379 if (DAG.isKnownNeverNaN(Op0))
55380 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55381
55382 // If we have to respect NaN inputs, this takes at least 3 instructions.
55383 // Favor a library call when operating on a scalar and minimizing code size.
55384 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55385 return SDValue();
55386
55387 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55388 VT);
55389
55390 // There are 4 possibilities involving NaN inputs, and these are the required
55391 // outputs:
55392 // Op1
55393 // Num NaN
55394 // ----------------
55395 // Num | Max | Op0 |
55396 // Op0 ----------------
55397 // NaN | Op1 | NaN |
55398 // ----------------
55399 //
55400 // The SSE FP max/min instructions were not designed for this case, but rather
55401 // to implement:
55402 // Min = Op1 < Op0 ? Op1 : Op0
55403 // Max = Op1 > Op0 ? Op1 : Op0
55404 //
55405 // So they always return Op0 if either input is a NaN. However, we can still
55406 // use those instructions for fmaxnum by selecting away a NaN input.
55407
55408 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55409 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55410 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55411
55412 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55413 // are NaN, the NaN value of Op1 is the result.
55414 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55415}
55416
55417 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55418 TargetLowering::DAGCombinerInfo &DCI) {
55419 EVT VT = N->getValueType(0);
55420 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55421
55422 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55423 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55424 return SDValue(N, 0);
55425
55426 // Convert a full vector load into vzload when not all bits are needed.
55427 SDValue In = N->getOperand(0);
55428 MVT InVT = In.getSimpleValueType();
55429 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55430 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55431 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55432 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55433 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55434 MVT MemVT = MVT::getIntegerVT(NumBits);
55435 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55436 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55437 SDLoc dl(N);
55438 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55439 DAG.getBitcast(InVT, VZLoad));
55440 DCI.CombineTo(N, Convert);
55441 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55442 DCI.recursivelyDeleteUnusedNodes(LN);
55443 return SDValue(N, 0);
55444 }
55445 }
55446
55447 return SDValue();
55448}
55449
55450 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55451 TargetLowering::DAGCombinerInfo &DCI) {
55452 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55453 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55454 EVT VT = N->getValueType(0);
55455
55456 // Convert a full vector load into vzload when not all bits are needed.
55457 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55458 MVT InVT = In.getSimpleValueType();
55459 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55460 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55461 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55462 LoadSDNode *LN = cast<LoadSDNode>(In);
55463 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55464 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55465 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55466 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55467 SDLoc dl(N);
55468 if (IsStrict) {
55469 SDValue Convert =
55470 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55471 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55472 DCI.CombineTo(N, Convert, Convert.getValue(1));
55473 } else {
55474 SDValue Convert =
55475 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55476 DCI.CombineTo(N, Convert);
55477 }
55478 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55479 DCI.recursivelyDeleteUnusedNodes(LN);
55480 return SDValue(N, 0);
55481 }
55482 }
55483
55484 return SDValue();
55485}
55486
55487/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55488 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55489 TargetLowering::DAGCombinerInfo &DCI,
55490 const X86Subtarget &Subtarget) {
55491 SDValue N0 = N->getOperand(0);
55492 SDValue N1 = N->getOperand(1);
55493 MVT VT = N->getSimpleValueType(0);
55494 int NumElts = VT.getVectorNumElements();
55495 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55497 SDLoc DL(N);
55498
55499 // ANDNP(undef, x) -> 0
55500 // ANDNP(x, undef) -> 0
55501 if (N0.isUndef() || N1.isUndef())
55502 return DAG.getConstant(0, DL, VT);
55503
55504 // ANDNP(0, x) -> x
55505 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55506 return N1;
55507
55508 // ANDNP(x, 0) -> 0
55509 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55510 return DAG.getConstant(0, DL, VT);
55511
55512 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55513 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55514 return DAG.getNOT(DL, N0, VT);
55515
55516 // Turn ANDNP back to AND if input is inverted.
55517 if (SDValue Not = IsNOT(N0, DAG))
55518 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55519
55520 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55521 // to make use of predicated selects.
55522 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55523 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55524 SDValue Src = N0.getOperand(0);
55525 EVT SrcVT = Src.getValueType();
55526 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55527 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55528 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55529 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55530 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55531 getZeroVector(VT, Subtarget, DAG, DL));
55532 }
55533
55534 // Constant Folding
55535 APInt Undefs0, Undefs1;
55536 SmallVector<APInt> EltBits0, EltBits1;
55537 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55538 /*AllowWholeUndefs*/ true,
55539 /*AllowPartialUndefs*/ true)) {
55540 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55541 /*AllowWholeUndefs*/ true,
55542 /*AllowPartialUndefs*/ true)) {
55543 SmallVector<APInt> ResultBits;
55544 for (int I = 0; I != NumElts; ++I)
55545 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55546 return getConstVector(ResultBits, VT, DAG, DL);
55547 }
55548
55549 // Constant fold NOT(N0) to allow us to use AND.
55550 // Ensure this is only performed if we can confirm that the bitcasted source
55551 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55552 if (N0->hasOneUse()) {
55553 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55554 if (BC0.getOpcode() != ISD::BITCAST) {
55555 for (APInt &Elt : EltBits0)
55556 Elt = ~Elt;
55557 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55558 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55559 }
55560 }
55561 }
55562
55563 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55564 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55565 SDValue Op(N, 0);
55566 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55567 return Res;
55568
55569 // If either operand is a constant mask, then only the elements that aren't
55570 // zero are actually demanded by the other operand.
55571 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55572 APInt UndefElts;
55573 SmallVector<APInt> EltBits;
55574 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55575 APInt DemandedElts = APInt::getAllOnes(NumElts);
55576 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55577 EltBits)) {
55578 DemandedBits.clearAllBits();
55579 DemandedElts.clearAllBits();
55580 for (int I = 0; I != NumElts; ++I) {
55581 if (UndefElts[I]) {
55582 // We can't assume an undef src element gives an undef dst - the
55583 // other src might be zero.
55584 DemandedBits.setAllBits();
55585 DemandedElts.setBit(I);
55586 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55587 (!Invert && !EltBits[I].isZero())) {
55588 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55589 DemandedElts.setBit(I);
55590 }
55591 }
55592 }
55593 return std::make_pair(DemandedBits, DemandedElts);
55594 };
55595 APInt Bits0, Elts0;
55596 APInt Bits1, Elts1;
55597 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55598 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55599
55600 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55601 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55602 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55603 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55604 if (N->getOpcode() != ISD::DELETED_NODE)
55605 DCI.AddToWorklist(N);
55606 return SDValue(N, 0);
55607 }
55608 }
55609
55610 // Folds for better commutativity:
55611 if (N1->hasOneUse()) {
55612 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55613 if (SDValue Not = IsNOT(N1, DAG))
55614 return DAG.getNOT(
55615 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55616
55617 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55618 // Zero out elements by setting the PSHUFB mask value to 0xFF.
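// N0 is known to be all-ones or all-zeros per element here, so ORing it into
// the shuffle control sets the MSB of the control byte exactly in the lanes
// that ANDNP would clear, and PSHUFB then writes zero to those lanes.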
55619 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55620 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55621 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55622 EVT ShufVT = BC1.getValueType();
55623 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55624 DAG.getBitcast(ShufVT, N0));
55625 SDValue NewShuf =
55626 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55627 return DAG.getBitcast(VT, NewShuf);
55628 }
55629 }
55630 }
55631
55632 return SDValue();
55633}
55634
55635 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55636 TargetLowering::DAGCombinerInfo &DCI) {
55637 SDValue N1 = N->getOperand(1);
55638
55639 // BT ignores high bits in the bit index operand.
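// For example, with a 32-bit BT only the low 5 bits of the index matter, so
// only the low Log2(BitWidth) bits of N1 are demanded below.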
55640 unsigned BitWidth = N1.getValueSizeInBits();
55641 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55642 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55643 if (N->getOpcode() != ISD::DELETED_NODE)
55644 DCI.AddToWorklist(N);
55645 return SDValue(N, 0);
55646 }
55647
55648 return SDValue();
55649}
55650
55651 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55652 TargetLowering::DAGCombinerInfo &DCI) {
55653 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55654 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55655
55656 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55658 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55659 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55660 if (N->getOpcode() != ISD::DELETED_NODE)
55661 DCI.AddToWorklist(N);
55662 return SDValue(N, 0);
55663 }
55664
55665 // Convert a full vector load into vzload when not all bits are needed.
55666 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55667 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55668 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55669 SDLoc dl(N);
55670 if (IsStrict) {
55671 SDValue Convert = DAG.getNode(
55672 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55673 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55674 DCI.CombineTo(N, Convert, Convert.getValue(1));
55675 } else {
55676 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55677 DAG.getBitcast(MVT::v8i16, VZLoad));
55678 DCI.CombineTo(N, Convert);
55679 }
55680
55681 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55682 DCI.recursivelyDeleteUnusedNodes(LN);
55683 return SDValue(N, 0);
55684 }
55685 }
55686 }
55687
55688 return SDValue();
55689}
55690
55691// Try to combine sext_in_reg of a cmov of constants by extending the constants.
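// For example, sext_in_reg i8 of (cmov C0, C1) becomes a cmov of the
// sign-extended constants, so the extension folds away at compile time.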
55692 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55693 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55694
55695 EVT DstVT = N->getValueType(0);
55696
55697 SDValue N0 = N->getOperand(0);
55698 SDValue N1 = N->getOperand(1);
55699 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55700
55701 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55702 return SDValue();
55703
55704 // Look through single use any_extends / truncs.
55705 SDValue IntermediateBitwidthOp;
55706 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55707 N0.hasOneUse()) {
55708 IntermediateBitwidthOp = N0;
55709 N0 = N0.getOperand(0);
55710 }
55711
55712 // See if we have a single use cmov.
55713 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55714 return SDValue();
55715
55716 SDValue CMovOp0 = N0.getOperand(0);
55717 SDValue CMovOp1 = N0.getOperand(1);
55718
55719 // Make sure both operands are constants.
55720 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55721 !isa<ConstantSDNode>(CMovOp1.getNode()))
55722 return SDValue();
55723
55724 SDLoc DL(N);
55725
55726 // If we looked through an any_extend/trunc above, apply the same op to the constants.
55727 if (IntermediateBitwidthOp) {
55728 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55729 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55730 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55731 }
55732
55733 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55734 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55735
55736 EVT CMovVT = DstVT;
55737 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55738 if (DstVT == MVT::i16) {
55739 CMovVT = MVT::i32;
55740 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55741 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55742 }
55743
55744 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55745 N0.getOperand(2), N0.getOperand(3));
55746
55747 if (CMovVT != DstVT)
55748 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55749
55750 return CMov;
55751}
55752
55753 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55754 const X86Subtarget &Subtarget) {
55755 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55756
55757 if (SDValue V = combineSextInRegCmov(N, DAG))
55758 return V;
55759
55760 EVT VT = N->getValueType(0);
55761 SDValue N0 = N->getOperand(0);
55762 SDValue N1 = N->getOperand(1);
55763 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55764 SDLoc dl(N);
55765
55766 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on
55767 // both SSE and AVX2 since there is no sign-extended shift right
55768 // operation on a vector with 64-bit elements.
55769 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55770 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55771 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55772 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55773 SDValue N00 = N0.getOperand(0);
55774
55775 // EXTLOAD has a better solution on AVX2:
55776 // it may be replaced with an X86ISD::VSEXT node.
55777 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55778 if (!ISD::isNormalLoad(N00.getNode()))
55779 return SDValue();
55780
55781 // Attempt to promote any comparison mask ops before moving the
55782 // SIGN_EXTEND_INREG in the way.
55783 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55784 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55785
55786 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55787 SDValue Tmp =
55788 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55789 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55790 }
55791 }
55792 return SDValue();
55793}
55794
55795/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55796/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55797/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55798/// opportunities to combine math ops, use an LEA, or use a complex addressing
55799/// mode. This can eliminate extend, add, and shift instructions.
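/// For example: (zext (add nuw (x, 16))) from i32 to i64 becomes
/// (add nuw (zext x), 16), and the 64-bit add can then fold into an LEA or a
/// complex addressing mode together with a later add or shl.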
55800 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55801 const X86Subtarget &Subtarget) {
55802 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55803 Ext->getOpcode() != ISD::ZERO_EXTEND)
55804 return SDValue();
55805
55806 // TODO: This should be valid for other integer types.
55807 EVT VT = Ext->getValueType(0);
55808 if (VT != MVT::i64)
55809 return SDValue();
55810
55811 SDValue Add = Ext->getOperand(0);
55812 if (Add.getOpcode() != ISD::ADD)
55813 return SDValue();
55814
55815 SDValue AddOp0 = Add.getOperand(0);
55816 SDValue AddOp1 = Add.getOperand(1);
55817 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55818 bool NSW = Add->getFlags().hasNoSignedWrap();
55819 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55820 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55821 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55822
55823 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55824 // into the 'zext'
55825 if ((Sext && !NSW) || (!Sext && !NUW))
55826 return SDValue();
55827
55828 // Having a constant operand to the 'add' ensures that we are not increasing
55829 // the instruction count because the constant is extended for free below.
55830 // A constant operand can also become the displacement field of an LEA.
55831 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55832 if (!AddOp1C)
55833 return SDValue();
55834
55835 // Don't make the 'add' bigger if there's no hope of combining it with some
55836 // other 'add' or 'shl' instruction.
55837 // TODO: It may be profitable to generate simpler LEA instructions in place
55838 // of single 'add' instructions, but the cost model for selecting an LEA
55839 // currently has a high threshold.
55840 bool HasLEAPotential = false;
55841 for (auto *User : Ext->users()) {
55842 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55843 HasLEAPotential = true;
55844 break;
55845 }
55846 }
55847 if (!HasLEAPotential)
55848 return SDValue();
55849
55850 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55851 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55852 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55853 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55854
55855 // The wider add is guaranteed to not wrap because both operands are
55856 // sign-extended.
55857 SDNodeFlags Flags;
55858 Flags.setNoSignedWrap(NSW);
55859 Flags.setNoUnsignedWrap(NUW);
55860 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55861}
55862
55863// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55864// operands and the result of CMOV is not used anywhere else - promote CMOV
55865// itself instead of promoting its result. This could be beneficial, because:
55866// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55867// (or more) pseudo-CMOVs only when they go one-after-another and
55868// getting rid of result extension code after CMOV will help that.
55869// 2) Promotion of constant CMOV arguments is free, hence the
55870// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55871 // 3) A 16-bit CMOV encoding is 4 bytes, a 32-bit CMOV is 3 bytes, so this
55872 // promotion is also good in terms of code size.
55873 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
55874// promotion).
55875 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55876 SDValue CMovN = Extend->getOperand(0);
55877 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55878 return SDValue();
55879
55880 EVT TargetVT = Extend->getValueType(0);
55881 unsigned ExtendOpcode = Extend->getOpcode();
55882 SDLoc DL(Extend);
55883
55884 EVT VT = CMovN.getValueType();
55885 SDValue CMovOp0 = CMovN.getOperand(0);
55886 SDValue CMovOp1 = CMovN.getOperand(1);
55887
55888 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55889 !isa<ConstantSDNode>(CMovOp1.getNode()))
55890 return SDValue();
55891
55892 // Only extend to i32 or i64.
55893 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55894 return SDValue();
55895
55896 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55897 // are free.
55898 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55899 return SDValue();
55900
55901 // If this is a zero extend to i64, we should only extend to i32 and use a free
55902 // zero extend to finish.
55903 EVT ExtendVT = TargetVT;
55904 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55905 ExtendVT = MVT::i32;
55906
55907 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55908 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55909
55910 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55911 CMovN.getOperand(2), CMovN.getOperand(3));
55912
55913 // Finish extending if needed.
55914 if (ExtendVT != TargetVT)
55915 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55916
55917 return Res;
55918}
55919
55920// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55921// result type.
55922 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55923 const X86Subtarget &Subtarget) {
55924 SDValue N0 = N->getOperand(0);
55925 EVT VT = N->getValueType(0);
55926 SDLoc dl(N);
55927
55928 // Only do this combine with AVX512 for vector extends.
55929 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55930 return SDValue();
55931
55932 // Only combine legal element types.
55933 EVT SVT = VT.getVectorElementType();
55934 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55935 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55936 return SDValue();
55937
55938 // We don't have a CMPP instruction for vXf16.
55939 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55940 return SDValue();
55941 // We can only do this if the vector size is 256 bits or less.
55942 unsigned Size = VT.getSizeInBits();
55943 if (Size > 256 && Subtarget.useAVX512Regs())
55944 return SDValue();
55945
55946 EVT N00VT = N0.getOperand(0).getValueType();
55947
55948 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55949 // those are the only integer compares we have.
55950 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55951 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55952 return SDValue();
55953
55954 // Only do this combine if the extension will be fully consumed by the setcc.
55955 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55956 if (Size != MatchingVecType.getSizeInBits())
55957 return SDValue();
55958
55959 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55960
55961 if (N->getOpcode() == ISD::ZERO_EXTEND)
55962 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55963
55964 return Res;
55965}
55966
55967 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55968 TargetLowering::DAGCombinerInfo &DCI,
55969 const X86Subtarget &Subtarget) {
55970 SDValue N0 = N->getOperand(0);
55971 EVT VT = N->getValueType(0);
55972 SDLoc DL(N);
55973
55974 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55975 if (!DCI.isBeforeLegalizeOps() &&
55976 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55977 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55978 N0->getOperand(1));
55979 bool ReplaceOtherUses = !N0.hasOneUse();
55980 DCI.CombineTo(N, Setcc);
55981 // Replace other uses with a truncate of the widened setcc_carry.
55982 if (ReplaceOtherUses) {
55983 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55984 N0.getValueType(), Setcc);
55985 DCI.CombineTo(N0.getNode(), Trunc);
55986 }
55987
55988 return SDValue(N, 0);
55989 }
55990
55991 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55992 return NewCMov;
55993
55994 if (!DCI.isBeforeLegalizeOps())
55995 return SDValue();
55996
55997 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55998 return V;
55999
56000 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
56001 DAG, DCI, Subtarget))
56002 return V;
56003
56004 if (VT.isVector()) {
56005 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
56006 return R;
56007
56009 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
56010 }
56011
56012 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56013 return NewAdd;
56014
56015 return SDValue();
56016}
56017
56018// Inverting a constant vector is profitable if it can be eliminated and the
56019// inverted vector is already present in DAG. Otherwise, it will be loaded
56020// anyway.
56021//
56022// We determine which of the values can be completely eliminated and invert it.
56023// If both are eliminable, select a vector with the first negative element.
56024 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
56025 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
56026 "ConstantFP build vector expected");
56027 // Check if we can eliminate V. We assume that if a value is only used in
56028 // FMAs, we can eliminate it, since this function is invoked for each FMA
56029 // with this vector.
56030 auto IsNotFMA = [](SDNode *User) {
56031 return User->getOpcode() != ISD::FMA &&
56032 User->getOpcode() != ISD::STRICT_FMA;
56033 };
56034 if (llvm::any_of(V->users(), IsNotFMA))
56035 return SDValue();
56036
56037 SmallVector<SDValue, 8> Ops;
56038 EVT VT = V.getValueType();
56039 EVT EltVT = VT.getVectorElementType();
56040 for (const SDValue &Op : V->op_values()) {
56041 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56042 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56043 } else {
56044 assert(Op.isUndef());
56045 Ops.push_back(DAG.getUNDEF(EltVT));
56046 }
56047 }
56048
56049 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56050 if (!NV)
56051 return SDValue();
56052
56053 // If an inverted version cannot be eliminated, choose it instead of the
56054 // original version.
56055 if (llvm::any_of(NV->users(), IsNotFMA))
56056 return SDValue(NV, 0);
56057
56058 // If the inverted version also can be eliminated, we have to consistently
56059 // prefer one of the values. We prefer a constant whose first non-undef
56060 // element is negative.
56061 // N.B. We need to skip undefs that may precede a value.
56062 for (const SDValue &Op : V->op_values()) {
56063 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56064 if (Cst->isNegative())
56065 return SDValue();
56066 break;
56067 }
56068 }
56069 return SDValue(NV, 0);
56070}
56071
56072 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56073 TargetLowering::DAGCombinerInfo &DCI,
56074 const X86Subtarget &Subtarget) {
56075 SDLoc dl(N);
56076 EVT VT = N->getValueType(0);
56077 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56078 bool IsStrict = N->isTargetOpcode()
56079 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56080 : N->isStrictFPOpcode();
56081
56082 // Let legalize expand this if it isn't a legal type yet.
56083 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56084 if (!TLI.isTypeLegal(VT))
56085 return SDValue();
56086
56087 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56088 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56089 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56090
56091 // If the operation allows fast-math and the target does not support FMA,
56092 // split this into mul+add to avoid libcall(s).
56093 SDNodeFlags Flags = N->getFlags();
56094 if (!IsStrict && Flags.hasAllowReassociation() &&
56095 TLI.isOperationExpand(ISD::FMA, VT)) {
56096 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56097 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56098 }
56099
56100 EVT ScalarVT = VT.getScalarType();
56101 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56102 !Subtarget.hasAnyFMA()) &&
56103 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56104 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56105 return SDValue();
56106
56107 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56108 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56109 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56110 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56111 CodeSize)) {
56112 V = NegV;
56113 return true;
56114 }
56115 // Look through extract_vector_elts. If it comes from an FNEG, create a
56116 // new extract from the FNEG input.
56117 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56118 isNullConstant(V.getOperand(1))) {
56119 SDValue Vec = V.getOperand(0);
56120 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56121 Vec, DAG, LegalOperations, CodeSize)) {
56122 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56123 NegV, V.getOperand(1));
56124 return true;
56125 }
56126 }
56127 // Lookup if there is an inverted version of constant vector V in DAG.
56128 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56129 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56130 V = NegV;
56131 return true;
56132 }
56133 }
56134 return false;
56135 };
56136
56137 // Do not convert the passthru input of scalar intrinsics.
56138 // FIXME: We could allow negations of the lower element only.
56139 bool NegA = invertIfNegative(A);
56140 // Create a dummy use for A so that in the process of negating B or C
56141 // recursively, it is not deleted.
56142 HandleSDNode NegAHandle(A);
56143 bool NegB = invertIfNegative(B);
56144 // Similar to A, get a handle on B.
56145 HandleSDNode NegBHandle(B);
56146 bool NegC = invertIfNegative(C);
56147
56148 if (!NegA && !NegB && !NegC)
56149 return SDValue();
56150
56151 unsigned NewOpcode =
56152 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56153
56154 // Propagate fast-math-flags to new FMA node.
56155 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56156 if (IsStrict) {
56157 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56158 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56159 {N->getOperand(0), A, B, C});
56160 } else {
56161 if (N->getNumOperands() == 4)
56162 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56163 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56164 }
56165}
56166
56167// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56168// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56169 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56170 TargetLowering::DAGCombinerInfo &DCI) {
56171 SDLoc dl(N);
56172 EVT VT = N->getValueType(0);
56173 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56175 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56176
56177 SDValue N2 = N->getOperand(2);
56178
56179 SDValue NegN2 =
56180 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56181 if (!NegN2)
56182 return SDValue();
56183 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56184
56185 if (N->getNumOperands() == 4)
56186 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56187 NegN2, N->getOperand(3));
56188 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56189 NegN2);
56190}
56191
56192 // Try to widen the build vector and bitcast it to the type of the zext.
56193 // This is a special case for the 128-bit vector types. The intention is to
56194 // remove the zext and replace it with a bitcast to the wider type. While
56195 // lowering, the bitcast is removed and the extra computation due to the zext
56196 // is avoided. For example:
56197 // zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
56198 // build_vector (x, 0, y, 0, z, 0, w, 0))
56199 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56200
56201 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56202 return SDValue();
56203
56204 EVT ExtendVT = Extend->getValueType(0);
56205
56206 SDValue BV = Extend->getOperand(0);
56207 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56208 return SDValue();
56209
56210 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56211 // If the build vector has undef elements, we cannot widen it.
56212 // The widening would create a vector with more undef elements, which
56213 // is not valid.
56214 return SDValue();
56215 }
56216
56217 if (!all_of(BV->op_values(),
56218 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56219 // If the build vector has any element other than ISD::LOAD, we cannot
56220 // widen it.
56221 return SDValue();
56222 }
56223
56224 SDLoc dl(BV);
56225 EVT VT = BV.getValueType();
56226 EVT EltVT = BV.getOperand(0).getValueType();
56227 unsigned NumElts = VT.getVectorNumElements();
56228
56229 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56230
56231 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56232 TargetLowering::TypeWidenVector)
56233 return SDValue();
56234
56235 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56236 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56237
56238 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56239 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56240 // Fill the new elements with Zero.
56241 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56242 // Compute the step to place the elements in the right place and control the
56243 // iteration.
56244 unsigned step = WidenNumElts / NumElts;
56245 if (WidenVT.is128BitVector()) {
56246 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56247 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56248 i--, j -= step) {
56249 SDValue temp = NewOps[i];
56250 NewOps[i] = NewOps[j];
56251 NewOps[j] = temp;
56252 }
56253 // Create new build vector with WidenVT and NewOps
56254 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56255 // Replace the old build vector with the new one. Bitcast the
56256 // new build vector to the type of the zext.
56257 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56258 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56259 return NewBV;
56260 }
56261 }
56262 return SDValue();
56263}
56264
56265 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56266 TargetLowering::DAGCombinerInfo &DCI,
56267 const X86Subtarget &Subtarget) {
56268 SDLoc dl(N);
56269 SDValue N0 = N->getOperand(0);
56270 EVT VT = N->getValueType(0);
56271
56272 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56273 // FIXME: Is this needed? We don't seem to have any tests for it.
56274 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56275 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56276 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56277 N0->getOperand(1));
56278 bool ReplaceOtherUses = !N0.hasOneUse();
56279 DCI.CombineTo(N, Setcc);
56280 // Replace other uses with a truncate of the widened setcc_carry.
56281 if (ReplaceOtherUses) {
56282 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56283 N0.getValueType(), Setcc);
56284 DCI.CombineTo(N0.getNode(), Trunc);
56285 }
56286
56287 return SDValue(N, 0);
56288 }
56289
56290 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56291 return NewCMov;
56292
56293 if (DCI.isBeforeLegalizeOps())
56294 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56295 return V;
56296
56297 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56298 DAG, DCI, Subtarget))
56299 return V;
56300
56301 if (VT.isVector())
56302 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56303 return R;
56304
56305 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56306 return NewAdd;
56307
56308 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56309 return R;
56310
56311 // TODO: Combine with any target/faux shuffle.
56312 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56314 SDValue N00 = N0.getOperand(0);
56315 SDValue N01 = N0.getOperand(1);
56316 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56317 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56318 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56319 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56320 return concatSubVectors(N00, N01, DAG, dl);
56321 }
56322 }
56323
56324 if (SDValue V = widenBuildVec(N, DAG))
56325 return V;
56326
56327 return SDValue();
56328}
56329
56330/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56331/// pre-promote its result type since vXi1 vectors don't get promoted
56332/// during type legalization.
56333 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56334 SDValue RHS, ISD::CondCode CC,
56335 const SDLoc &DL, SelectionDAG &DAG,
56336 const X86Subtarget &Subtarget) {
56337 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56338 VT.getVectorElementType() == MVT::i1 &&
56339 (OpVT.getVectorElementType() == MVT::i8 ||
56340 OpVT.getVectorElementType() == MVT::i16)) {
56341 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56342 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56343 }
56344 return SDValue();
56345}
56346
56347// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56348// eq/ne) is generated when using an integer as a mask. Instead of generating a
56349// broadcast + vptest, we can directly move the integer to a mask register.
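// For example, with mask constants <1, 2, 4, ..., 128> and an eq/ne compare
// against zero, lane I of the result is just bit I of the broadcast integer,
// so the integer can be shifted/truncated and bitcast straight to a k-mask.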
56350 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56351 const SDLoc &DL, SelectionDAG &DAG,
56352 const X86Subtarget &Subtarget) {
56353 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56354 return SDValue();
56355
56356 if (!Subtarget.hasAVX512())
56357 return SDValue();
56358
56359 if (Op0.getOpcode() != ISD::AND)
56360 return SDValue();
56361
56362 SDValue Broadcast = Op0.getOperand(0);
56363 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56364 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56365 return SDValue();
56366
56367 SDValue Load = Op0.getOperand(1);
56368 EVT LoadVT = Load.getSimpleValueType();
56369
56370 APInt UndefElts;
56371 SmallVector<APInt, 32> EltBits;
56372 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56373 UndefElts, EltBits,
56374 /*AllowWholeUndefs*/ true,
56375 /*AllowPartialUndefs*/ false) ||
56376 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56377 return SDValue();
56378
56379 // Check if the constant pool contains only powers of 2 starting from some
56380 // 2^N. The table may also contain undefs because of widening of vector
56381 // operands.
56382 unsigned N = EltBits[0].logBase2();
56383 unsigned Len = UndefElts.getBitWidth();
56384 for (unsigned I = 1; I != Len; ++I) {
56385 if (UndefElts[I]) {
56386 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56387 return SDValue();
56388 break;
56389 }
56390
56391 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56392 return SDValue();
56393 }
56394
56395 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56396 SDValue BroadcastOp;
56397 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56398 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56399 Broadcast, DAG.getVectorIdxConstant(0, DL));
56400 } else {
56401 BroadcastOp = Broadcast.getOperand(0);
56402 if (BroadcastOp.getValueType().isVector())
56403 return SDValue();
56404 }
56405
56406 SDValue Masked = BroadcastOp;
56407 if (N != 0) {
56408 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56409 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56410
56411 if (NumDefinedElts > BroadcastOpBitWidth)
56412 return SDValue();
56413
56414 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56415 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56416 DAG.getConstant(N, DL, BroadcastOpVT));
56417 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56418 DAG.getConstant(Mask, DL, BroadcastOpVT));
56419 }
56420 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56421 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56422 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56423 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56424
56425 if (CC == ISD::SETEQ)
56426 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56427
56428 if (VT != MVT::v16i1)
56429 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56430 DAG.getVectorIdxConstant(0, DL));
56431
56432 return Bitcast;
56433}
56434
56435 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56436 TargetLowering::DAGCombinerInfo &DCI,
56437 const X86Subtarget &Subtarget) {
56438 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56439 const SDValue LHS = N->getOperand(0);
56440 const SDValue RHS = N->getOperand(1);
56441 EVT VT = N->getValueType(0);
56442 EVT OpVT = LHS.getValueType();
56443 SDLoc DL(N);
56444
56445 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56446 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56447 Subtarget))
56448 return V;
56449 }
56450
56451 if (VT == MVT::i1) {
56452 X86::CondCode X86CC;
56453 if (SDValue V =
56454 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56455 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56456 }
56457
56458 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56459 if (OpVT.isScalarInteger()) {
56460 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56461 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56462 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56463 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56464 if (N0.getOperand(0) == N1)
56465 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56466 N0.getOperand(1));
56467 if (N0.getOperand(1) == N1)
56468 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56469 N0.getOperand(0));
56470 }
56471 return SDValue();
56472 };
56473 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56474 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56475 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56476 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56477
56478 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56479 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56480 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56481 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56482 if (N0.getOperand(0) == N1)
56483 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56484 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56485 if (N0.getOperand(1) == N1)
56486 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56487 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56488 }
56489 return SDValue();
56490 };
56491 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56492 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56493 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56494 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56495
56496 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56497 // cmpne(trunc(x),C) --> cmpne(x,C)
56498 // iff x upper bits are zero.
56499 if (LHS.getOpcode() == ISD::TRUNCATE &&
56500 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56502 EVT SrcVT = LHS.getOperand(0).getValueType();
56503 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56504 OpVT.getScalarSizeInBits());
56505 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56506 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56507 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56508 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56509 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56510 }
56511
56512 // With C as a power of 2 and C != 0 and C != INT_MIN:
56513 // icmp eq Abs(X) C ->
56514 // (icmp eq A, C) | (icmp eq A, -C)
56515 // icmp ne Abs(X) C ->
56516 // (icmp ne A, C) & (icmp ne A, -C)
56517 // Both of these patterns can be better optimized in
56518 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56519 // integers which is checked above.
56520 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56521 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56522 const APInt &CInt = C->getAPIntValue();
56523 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56524 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56525 SDValue BaseOp = LHS.getOperand(0);
56526 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56527 SDValue SETCC1 = DAG.getSetCC(
56528 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56529 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56530 SETCC0, SETCC1);
56531 }
56532 }
56533 }
56534 }
56535 }
56536
56537 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56538 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56539 // Using temporaries to avoid messing up operand ordering for later
56540 // transformations if this doesn't work.
56541 SDValue Op0 = LHS;
56542 SDValue Op1 = RHS;
56543 ISD::CondCode TmpCC = CC;
56544 // Put build_vector on the right.
56545 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56546 std::swap(Op0, Op1);
56547 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56548 }
56549
56550 bool IsSEXT0 =
56551 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56552 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56553 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56554
56555 if (IsSEXT0 && IsVZero1) {
56556 assert(VT == Op0.getOperand(0).getValueType() &&
56557 "Unexpected operand type");
56558 if (TmpCC == ISD::SETGT)
56559 return DAG.getConstant(0, DL, VT);
56560 if (TmpCC == ISD::SETLE)
56561 return DAG.getConstant(1, DL, VT);
56562 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56563 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56564
56565 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56566 "Unexpected condition code!");
56567 return Op0.getOperand(0);
56568 }
56569
56570 if (IsVZero1)
56571 if (SDValue V =
56572 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56573 return V;
56574 }
56575
56576 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56577 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often better
56578 // to use `PCMPGT` if the result is meant to stay in a vector (and if it's going
56579 // to a mask, there are signed AVX512 comparisons).
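// For example, if both sides are known non-negative (say, zero-extended from a
// narrower type), then X u< Y is equivalent to X s< Y and can use PCMPGT.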
56580 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56581 bool CanMakeSigned = false;
56582 if (ISD::isUnsignedIntSetCC(CC)) {
56583 KnownBits CmpKnown =
56585 // If we know LHS/RHS share the same sign bit at each element we can
56586 // make this signed.
56587 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56588 // across all lanes. So a pattern where the sign varies from lane to
56589 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56590 // missed. We could get around this by demanding each lane
56591 // independently, but this isn't the most important optimization and
56592 // that may eat into compile time.
56593 CanMakeSigned =
56594 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56595 }
56596 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56597 SDValue LHSOut = LHS;
56598 SDValue RHSOut = RHS;
56599 ISD::CondCode NewCC = CC;
56600 switch (CC) {
56601 case ISD::SETGE:
56602 case ISD::SETUGE:
56603 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56604 /*NSW*/ true))
56605 LHSOut = NewLHS;
56606 else if (SDValue NewRHS = incDecVectorConstant(
56607 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56608 RHSOut = NewRHS;
56609 else
56610 break;
56611
56612 [[fallthrough]];
56613 case ISD::SETUGT:
56614 NewCC = ISD::SETGT;
56615 break;
56616
56617 case ISD::SETLE:
56618 case ISD::SETULE:
56619 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56620 /*NSW*/ true))
56621 LHSOut = NewLHS;
56622 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56623 /*NSW*/ true))
56624 RHSOut = NewRHS;
56625 else
56626 break;
56627
56628 [[fallthrough]];
56629 case ISD::SETULT:
56630 // Will be swapped to SETGT in LowerVSETCC*.
56631 NewCC = ISD::SETLT;
56632 break;
56633 default:
56634 break;
56635 }
56636 if (NewCC != CC) {
56637 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56638 NewCC, DL, DAG, Subtarget))
56639 return R;
56640 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56641 }
56642 }
56643 }
56644
56645 if (SDValue R =
56646 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56647 return R;
56648
56649 // In the middle end transforms:
56650 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56651 // -> `(icmp ult (add x, -C), 2)`
56652 // Likewise inverted cases with `ugt`.
56653 //
56654 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56655 // in worse codegen. So, undo the middle-end transform and go back to `(or
56656 // (icmp eq), (icmp eq))` form.
56657 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56658 // the xmm approach.
56659 //
56660 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56661 // ne))` as it doesn't end up as a net instruction win.
56662 // TODO: We might want to do this for avx512 as well if we `sext` the result.
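// For example, (icmp ult (add X, -5), 2) is rewritten below as
// (or (icmp eq X, 5), (icmp eq X, 6)), i.e. C0 = -AddC and C1 = C0 + 1.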
56663 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56664 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56665 !Subtarget.hasAVX512() &&
56666 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56667 Subtarget.hasAVX2()) &&
56668 LHS.hasOneUse()) {
56669
56670 APInt CmpC;
56671 SDValue AddC = LHS.getOperand(1);
56672 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56674 // See which form we have depending on the constant/condition.
56675 SDValue C0 = SDValue();
56676 SDValue C1 = SDValue();
56677
56678 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56679 // we will end up generating an additional constant. Keeping the
56680 // current form has a slight latency cost, but it's probably worth saving a
56681 // constant.
56684 // Pass
56685 }
56686 // Normal Cases
56687 else if ((CC == ISD::SETULT && CmpC == 2) ||
56688 (CC == ISD::SETULE && CmpC == 1)) {
56689 // These will constant fold.
56690 C0 = DAG.getNegative(AddC, DL, OpVT);
56691 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56692 DAG.getAllOnesConstant(DL, OpVT));
56693 }
56694 // Inverted Cases
56695 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56696 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56697 // These will constant fold.
56698 C0 = DAG.getNOT(DL, AddC, OpVT);
56699 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56700 DAG.getAllOnesConstant(DL, OpVT));
56701 }
56702 if (C0 && C1) {
56703 SDValue NewLHS =
56704 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56705 SDValue NewRHS =
56706 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56707 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56708 }
56709 }
56710 }
56711
56712 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56713 // to avoid scalarization via legalization because v4i32 is not a legal type.
56714 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56715 LHS.getValueType() == MVT::v4f32)
56716 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56717
56718 // X pred 0.0 --> X pred -X
56719 // If the negation of X already exists, use it in the comparison. This removes
56720 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56721 // instructions in patterns with a 'select' node.
56722 if (isNullFPScalarOrVectorConst(RHS)) {
56723 SDVTList FNegVT = DAG.getVTList(OpVT);
56724 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56725 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56726 }
56727
56728 return SDValue();
56729}
56730
56731 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56732 TargetLowering::DAGCombinerInfo &DCI,
56733 const X86Subtarget &Subtarget) {
56734 SDValue Src = N->getOperand(0);
56735 MVT SrcVT = Src.getSimpleValueType();
56736 MVT VT = N->getSimpleValueType(0);
56737 unsigned NumBits = VT.getScalarSizeInBits();
56738 unsigned NumElts = SrcVT.getVectorNumElements();
56739 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56740 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56741
56742 // Perform constant folding.
56743 APInt UndefElts;
56744 SmallVector<APInt, 32> EltBits;
56745 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56746 /*AllowWholeUndefs*/ true,
56747 /*AllowPartialUndefs*/ true)) {
56748 APInt Imm(32, 0);
56749 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56750 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56751 Imm.setBit(Idx);
56752
56753 return DAG.getConstant(Imm, SDLoc(N), VT);
56754 }
56755
56756 // Look through int->fp bitcasts that don't change the element width.
56757 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56758 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56759 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56760 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56761
56762 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56763 // with scalar comparisons.
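// Illustrative sketch: for a v4i32 source, movmsk(not(x)) equals
// movmsk(x) ^ 0b1111, so the vector NOT is folded into a scalar XOR with
// the low-NumElts mask built below.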
56764 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56765 SDLoc DL(N);
56766 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56767 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56768 return DAG.getNode(ISD::XOR, DL, VT,
56769 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56770 DAG.getConstant(NotMask, DL, VT));
56771 }
56772
56773 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56774 // results with scalar comparisons.
56775 if (Src.getOpcode() == X86ISD::PCMPGT &&
56776 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56777 SDLoc DL(N);
56778 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56779 return DAG.getNode(ISD::XOR, DL, VT,
56780 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56781 DAG.getConstant(NotMask, DL, VT));
56782 }
56783
56784 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56785 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56786 // iff pow2splat(c1).
56787 // Use KnownBits to determine if only a single bit is non-zero
56788 // in each element (pow2 or zero), and shift that bit to the msb.
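// Illustrative example (assumed constants): for v16i8 x and a splat
// c1 == 0x10, only bit 4 of each element can be set, so shifting left by
// countMinLeadingZeros() == 3 moves that bit into the sign bit; the
// compare with c1 (or 0) then reduces to an xor/not at the msb before MOVMSK.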
56789 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56790 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56791 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56792 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56793 if (KnownLHS.countMaxPopulation() == 1 &&
56794 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56795 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56796 SDLoc DL(N);
56797 MVT ShiftVT = SrcVT;
56798 SDValue ShiftLHS = Src.getOperand(0);
56799 SDValue ShiftRHS = Src.getOperand(1);
56800 if (ShiftVT.getScalarType() == MVT::i8) {
56801 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56802 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56803 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56804 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56805 }
56806 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56807 ShiftLHS, ShiftAmt, DAG);
56808 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56809 ShiftRHS, ShiftAmt, DAG);
56810 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56811 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56812 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56813 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56814 }
56815 }
56816
56817 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56818 if (N->isOnlyUserOf(Src.getNode())) {
56820 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56821 APInt UndefElts;
56822 SmallVector<APInt, 32> EltBits;
56823 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56824 UndefElts, EltBits)) {
56825 APInt Mask = APInt::getZero(NumBits);
56826 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56827 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56828 Mask.setBit(Idx);
56829 }
56830 SDLoc DL(N);
56831 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56832 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56833 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56834 DAG.getConstant(Mask, DL, VT));
56835 }
56836 }
56837 }
56838
56839 // Simplify the inputs.
56840 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56841 APInt DemandedMask(APInt::getAllOnes(NumBits));
56842 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56843 return SDValue(N, 0);
56844
56845 return SDValue();
56846}
56847
56848 static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
56849 TargetLowering::DAGCombinerInfo &DCI,
56850 const X86Subtarget &Subtarget) {
56851 MVT VT = N->getSimpleValueType(0);
56852 unsigned NumBits = VT.getScalarSizeInBits();
56853
56854 // Simplify the inputs.
56855 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56856 APInt DemandedMask(APInt::getAllOnes(NumBits));
56857 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56858 return SDValue(N, 0);
56859
56860 return SDValue();
56861}
56862
56863 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
56864 TargetLowering::DAGCombinerInfo &DCI) {
56865 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
56866 SDValue Mask = MemOp->getMask();
56867
56868 // With vector masks we only demand the upper bit of the mask.
56869 if (Mask.getScalarValueSizeInBits() != 1) {
56870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56871 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56872 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56873 if (N->getOpcode() != ISD::DELETED_NODE)
56874 DCI.AddToWorklist(N);
56875 return SDValue(N, 0);
56876 }
56877 }
56878
56879 return SDValue();
56880}
56881
56882 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
56883 SDValue Index, SDValue Base, SDValue Scale,
56884 SelectionDAG &DAG) {
56885 SDLoc DL(GorS);
56886
56887 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56888 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56889 Gather->getMask(), Base, Index, Scale } ;
56890 return DAG.getMaskedGather(Gather->getVTList(),
56891 Gather->getMemoryVT(), DL, Ops,
56892 Gather->getMemOperand(),
56893 Gather->getIndexType(),
56894 Gather->getExtensionType());
56895 }
56896 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56897 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56898 Scatter->getMask(), Base, Index, Scale };
56899 return DAG.getMaskedScatter(Scatter->getVTList(),
56900 Scatter->getMemoryVT(), DL,
56901 Ops, Scatter->getMemOperand(),
56902 Scatter->getIndexType(),
56903 Scatter->isTruncatingStore());
56904}
56905
56906 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
56907 TargetLowering::DAGCombinerInfo &DCI) {
56908 SDLoc DL(N);
56909 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56910 SDValue Index = GorS->getIndex();
56911 SDValue Base = GorS->getBasePtr();
56912 SDValue Scale = GorS->getScale();
56913 EVT IndexVT = Index.getValueType();
56914 EVT IndexSVT = IndexVT.getVectorElementType();
56915 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56917 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56918
56919 if (DCI.isBeforeLegalize()) {
56920 // Attempt to move shifted index into the address scale, allows further
56921 // index truncation below.
56922 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56923 isa<ConstantSDNode>(Scale)) {
56924 unsigned ScaleAmt = Scale->getAsZExtVal();
56925 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56926 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56927 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56928 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56929 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56930 if (N->getOpcode() != ISD::DELETED_NODE)
56931 DCI.AddToWorklist(N);
56932 return SDValue(N, 0);
56933 }
56934 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56935 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56936 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56937 SDValue ShAmt = Index.getOperand(1);
56938 SDValue NewShAmt =
56939 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56940 DAG.getConstant(1, DL, ShAmt.getValueType()));
56941 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56942 Index.getOperand(0), NewShAmt);
56943 SDValue NewScale =
56944 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56945 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56946 }
56947 }
56948 }
56949
56950 // Shrink indices if they are larger than 32-bits.
56951 // Only do this before legalize types since v2i64 could become v2i32.
56952 // FIXME: We could check that the type is legal if we're after legalize
56953 // types, but then we would need to construct test cases where that happens.
56954 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56955 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56956
56957 // FIXME: We could support more than just constant fold, but we need to be
56958 // careful with costing. A truncate that can be optimized out would be
56959 // fine. Otherwise we might only want to create a truncate if it avoids
56960 // a split.
56961 if (SDValue TruncIndex =
56962 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56963 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56964
56965 // Shrink any sign/zero extends from a type of 32 bits or smaller to a type
56966 // larger than 32 bits if there are sufficient sign bits. Only do this before
56967 // legalizing types to avoid creating illegal types in the truncate.
56968 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56969 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56970 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56971 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56972 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56973 }
56974
56975 // Shrink if we remove an illegal type.
56976 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56977 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56978 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56979 }
56980 }
56981 }
56982
56983 // Try to move splat adders from the index operand to the base
56984 // pointer operand. Taking care to multiply by the scale. We can only do
56985 // this when index element type is the same as the pointer type.
56986 // Otherwise we need to be sure the math doesn't wrap before the scale.
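// Illustrative example: gather(base B, index (add I, splat(K)), scale S)
// becomes gather(base B + K*S, index I, scale S) when K is a constant splat,
// or gather(base B + K, index I, scale 1) for a non-constant splat.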
56987 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56988 isa<ConstantSDNode>(Scale)) {
56989 uint64_t ScaleAmt = Scale->getAsZExtVal();
56990
56991 for (unsigned I = 0; I != 2; ++I)
56992 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56993 BitVector UndefElts;
56994 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56995 if (UndefElts.none()) {
56996 // If the splat value is constant we can add the scaled splat value
56997 // to the existing base.
56998 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56999 APInt Adder = C->getAPIntValue() * ScaleAmt;
57000 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
57001 DAG.getConstant(Adder, DL, PtrVT));
57002 SDValue NewIndex = Index.getOperand(1 - I);
57003 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57004 }
57005 // For non-constant cases, limit this to non-scaled cases.
57006 if (ScaleAmt == 1) {
57007 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
57008 SDValue NewIndex = Index.getOperand(1 - I);
57009 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57010 }
57011 }
57012 }
57013 // It's also possible the base is just a constant. In that case, just
57014 // replace it with 0 and move the displacement into the index.
57015 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
57016 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
57017 // Combine the constant build_vector and the constant base.
57018 Splat =
57019 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
57020 // Add to the other half of the original Index add.
57021 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
57022 Index.getOperand(1 - I), Splat);
57023 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
57024 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57025 }
57026 }
57027 }
57028
57029 if (DCI.isBeforeLegalizeOps()) {
57030 // Make sure the index is either i32 or i64
57031 if (IndexWidth != 32 && IndexWidth != 64) {
57032 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57033 IndexVT = IndexVT.changeVectorElementType(EltVT);
57034 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57035 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57036 }
57037 }
57038
57039 // With vector masks we only demand the upper bit of the mask.
57040 SDValue Mask = GorS->getMask();
57041 if (Mask.getScalarValueSizeInBits() != 1) {
57042 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57043 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57044 if (N->getOpcode() != ISD::DELETED_NODE)
57045 DCI.AddToWorklist(N);
57046 return SDValue(N, 0);
57047 }
57048 }
57049
57050 return SDValue();
57051}
57052
57053// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57054 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
57055 const X86Subtarget &Subtarget) {
57056 SDLoc DL(N);
57057 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57058 SDValue EFLAGS = N->getOperand(1);
57059
57060 // Try to simplify the EFLAGS and condition code operands.
57061 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57062 return getSETCC(CC, Flags, DL, DAG);
57063
57064 return SDValue();
57065}
57066
57067/// Optimize branch condition evaluation.
57068 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
57069 const X86Subtarget &Subtarget) {
57070 SDLoc DL(N);
57071 SDValue EFLAGS = N->getOperand(3);
57072 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57073
57074 // Try to simplify the EFLAGS and condition code operands.
57075 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57076 // RAUW them under us.
57077 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57078 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57079 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57080 N->getOperand(1), Cond, Flags);
57081 }
57082
57083 return SDValue();
57084}
57085
57086// TODO: Could we move this to DAGCombine?
57087 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
57088 SelectionDAG &DAG) {
57089 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57090 // to optimize away the operation when it is applied to a constant.
57091 //
57092 // The general transformation is:
57093 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57094 // AND(VECTOR_CMP(x,y), constant2)
57095 // constant2 = UNARYOP(constant)
57096
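// Illustrative example (assumed types): sint_to_fp(and(setcc(x,y), splat(1)))
// becomes and(setcc(x,y), bitcast(splat(1.0))) plus a bitcast of the result,
// i.e. the conversion is folded into the constant because the compare lanes
// are all-ones or all-zeros.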
57097 // Early exit if this isn't a vector operation, the operand of the
57098 // unary operation isn't a bitwise AND, or if the sizes of the operations
57099 // aren't the same.
57100 EVT VT = N->getValueType(0);
57101 bool IsStrict = N->isStrictFPOpcode();
57102 unsigned NumEltBits = VT.getScalarSizeInBits();
57103 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57104 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57105 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57106 VT.getSizeInBits() != Op0.getValueSizeInBits())
57107 return SDValue();
57108
57109 // Now check that the other operand of the AND is a constant. We could
57110 // make the transformation for non-constant splats as well, but it's unclear
57111 // that would be a benefit as it would not eliminate any operations, just
57112 // perform one more step in scalar code before moving to the vector unit.
57113 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57114 // Bail out if the vector isn't a constant.
57115 if (!BV->isConstant())
57116 return SDValue();
57117
57118 // Everything checks out. Build up the new and improved node.
57119 SDLoc DL(N);
57120 EVT IntVT = BV->getValueType(0);
57121 // Create a new constant of the appropriate type for the transformed
57122 // DAG.
57123 SDValue SourceConst;
57124 if (IsStrict)
57125 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57126 {N->getOperand(0), SDValue(BV, 0)});
57127 else
57128 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57129 // The AND node needs bitcasts to/from an integer vector type around it.
57130 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57131 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57132 MaskConst);
57133 SDValue Res = DAG.getBitcast(VT, NewAnd);
57134 if (IsStrict)
57135 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57136 return Res;
57137 }
57138
57139 return SDValue();
57140}
57141
57142/// If we are converting a value to floating-point, try to replace scalar
57143/// truncate of an extracted vector element with a bitcast. This tries to keep
57144/// the sequence on XMM registers rather than moving between vector and GPRs.
57145 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
57146 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57147 // to allow being called by any similar cast opcode.
57148 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57149 SDValue Trunc = N->getOperand(0);
57150 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57151 return SDValue();
57152
57153 SDValue ExtElt = Trunc.getOperand(0);
57154 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57155 !isNullConstant(ExtElt.getOperand(1)))
57156 return SDValue();
57157
57158 EVT TruncVT = Trunc.getValueType();
57159 EVT SrcVT = ExtElt.getValueType();
57160 unsigned DestWidth = TruncVT.getSizeInBits();
57161 unsigned SrcWidth = SrcVT.getSizeInBits();
57162 if (SrcWidth % DestWidth != 0)
57163 return SDValue();
57164
57165 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
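// e.g. (assuming little-endian lane order): for X : v2i64, the i32 truncate
// of element 0 is the same value as element 0 of bitcast(X) : v4i32, so the
// conversion can stay on XMM registers without a GPR round trip.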
57166 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57167 unsigned VecWidth = SrcVecVT.getSizeInBits();
57168 unsigned NumElts = VecWidth / DestWidth;
57169 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57170 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57171 SDLoc DL(N);
57172 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57173 BitcastVec, ExtElt.getOperand(1));
57174 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57175}
57176
57177 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
57178 const X86Subtarget &Subtarget) {
57179 bool IsStrict = N->isStrictFPOpcode();
57180 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57181 EVT VT = N->getValueType(0);
57182 EVT InVT = Op0.getValueType();
57183
57184 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57185 // for it. Therefore, for type sizes equal to or smaller than 32 bits, just go with i32.
57186 // if hasFP16 support:
57187 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57188 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57189 // else
57190 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57191 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
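// Illustrative example: v8i8 -> v8f16 on an FP16 target is handled here by
// zero-extending to v8i16 and then doing a signed conversion, which is safe
// because the zero-extended value is always non-negative.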
57192 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57193 unsigned ScalarSize = InVT.getScalarSizeInBits();
57194 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57195 ScalarSize >= 64)
57196 return SDValue();
57197 SDLoc dl(N);
57198 EVT DstVT =
57199 EVT::getVectorVT(*DAG.getContext(),
57200 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57201 : ScalarSize < 32 ? MVT::i32
57202 : MVT::i64,
57203 InVT.getVectorNumElements());
57204 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57205 if (IsStrict)
57206 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57207 {N->getOperand(0), P});
57208 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57209 }
57210
57211 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57212 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57213 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57214 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57215 VT.getScalarType() != MVT::f16) {
57216 SDLoc dl(N);
57217 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57218 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57219
57220 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57221 if (IsStrict)
57222 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57223 {N->getOperand(0), P});
57224 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57225 }
57226
57227 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57228 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57229 // the optimization here.
57230 SDNodeFlags Flags = N->getFlags();
57231 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57232 if (IsStrict)
57233 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57234 {N->getOperand(0), Op0});
57235 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57236 }
57237
57238 return SDValue();
57239}
57240
57241 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
57242 TargetLowering::DAGCombinerInfo &DCI,
57243 const X86Subtarget &Subtarget) {
57244 // First try to optimize away the conversion entirely when it's
57245 // conditionally from a constant. Vectors only.
57246 bool IsStrict = N->isStrictFPOpcode();
57247 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
57248 return Res;
57249
57250 // Now move on to more general possibilities.
57251 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57252 EVT VT = N->getValueType(0);
57253 EVT InVT = Op0.getValueType();
57254
57255 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57256 // for it. Therefore, for type sizes equal to or smaller than 32 bits, just go with i32.
57257 // if hasFP16 support:
57258 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57259 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57260 // else
57261 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57262 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57263 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57264 unsigned ScalarSize = InVT.getScalarSizeInBits();
57265 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57266 ScalarSize >= 64)
57267 return SDValue();
57268 SDLoc dl(N);
57269 EVT DstVT =
57270 EVT::getVectorVT(*DAG.getContext(),
57271 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57272 : ScalarSize < 32 ? MVT::i32
57273 : MVT::i64,
57274 InVT.getVectorNumElements());
57275 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57276 if (IsStrict)
57277 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57278 {N->getOperand(0), P});
57279 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57280 }
57281
57282 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57283 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57284 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57285 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57286 VT.getScalarType() != MVT::f16) {
57287 SDLoc dl(N);
57288 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57289 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57290 if (IsStrict)
57291 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57292 {N->getOperand(0), P});
57293 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57294 }
57295
57296 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57297 // vectors and scalars, see if we know that the upper bits are all the sign
57298 // bit, in which case we can truncate the input to i32 and convert from that.
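// Illustrative example: a v2i64 input with at least 33 sign bits per element
// fits in i32, so it is truncated to v2i32 (or shuffled into the low lanes
// after type legalization) and converted via the 32-bit path below instead of
// a 64-bit conversion.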
57299 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57300 unsigned BitWidth = InVT.getScalarSizeInBits();
57301 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57302 if (NumSignBits >= (BitWidth - 31)) {
57303 EVT TruncVT = MVT::i32;
57304 if (InVT.isVector())
57305 TruncVT = InVT.changeVectorElementType(TruncVT);
57306 SDLoc dl(N);
57307 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57308 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57309 if (IsStrict)
57310 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57311 {N->getOperand(0), Trunc});
57312 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57313 }
57314 // If we're after legalize and the type is v2i32 we need to shuffle and
57315 // use CVTSI2P.
57316 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57317 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57318 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57319 { 0, 2, -1, -1 });
57320 if (IsStrict)
57321 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57322 {N->getOperand(0), Shuf});
57323 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57324 }
57325 }
57326
57327 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57328 // a 32-bit target where SSE doesn't support i64->FP operations.
57329 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57330 Op0.getOpcode() == ISD::LOAD) {
57331 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57332
57333 // This transformation is not supported if the result type is f16 or f128.
57334 if (VT == MVT::f16 || VT == MVT::f128)
57335 return SDValue();
57336
57337 // If we have AVX512DQ we can use packed conversion instructions unless
57338 // the VT is f80.
57339 if (Subtarget.hasDQI() && VT != MVT::f80)
57340 return SDValue();
57341
57342 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57343 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57344 std::pair<SDValue, SDValue> Tmp =
57345 Subtarget.getTargetLowering()->BuildFILD(
57346 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57347 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57348 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57349 return Tmp.first;
57350 }
57351 }
57352
57353 if (IsStrict)
57354 return SDValue();
57355
57356 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57357 return V;
57358
57359 return SDValue();
57360}
57361
57363 const X86Subtarget &Subtarget) {
57364 EVT VT = N->getValueType(0);
57365 SDValue Src = N->getOperand(0);
57366 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57367 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57368 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57369
57370 return SDValue();
57371}
57372
57373// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57375 const X86Subtarget &Subtarget) {
57376 if (!Subtarget.hasAVX10_2())
57377 return SDValue();
57378
57379 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57380 EVT SrcVT = N->getOperand(0).getValueType();
57381 EVT DstVT = N->getValueType(0);
57382 SDLoc dl(N);
57383
57384 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57385 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57386
57387 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57388 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57389 N->getOperand(0), V2F32Value);
57390
57391 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57392 if (IsSigned)
57393 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57394
57395 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57396 }
57397 return SDValue();
57398}
57399
57400 static bool needCarryOrOverflowFlag(SDValue Flags) {
57401 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57402
57403 for (const SDNode *User : Flags->users()) {
57404 X86::CondCode CC;
57405 switch (User->getOpcode()) {
57406 default:
57407 // Be conservative.
57408 return true;
57409 case X86ISD::SETCC:
57410 case X86ISD::SETCC_CARRY:
57411 CC = (X86::CondCode)User->getConstantOperandVal(0);
57412 break;
57413 case X86ISD::BRCOND:
57414 case X86ISD::CMOV:
57415 CC = (X86::CondCode)User->getConstantOperandVal(2);
57416 break;
57417 }
57418
57419 switch (CC) {
57420 // clang-format off
57421 default: break;
57422 case X86::COND_A: case X86::COND_AE:
57423 case X86::COND_B: case X86::COND_BE:
57424 case X86::COND_O: case X86::COND_NO:
57425 case X86::COND_G: case X86::COND_GE:
57426 case X86::COND_L: case X86::COND_LE:
57427 return true;
57428 // clang-format on
57429 }
57430 }
57431
57432 return false;
57433}
57434
57435static bool onlyZeroFlagUsed(SDValue Flags) {
57436 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57437
57438 for (const SDNode *User : Flags->users()) {
57439 unsigned CCOpNo;
57440 switch (User->getOpcode()) {
57441 default:
57442 // Be conservative.
57443 return false;
57444 case X86ISD::SETCC:
57445 case X86ISD::SETCC_CARRY:
57446 CCOpNo = 0;
57447 break;
57448 case X86ISD::BRCOND:
57449 case X86ISD::CMOV:
57450 CCOpNo = 2;
57451 break;
57452 }
57453
57454 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57455 if (CC != X86::COND_E && CC != X86::COND_NE)
57456 return false;
57457 }
57458
57459 return true;
57460}
57461
57462 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
57463 TargetLowering::DAGCombinerInfo &DCI,
57464 const X86Subtarget &Subtarget) {
57465 // Only handle test patterns.
57466 if (!isNullConstant(N->getOperand(1)))
57467 return SDValue();
57468
57469 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57470 // and use its flags directly.
57471 // TODO: Maybe we should try promoting compares that only use the zero flag
57472 // first if we can prove the upper bits with computeKnownBits?
57473 SDLoc dl(N);
57474 SDValue Op = N->getOperand(0);
57475 EVT VT = Op.getValueType();
57476 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57477
57478 if (SDValue CMP =
57479 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57480 return CMP;
57481
57482 // If we have a constant logical shift that's only used in a comparison
57483 // against zero, turn it into an equivalent AND. This allows turning it into
57484 // a TEST instruction later.
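// Illustrative example: for a 32-bit x, (srl x, 8) compared against zero
// becomes (and x, 0xFFFFFF00) compared against zero, which can later be
// selected as a single TEST instruction.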
57485 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57486 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57487 onlyZeroFlagUsed(SDValue(N, 0))) {
57488 unsigned BitWidth = VT.getSizeInBits();
57489 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57490 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57491 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57492 APInt Mask = Op.getOpcode() == ISD::SRL
57493 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57494 : APInt::getLowBitsSet(BitWidth, MaskBits);
57495 if (Mask.isSignedIntN(32)) {
57496 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57497 DAG.getConstant(Mask, dl, VT));
57498 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57499 DAG.getConstant(0, dl, VT));
57500 }
57501 }
57502 }
57503
57504 // If we're extracting from an AVX512 bool vector and comparing against zero,
57505 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57506 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57507 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57508 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57509 SDValue Src = Op.getOperand(0);
57510 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57511 isNullConstant(Src.getOperand(1)) &&
57512 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57513 SDValue BoolVec = Src.getOperand(0);
57514 unsigned ShAmt = 0;
57515 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57516 ShAmt = BoolVec.getConstantOperandVal(1);
57517 BoolVec = BoolVec.getOperand(0);
57518 }
57519 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57520 EVT VecVT = BoolVec.getValueType();
57521 unsigned BitWidth = VecVT.getVectorNumElements();
57522 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57523 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57524 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57525 Op = DAG.getBitcast(BCVT, BoolVec);
57526 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57527 DAG.getConstant(Mask, dl, BCVT));
57528 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57529 DAG.getConstant(0, dl, BCVT));
57530 }
57531 }
57532 }
57533
57534 // Peek through any zero-extend if we're only testing for a zero result.
57535 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57536 SDValue Src = Op.getOperand(0);
57537 EVT SrcVT = Src.getValueType();
57538 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57539 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57540 DAG.getConstant(0, dl, SrcVT));
57541 }
57542
57543 // Look for a truncate.
57544 if (Op.getOpcode() != ISD::TRUNCATE)
57545 return SDValue();
57546
57547 SDValue Trunc = Op;
57548 Op = Op.getOperand(0);
57549
57550 // See if we can compare with zero against the truncation source,
57551 // which should help using the Z flag from many ops. Only do this for
57552 // i32 truncated op to prevent partial-reg compares of promoted ops.
57553 EVT OpVT = Op.getValueType();
57554 APInt UpperBits =
57555 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
57556 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57557 onlyZeroFlagUsed(SDValue(N, 0))) {
57558 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57559 DAG.getConstant(0, dl, OpVT));
57560 }
57561
57562 // After this the truncate and arithmetic op must have a single use.
57563 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57564 return SDValue();
57565
57566 unsigned NewOpc;
57567 switch (Op.getOpcode()) {
57568 default: return SDValue();
57569 case ISD::AND:
57570 // Skip and with constant. We have special handling for and with immediate
57571 // during isel to generate test instructions.
57572 if (isa<ConstantSDNode>(Op.getOperand(1)))
57573 return SDValue();
57574 NewOpc = X86ISD::AND;
57575 break;
57576 case ISD::OR: NewOpc = X86ISD::OR; break;
57577 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57578 case ISD::ADD:
57579 // If the carry or overflow flag is used, we can't truncate.
57580 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57581 return SDValue();
57582 NewOpc = X86ISD::ADD;
57583 break;
57584 case ISD::SUB:
57585 // If the carry or overflow flag is used, we can't truncate.
57586 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57587 return SDValue();
57588 NewOpc = X86ISD::SUB;
57589 break;
57590 }
57591
57592 // We found an op we can narrow. Truncate its inputs.
57593 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57594 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57595
57596 // Use a X86 specific opcode to avoid DAG combine messing with it.
57597 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57598 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57599
57600 // For AND, keep a CMP so that we can match the test pattern.
57601 if (NewOpc == X86ISD::AND)
57602 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57603 DAG.getConstant(0, dl, VT));
57604
57605 // Return the flags.
57606 return Op.getValue(1);
57607}
57608
57609 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
57610 TargetLowering::DAGCombinerInfo &DCI,
57611 const X86Subtarget &ST) {
57612 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57613 "Expected X86ISD::ADD or X86ISD::SUB");
57614
57615 SDLoc DL(N);
57616 SDValue LHS = N->getOperand(0);
57617 SDValue RHS = N->getOperand(1);
57618 MVT VT = LHS.getSimpleValueType();
57619 bool IsSub = X86ISD::SUB == N->getOpcode();
57620 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57621
57622 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57623 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57624 return CMP;
57625
57626 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57627 if (!N->hasAnyUseOfValue(1)) {
57628 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57629 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57630 }
57631
57632 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57633 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57634 SDValue Ops[] = {N0, N1};
57635 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57636 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57637 SDValue Op(N, 0);
57638 if (Negate) {
57639 // Bail if this is only used by a user of the x86 add/sub.
57640 if (GenericAddSub->hasOneUse() &&
57641 GenericAddSub->user_begin()->isOnlyUserOf(N))
57642 return;
57643 Op = DAG.getNegative(Op, DL, VT);
57644 }
57645 DCI.CombineTo(GenericAddSub, Op);
57646 }
57647 };
57648 MatchGeneric(LHS, RHS, false);
57649 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57650
57651 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57652 // EFLAGS result doesn't change.
57653 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57654 /*ZeroSecondOpOnly*/ true);
57655}
57656
57657 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
57658 SDValue LHS = N->getOperand(0);
57659 SDValue RHS = N->getOperand(1);
57660 SDValue BorrowIn = N->getOperand(2);
57661
57662 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57663 MVT VT = N->getSimpleValueType(0);
57664 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57665 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57666 }
57667
57668 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57669 // iff the flag result is dead.
57670 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57671 !N->hasAnyUseOfValue(1))
57672 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57673 LHS.getOperand(1), BorrowIn);
57674
57675 return SDValue();
57676}
57677
57678// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57679 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
57680 TargetLowering::DAGCombinerInfo &DCI) {
57681 SDValue LHS = N->getOperand(0);
57682 SDValue RHS = N->getOperand(1);
57683 SDValue CarryIn = N->getOperand(2);
57684 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57685 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57686
57687 // Canonicalize constant to RHS.
57688 if (LHSC && !RHSC)
57689 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57690 CarryIn);
57691
57692 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57693 // the result is either zero or one (depending on the input carry bit).
57694 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57695 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57696 // We don't have a good way to replace an EFLAGS use, so only do this when
57697 // dead right now.
57698 SDValue(N, 1).use_empty()) {
57699 SDLoc DL(N);
57700 EVT VT = N->getValueType(0);
57701 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57702 SDValue Res1 = DAG.getNode(
57703 ISD::AND, DL, VT,
57704 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
57705 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57706 DAG.getConstant(1, DL, VT));
57707 return DCI.CombineTo(N, Res1, CarryOut);
57708 }
57709
57710 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57711 // iff the flag result is dead.
57712 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57713 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57714 SDLoc DL(N);
57715 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57716 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57717 DAG.getConstant(0, DL, LHS.getValueType()),
57718 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57719 }
57720
57721 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57722 MVT VT = N->getSimpleValueType(0);
57723 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57724 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57725 }
57726
57727 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57728 // iff the flag result is dead.
57729 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57730 !N->hasAnyUseOfValue(1))
57731 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57732 LHS.getOperand(1), CarryIn);
57733
57734 return SDValue();
57735}
57736
57737 static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
57738 const SDLoc &DL, EVT VT,
57739 const X86Subtarget &Subtarget) {
57740 using namespace SDPatternMatch;
57741
57742 // Example of pattern we try to detect:
57743 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57744 //(add (build_vector (extract_elt t, 0),
57745 // (extract_elt t, 2),
57746 // (extract_elt t, 4),
57747 // (extract_elt t, 6)),
57748 // (build_vector (extract_elt t, 1),
57749 // (extract_elt t, 3),
57750 // (extract_elt t, 5),
57751 // (extract_elt t, 7)))
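// This is the per-lane semantics of PMADDWD: result[i] =
// x[2*i] * y[2*i] + x[2*i+1] * y[2*i+1] on sign-extended i16 inputs, which is
// exactly the add-of-deinterleaved-products shape matched below.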
57752
57753 if (!Subtarget.hasSSE2())
57754 return SDValue();
57755
57756 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57757 VT.getVectorNumElements() < 4 ||
57758 !isPowerOf2_32(VT.getVectorNumElements()))
57759 return SDValue();
57760
57761 SDValue Op0, Op1, Accum;
57766 m_Value(Op1))))))
57767 return SDValue();
57768
57769 // Check if one of Op0,Op1 is of the form:
57770 // (build_vector (extract_elt Mul, 0),
57771 // (extract_elt Mul, 2),
57772 // (extract_elt Mul, 4),
57773 // ...
57774 // the other is of the form:
57775 // (build_vector (extract_elt Mul, 1),
57776 // (extract_elt Mul, 3),
57777 // (extract_elt Mul, 5),
57778 // ...
57779 // and identify Mul.
57780 SDValue Mul;
57781 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57782 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57783 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57784 // TODO: Be more tolerant to undefs.
57785 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57786 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57787 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57788 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57789 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57790 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57791 return SDValue();
57792 // Commutativity of mul allows factors of a product to reorder.
57793 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57794 std::swap(Idx0L, Idx1L);
57795 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57796 std::swap(Idx0H, Idx1H);
57797 // Commutativity of add allows pairs of factors to reorder.
57798 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57799 std::swap(Idx0L, Idx0H);
57800 std::swap(Idx1L, Idx1H);
57801 }
57802 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57803 Idx1H != 2 * i + 3)
57804 return SDValue();
57805 if (!Mul) {
57806 // First time an extract_elt's source vector is visited. Must be a MUL
57807 // with 2X the number of vector elements of the BUILD_VECTOR.
57808 // Both extracts must be from same MUL.
57809 Mul = Vec0L;
57810 if (Mul.getOpcode() != ISD::MUL ||
57811 Mul.getValueType().getVectorNumElements() != 2 * e)
57812 return SDValue();
57813 }
57814 // Check that the extract is from the same MUL previously seen.
57815 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57816 return SDValue();
57817 }
57818
57819 // Check if the Mul source can be safely shrunk.
57820 ShrinkMode Mode;
57821 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57822 Mode == ShrinkMode::MULU16)
57823 return SDValue();
57824
57825 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57826 VT.getVectorNumElements() * 2);
57827 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57828 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57829
57830 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57831 ArrayRef<SDValue> Ops) {
57832 EVT InVT = Ops[0].getValueType();
57833 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57834 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57835 InVT.getVectorNumElements() / 2);
57836 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57837 };
57838 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57839 if (Accum)
57840 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57841 return R;
57842}
57843
57844// Attempt to turn this pattern into PMADDWD.
57845// (add (mul (sext (build_vector)), (sext (build_vector))),
57846// (mul (sext (build_vector)), (sext (build_vector)))
57847 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
57848 const SDLoc &DL, EVT VT,
57849 const X86Subtarget &Subtarget) {
57850 using namespace SDPatternMatch;
57851
57852 if (!Subtarget.hasSSE2())
57853 return SDValue();
57854
57855 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57856 VT.getVectorNumElements() < 4 ||
57857 !isPowerOf2_32(VT.getVectorNumElements()))
57858 return SDValue();
57859
57860 // All inputs need to be sign extends.
57861 // TODO: Support ZERO_EXTEND from known positive?
57862 SDValue N00, N01, N10, N11;
57863 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57864 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57865 return SDValue();
57866
57867 // Must be extending from vXi16.
57868 EVT InVT = N00.getValueType();
57869 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57870 N10.getValueType() != InVT || N11.getValueType() != InVT)
57871 return SDValue();
57872
57873 // All inputs should be build_vectors.
57874 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57875 N01.getOpcode() != ISD::BUILD_VECTOR ||
57876 N10.getOpcode() != ISD::BUILD_VECTOR ||
57877 N11.getOpcode() != ISD::BUILD_VECTOR)
57878 return SDValue();
57879
57880 // For each element, we need to ensure we have an odd element from one vector
57881 // multiplied by the odd element of another vector and the even element from
57882 // one of the same vectors being multiplied by the even element from the
57883 // other vector. So we need to make sure for each element i, this operation
57884 // is being performed:
57885 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57886 SDValue In0, In1;
57887 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57888 SDValue N00Elt = N00.getOperand(i);
57889 SDValue N01Elt = N01.getOperand(i);
57890 SDValue N10Elt = N10.getOperand(i);
57891 SDValue N11Elt = N11.getOperand(i);
57892 // TODO: Be more tolerant to undefs.
57893 SDValue N00In, N01In, N10In, N11In;
57894 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57895 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57896 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57897 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57898 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57899 return SDValue();
57900 // Add is commutative so indices can be reordered.
57901 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57902 std::swap(IdxN00, IdxN10);
57903 std::swap(IdxN01, IdxN11);
57904 }
57905 // N0 indices must be the even element. N1 indices must be the next odd element.
57906 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57907 IdxN11 != 2 * i + 1)
57908 return SDValue();
57909
57910 // First time we find an input capture it.
57911 if (!In0) {
57912 In0 = N00In;
57913 In1 = N01In;
57914
57915 // The input vectors must be at least as wide as the output.
57916 // If they are larger than the output, we extract subvector below.
57917 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57918 In1.getValueSizeInBits() < VT.getSizeInBits())
57919 return SDValue();
57920 }
57921 // Mul is commutative so the input vectors can be in any order.
57922 // Canonicalize to make the compares easier.
57923 if (In0 != N00In)
57924 std::swap(N00In, N01In);
57925 if (In0 != N10In)
57926 std::swap(N10In, N11In);
57927 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57928 return SDValue();
57929 }
57930
57931 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57932 ArrayRef<SDValue> Ops) {
57933 EVT OpVT = Ops[0].getValueType();
57934 assert(OpVT.getScalarType() == MVT::i16 &&
57935 "Unexpected scalar element type");
57936 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57937 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57938 OpVT.getVectorNumElements() / 2);
57939 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57940 };
57941
57942 // If the output is narrower than an input, extract the low part of the input
57943 // vector.
57944 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57945 VT.getVectorNumElements() * 2);
57946 if (OutVT16.bitsLT(In0.getValueType())) {
57947 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57948 DAG.getVectorIdxConstant(0, DL));
57949 }
57950 if (OutVT16.bitsLT(In1.getValueType())) {
57951 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57952 DAG.getVectorIdxConstant(0, DL));
57953 }
57954 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57955 PMADDBuilder);
57956}
57957
57958// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57959 // If the upper element in each pair of both VPMADDWD is zero then we can merge
57960// the operand elements and use the implicit add of VPMADDWD.
57961// TODO: Add support for VPMADDUBSW (which isn't commutable).
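// Illustrative sketch: each VPMADDWD lane computes a[2i]*b[2i] + a[2i+1]*b[2i+1];
// when the odd-index terms of both nodes are known zero, interleaving the even
// i16 elements of (X,Z) and of (Y,W) lets a single VPMADDWD's implicit add
// produce the same result as ADD(VPMADDWD(X,Y), VPMADDWD(Z,W)).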
57962 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
57963 const SDLoc &DL, EVT VT) {
57964 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57965 return SDValue();
57966
57967 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57968 if (VT.getSizeInBits() > 128)
57969 return SDValue();
57970
57971 unsigned NumElts = VT.getVectorNumElements();
57972 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57973 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
57974 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57975
57976 bool Op0HiZero =
57977 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57978 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57979 bool Op1HiZero =
57980 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57981 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57982
57983 // TODO: Check for zero lower elements once we have actual codegen that
57984 // creates them.
57985 if (!Op0HiZero || !Op1HiZero)
57986 return SDValue();
57987
57988 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57989 SmallVector<int> Mask;
57990 for (int i = 0; i != (int)NumElts; ++i) {
57991 Mask.push_back(2 * i);
57992 Mask.push_back(2 * (i + NumElts));
57993 }
57994
57995 SDValue LHS =
57996 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57997 SDValue RHS =
57998 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57999 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
58000}
58001
58002/// CMOV of constants requires materializing constant operands in registers.
58003/// Try to fold those constants into an 'add' instruction to reduce instruction
58004 /// count. We do this with CMOV rather than the generic 'select' because there are
58005/// earlier folds that may be used to turn select-of-constants into logic hacks.
58006 static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
58007 SelectionDAG &DAG,
58008 const X86Subtarget &Subtarget) {
58009 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
58010 // better because we eliminate 1-2 instructions. This transform is still
58011 // an improvement without zero operands because we trade 2 move constants and
58012 // 1 add for 2 adds (LEA) as long as the constants can be represented as
58013 // immediate asm operands (fit in 32-bits).
58014 auto isSuitableCmov = [](SDValue V) {
58015 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
58016 return false;
58017 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
58018 !isa<ConstantSDNode>(V.getOperand(1)))
58019 return false;
58020 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
58021 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
58022 V.getConstantOperandAPInt(1).isSignedIntN(32));
58023 };
58024
58025 // Match an appropriate CMOV as the first operand of the add.
58026 SDValue Cmov = N->getOperand(0);
58027 SDValue OtherOp = N->getOperand(1);
58028 if (!isSuitableCmov(Cmov))
58029 std::swap(Cmov, OtherOp);
58030 if (!isSuitableCmov(Cmov))
58031 return SDValue();
58032
58033 // Don't remove a load folding opportunity for the add. That would neutralize
58034 // any improvements from removing constant materializations.
58035 if (X86::mayFoldLoad(OtherOp, Subtarget))
58036 return SDValue();
58037
58038 EVT VT = N->getValueType(0);
58039 SDValue FalseOp = Cmov.getOperand(0);
58040 SDValue TrueOp = Cmov.getOperand(1);
58041
58042 // We will push the add through the select, but we can potentially do better
58043 // if we know there is another add in the sequence and this is pointer math.
58044 // In that case, we can absorb an add into the trailing memory op and avoid
58045 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58046 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58047 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58048 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58049 all_of(N->users(), [&](SDNode *Use) {
58050 auto *MemNode = dyn_cast<MemSDNode>(Use);
58051 return MemNode && MemNode->getBasePtr().getNode() == N;
58052 })) {
58053 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58054 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58055 // it is possible that choosing op1 might be better.
58056 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58057 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58058 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58059 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58060 Cmov.getOperand(2), Cmov.getOperand(3));
58061 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58062 }
58063
58064 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58065 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58066 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58067 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58068 Cmov.getOperand(3));
58069}
58070
58071 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58072 // when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
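// Illustrative note: VPMADD52L adds the low 52 bits of the 104-bit product
// x*y to the accumulator in each i64 lane, so when x, y and x*y are all known
// to fit in 52 bits the result equals the plain add-of-mul being replaced.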
58074 EVT VT, const X86Subtarget &Subtarget) {
58075 using namespace SDPatternMatch;
58076 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58077 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58078 return SDValue();
58079
58080 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58081 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58082 VT.getSizeInBits() < 512)
58083 return SDValue();
58084
58085 const auto TotalSize = VT.getSizeInBits();
58086 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58087 return SDValue();
58088
58089 SDValue X, Y, Acc;
58090 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58091 return SDValue();
58092
58093 KnownBits KnownX = DAG.computeKnownBits(X);
58094 if (KnownX.countMinLeadingZeros() < 12)
58095 return SDValue();
58096 KnownBits KnownY = DAG.computeKnownBits(Y);
58097 if (KnownY.countMinLeadingZeros() < 12)
58098 return SDValue();
58099 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58100 if (KnownMul.countMinLeadingZeros() < 12)
58101 return SDValue();
58102
58103 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58104 ArrayRef<SDValue> SubOps) {
58105 EVT SubVT = SubOps[0].getValueType();
58106 assert(SubVT.getScalarSizeInBits() == 64 &&
58107 "Unexpected element size, only supports 64bit size");
58108 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58109 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58110 };
58111
58112 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58113 /*CheckBWI*/ false,
58114 /*AllowAVX512*/ Subtarget.hasIFMA());
58115}
58116
58117 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
58118                           TargetLowering::DAGCombinerInfo &DCI,
58119                           const X86Subtarget &Subtarget) {
58120 using namespace SDPatternMatch;
58121 EVT VT = N->getValueType(0);
58122 SDValue Op0 = N->getOperand(0);
58123 SDValue Op1 = N->getOperand(1);
58124 SDLoc DL(N);
58125
58126 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58127 return Select;
58128
58129 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58130 return MAdd;
58131 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58132 return MAdd;
58133 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58134 return MAdd;
58135
58136 // Try to synthesize horizontal adds from adds of shuffles.
58137 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58138 return V;
58139
58140 // Prefer VSHLI to reduce uses, X86FixupInstTunings may revert this depending
58141 // on the scheduler model. Limit multiple users to AVX+ targets to prevent
58142 // introducing extra register moves.
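// Note that (add x, x) is equivalent to (shl x, 1), which VSHLI expresses
// directly.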
58143 if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
58144 if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
58145       return getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT.getSimpleVT(),
58146                                         Op0, 1, DAG);
58147
58148 // Canonicalize hidden LEA pattern:
58149 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58150 // iff c < 4
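// Restricting to c < 4 keeps (shl x, c) expressible as an LEA/SIB scale of
// 1, 2, 4 or 8, so the reassociated (add (shl x, c), z) can still be selected
// as a scaled-index LEA.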
58151 if (VT == MVT::i32 || VT == MVT::i64) {
58152 SDValue Y, Z, Shift;
58153 APInt Amt;
58154 if (sd_match(
58155             N, m_Add(m_OneUse(m_Sub(m_AllOf(m_Value(Shift),
58156                                             m_Shl(m_Value(), m_ConstInt(Amt))),
58157 m_Value(Y))),
58158 m_Value(Z))) &&
58159 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58160 return DAG.getNode(ISD::SUB, DL, VT,
58161 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58162 }
58163 }
58164
58165 SDValue X, Y;
58166
58167 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58168 // iff X and Y won't overflow.
58169 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58170       sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
58171       DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58172 MVT OpVT = X.getSimpleValueType();
58173 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58174 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58175 getZeroVector(OpVT, Subtarget, DAG, DL));
58176 }
58177
58178 if (VT.isVector()) {
58179 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58180                                   VT.getVectorElementCount());
58181
58182 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58183 // (sub Y, (sext (vXi1 X))).
58184 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58185 // in generic DAG combine without a legal type check, but adding this there
58186 // caused regressions.
58187 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58188         sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
58189                           m_Value(Y)))) {
58190 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58191 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58192 }
58193
58194 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58195 // canonicalisation as we don't have good vXi8 shifts.
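// For i8 elements, (srl Y, 7) is just the sign bit (0 or 1), while
// (setgt 0, Y) is 0 or -1 for the same elements, so subtracting the
// sign-extended compare adds the identical 0/+1 without needing a vXi8 shift.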
58196 if (VT.getScalarType() == MVT::i8 &&
58197       sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58198       SDValue Cmp =
58199 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58200 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58201 }
58202 }
58203
58204   // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58205 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58206 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58207 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58208 if (sd_match(N, m_Add(m_Value(Accum),
58211 m_Value(Lo1)),
58213 m_Value(Hi1)))))) {
58214 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58215 concatSubVectors(Lo0, Hi0, DAG, DL),
58216 concatSubVectors(Lo1, Hi1, DAG, DL));
58217 }
58218 }
58219
58220 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58221 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58222 X86::isZeroNode(Op0.getOperand(1))) {
58223 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58224 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58225 Op0.getOperand(0), Op0.getOperand(2));
58226 }
58227
58228 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58229 return IFMA52;
58230
58231 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58232}
58233
58234// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58235// condition comes from the subtract node that produced -X. This matches the
58236// cmov expansion for absolute value. By swapping the operands we convert abs
58237// to nabs.
58238static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58239 SelectionDAG &DAG) {
58240 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58241 return SDValue();
58242
58243 SDValue Cond = N1.getOperand(3);
58244 if (Cond.getOpcode() != X86ISD::SUB)
58245 return SDValue();
58246 assert(Cond.getResNo() == 1 && "Unexpected result number");
58247
58248 SDValue FalseOp = N1.getOperand(0);
58249 SDValue TrueOp = N1.getOperand(1);
58250   X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58251
58252 // ABS condition should come from a negate operation.
58253 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58254 isNullConstant(Cond.getOperand(0))) {
58255 // Get the X and -X from the negate.
58256 SDValue NegX = Cond.getValue(0);
58257 SDValue X = Cond.getOperand(1);
58258
58259 // Cmov operands should be X and NegX. Order doesn't matter.
58260 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58261 return SDValue();
58262
58263 // Build a new CMOV with the operands swapped.
58264 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58265 N1.getOperand(2), Cond);
58266 // Convert sub to add.
58267 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58268 }
58269
58270 // Handle ABD special case:
58271 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58272 // ABD condition should come from a pair of matching subtracts.
58273 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58274 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58275 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58276 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58277 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58278 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58279 // Build a new CMOV with the operands swapped.
58280 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58281 Cond);
58282 }
58283
58284 return SDValue();
58285}
58286
58287 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58288   SDValue Op0 = N->getOperand(0);
58289 SDValue Op1 = N->getOperand(1);
58290
58291 // (sub C (zero_extend (setcc)))
58292 // =>
58293 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58294 // Don't disturb (sub 0 setcc), which is easily done with neg.
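// E.g. (sub 5, (zext (setcc c))) becomes (add (zext (setcc !c)), 4): for a
// 0/1 value b, 5 - b == (1 - b) + 4.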
58295 EVT VT = N->getValueType(0);
58296 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58297 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58298 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58299 Op1.getOperand(0).hasOneUse()) {
58300 SDValue SetCC = Op1.getOperand(0);
58301     X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58302     X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58303     APInt NewImm = Op0C->getAPIntValue() - 1;
58304 SDLoc DL(Op1);
58305 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58306 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58307 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58308 DAG.getConstant(NewImm, DL, VT));
58309 }
58310
58311 return SDValue();
58312}
58313
58314 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58315   if (N->getConstantOperandVal(3) != X86::COND_NE)
58316 return SDValue();
58317
58318 SDValue Sub = N->getOperand(4);
58319 if (Sub.getOpcode() != X86ISD::SUB)
58320 return SDValue();
58321
58322 SDValue Op1 = Sub.getOperand(1);
58323
58324 if (!X86::isZeroNode(Sub.getOperand(0)))
58325 return SDValue();
58326
58327 SDLoc DL(N);
58328 SmallVector<SDValue, 5> Ops(N->op_values());
58329 if (Op1.getOpcode() == X86ISD::SETCC) {
58330 // res, flags2 = sub 0, (setcc cc, flag)
58331 // cload/cstore ..., cond_ne, flag2
58332 // ->
58333 // cload/cstore cc, flag
58334 Ops[3] = Op1.getOperand(0);
58335 Ops[4] = Op1.getOperand(1);
58336 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58337 SDValue Src = Op1;
58338 SDValue Op10 = Op1.getOperand(0);
58339 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58340 // res, flags2 = sub 0, (and (xor X, -1), Y)
58341 // cload/cstore ..., cond_ne, flag2
58342 // ->
58343 // res, flags2 = sub 0, (and X, Y)
58344 // cload/cstore ..., cond_e, flag2
58345 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58346 Op1.getOperand(1));
58347 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58348 }
58349 // res, flags2 = sub 0, (and X, Y)
58350 // cload/cstore ..., cc, flag2
58351 // ->
58352 // res, flags2 = cmp (and X, Y), 0
58353 // cload/cstore ..., cc, flag2
58354 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58355 } else {
58356 return SDValue();
58357 }
58358
58359 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58360 cast<MemSDNode>(N)->getMemoryVT(),
58361 cast<MemSDNode>(N)->getMemOperand());
58362}
58363
58364 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58365                           TargetLowering::DAGCombinerInfo &DCI,
58366                           const X86Subtarget &Subtarget) {
58367 EVT VT = N->getValueType(0);
58368 SDValue Op0 = N->getOperand(0);
58369 SDValue Op1 = N->getOperand(1);
58370 SDLoc DL(N);
58371
58372 auto IsNonOpaqueConstant = [&](SDValue Op) {
58373     return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58374                                                      /*AllowOpaques*/ false);
58375 };
58376
58377 // X86 can't encode an immediate LHS of a sub. See if we can push the
58378 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58379 // one use and a constant, invert the immediate, saving one register.
58380 // However, ignore cases where C1 is 0, as those will become a NEG.
58381 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58382 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58383 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58384 Op1->hasOneUse()) {
58385 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58386 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58387 SDValue NewAdd =
58388 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58389 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58390 }
58391
58392 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58393 return V;
58394
58395 // Try to synthesize horizontal subs from subs of shuffles.
58396 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58397 return V;
58398
58399 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58400 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58401 X86::isZeroNode(Op1.getOperand(1))) {
58402 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58403 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58404 Op1.getOperand(0), Op1.getOperand(2));
58405 }
58406
58407 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58408 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58409 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58410 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58411 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58412 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58413 Op1.getOperand(1), Op1.getOperand(2));
58414 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58415 }
58416
58417 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58418 return V;
58419
58420 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58421 return V;
58422
58423 return combineSubSetcc(N, DAG);
58424}
58425
58426 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58427                                     const X86Subtarget &Subtarget) {
58428 unsigned Opcode = N->getOpcode();
58429 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58430 "Unknown PCMP opcode");
58431
58432 SDValue LHS = N->getOperand(0);
58433 SDValue RHS = N->getOperand(1);
58434 MVT VT = N->getSimpleValueType(0);
58435 unsigned EltBits = VT.getScalarSizeInBits();
58436 unsigned NumElts = VT.getVectorNumElements();
58437 SDLoc DL(N);
58438
58439 if (LHS == RHS)
58440 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58441 : DAG.getConstant(0, DL, VT);
58442
58443 // Constant Folding.
58444 // PCMPEQ(X,UNDEF) -> UNDEF
58445 // PCMPGT(X,UNDEF) -> 0
58446 // PCMPGT(UNDEF,X) -> 0
58447 APInt LHSUndefs, RHSUndefs;
58448 SmallVector<APInt> LHSBits, RHSBits;
58449 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58450 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58451 APInt Ones = APInt::getAllOnes(EltBits);
58452 APInt Zero = APInt::getZero(EltBits);
58453 SmallVector<APInt> Results(NumElts);
58454 for (unsigned I = 0; I != NumElts; ++I) {
58455 if (Opcode == X86ISD::PCMPEQ) {
58456 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58457 } else {
58458 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58459 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58460 }
58461 }
58462 if (Opcode == X86ISD::PCMPEQ)
58463 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58464 return getConstVector(Results, VT, DAG, DL);
58465 }
58466
58467 return SDValue();
58468}
58469
58470// Helper to determine if we can convert an integer comparison to a float
58471 // comparison by casting the operands.
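// E.g. f32 carries 24 bits of precision, so i32 operands with at most 24
// significant bits convert to float exactly and the integer compare can be
// performed as a float compare instead.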
58472static std::optional<unsigned>
58473CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58474 unsigned NumSignificantBitsRHS) {
58475 MVT SVT = VT.getScalarType();
58476 assert(SVT == MVT::f32 && "Only tested for float so far");
58477 const fltSemantics &Sem = SVT.getFltSemantics();
58478 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58479 "Only PCMPEQ/PCMPGT currently supported");
58480
58481 // TODO: Handle bitcastable integers.
58482
58483 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58484 // a fp value.
58485 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58486 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58487 return ISD::SINT_TO_FP;
58488
58489 return std::nullopt;
58490}
58491
58492/// Helper that combines an array of subvector ops as if they were the operands
58493/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58494/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58495 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58496                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58497                                       const X86Subtarget &Subtarget,
58498 unsigned Depth) {
58499 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58500 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58501
58502 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58503 return DAG.getUNDEF(VT);
58504
58505 if (llvm::all_of(Ops, [](SDValue Op) {
58506 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58507 }))
58508 return getZeroVector(VT, Subtarget, DAG, DL);
58509
58510   if (Depth >= SelectionDAG::MaxRecursionDepth)
58511     return SDValue(); // Limit search depth.
58512
58513 SDValue Op0 = Ops[0];
58514 bool IsSplat = llvm::all_equal(Ops);
58515 unsigned NumOps = Ops.size();
58516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58517 LLVMContext &Ctx = *DAG.getContext();
58518
58519 // Repeated subvectors.
58520 if (IsSplat &&
58521 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58522 // If this broadcast is inserted into both halves, use a larger broadcast.
58523 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58524 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58525
58526 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58527 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58528 (Subtarget.hasAVX2() ||
58529          X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58530                                               VT.getScalarType(), Subtarget)))
58531 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58532 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58533 Op0.getOperand(0),
58534 DAG.getVectorIdxConstant(0, DL)));
58535
58536 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58537 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58538 (Subtarget.hasAVX2() ||
58539 (EltSizeInBits >= 32 &&
58540 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58541 Op0.getOperand(0).getValueType() == VT.getScalarType())
58542 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58543
58544 // concat_vectors(extract_subvector(splat(x)),
58545 // extract_subvector(splat(x))) -> splat(x)
58546 // concat_vectors(extract_subvector(subv_broadcast(x)),
58547 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58548 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58549 Op0.getOperand(0).getValueType() == VT) {
58550 SDValue SrcVec = Op0.getOperand(0);
58551 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58552 return SrcVec;
58553 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58554 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58555 return SrcVec;
58556 }
58557
58558 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58559 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58560 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58561 return DAG.getNode(Op0.getOpcode(), DL, VT,
58562                          DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58563                                      Op0.getOperand(0), Op0.getOperand(0)),
58564 Op0.getOperand(1));
58565 }
58566
58567 // TODO: This should go in combineX86ShufflesRecursively eventually.
58568 if (NumOps == 2) {
58569 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58570 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58571 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58572         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58573       EVT SrcVT0 = Src0.getOperand(0).getValueType();
58574 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58575 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58576 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58577 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58578 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58579 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58580 // Only concat of subvector high halves which vperm2x128 is best at or if
58581 // it should fold into a subvector broadcast.
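// The vperm2x128 immediate selects one 128-bit half per result lane: selector
// values 0/1 pick the low/high half of the first source and 2/3 the low/high
// half of the second, so e.g. 0x31 concatenates the high halves of both
// sources.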
58582 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58583 SrcVT1.is256BitVector()) {
58584 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58585 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58586 "Bad subvector index");
58587 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58588 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58589 unsigned Index = 0;
58590 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58591 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58592 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58593 DAG.getBitcast(VT, Src0.getOperand(0)),
58594 DAG.getBitcast(VT, Src1.getOperand(0)),
58595 DAG.getTargetConstant(Index, DL, MVT::i8));
58596 }
58597 }
58598 // Widen extract_subvector
58599 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58600 // --> extract_subvector(x,lo)
58601 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58602 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58603 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58604 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58605 return DAG.getBitcast(VT,
58606                               extractSubVector(Src0.getOperand(0),
58607                                                Src0.getConstantOperandVal(1),
58608 DAG, DL, VT.getSizeInBits()));
58609 }
58610 }
58611 }
58612
58613 // Repeated opcode.
58614 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58615 // but it currently struggles with different vector widths.
58616 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58617 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58618 })) {
58619 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58620       SmallVector<SDValue> Subs;
58621       for (SDValue SubOp : SubOps)
58622 Subs.push_back(SubOp.getOperand(I));
58623 // Attempt to peek through bitcasts and concat the original subvectors.
58624 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58625 if (SubVT.isSimple() && SubVT.isVector()) {
58626 MVT ConcatVT =
58628 SubVT.getVectorElementCount() * Subs.size());
58629 for (SDValue &Sub : Subs)
58630 Sub = DAG.getBitcast(SubVT, Sub);
58631 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58632 Subtarget, Depth + 1))
58633 return DAG.getBitcast(VT, ConcatSrc);
58634 return DAG.getBitcast(
58635 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58636 }
58637 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58638 };
58639 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58640 bool AllConstants = true;
58641 bool AllSubs = true;
58642 unsigned VecSize = VT.getSizeInBits();
58643 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58644 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58645 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58646 }))
58647 return true;
58648 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58649 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58650 unsigned SubSize = BC.getValueSizeInBits();
58651 unsigned EltSize = BC.getScalarValueSizeInBits();
58652 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58653                         ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58654         AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58655 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58656 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58657 }
58658 return AllConstants || AllSubs;
58659 };
58660 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58661 bool AllConstants = true;
58662       SmallVector<SDValue> Subs;
58663       for (SDValue SubOp : SubOps) {
58664 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58665 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58666                         ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58667         Subs.push_back(SubOp.getOperand(I));
58668 }
58669 if (AllConstants)
58670 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58671 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58672 };
58673
58674 unsigned Opcode = Op0.getOpcode();
58675 switch (Opcode) {
58676 case ISD::BITCAST: {
58677 // TODO: Support AVX1/AVX2 bitcasts.
58679 for (SDValue SubOp : Ops)
58680 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58681 EVT InnerVT = SubOps[0].getValueType();
58682 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58683 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58684 (Subtarget.hasBWI() ||
58685 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58686 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58687 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58688 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58689 return Op.getValueType() == InnerVT;
58690 })) {
58691 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58692 MVT ConcatVT = MVT::getVectorVT(
58693 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58694 if (SDValue ConcatSrc = combineConcatVectorOps(
58695 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58696 return DAG.getBitcast(VT, ConcatSrc);
58697 }
58698 break;
58699 }
58700 case ISD::VECTOR_SHUFFLE: {
58701 // TODO: Generalize NumOps support.
58702 if (!IsSplat && NumOps == 2 &&
58703 ((VT.is256BitVector() &&
58704 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58705 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58706 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58707 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58708 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58709 if (Concat0 || Concat1 ||
58710 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58711 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58712 Subtarget.hasVBMI())) {
58713 int NumSubElts = Op0.getValueType().getVectorNumElements();
58714 SmallVector<int> NewMask;
58715 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58716 M = M >= NumSubElts ? M + NumSubElts : M;
58717 NewMask.push_back(M);
58718 }
58719 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58720 if (0 <= M)
58721 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58722 NewMask.push_back(M);
58723 }
58724 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58725 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58726 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58727 }
58728 }
58729 break;
58730 }
58731 case X86ISD::VBROADCAST: {
58732 // TODO: 512-bit VBROADCAST concatenation.
58733 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58734 return Op.getOperand(0).getValueType().is128BitVector();
58735 })) {
58736 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58737 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58738 ConcatSubOperand(VT, Ops, 0),
58739 ConcatSubOperand(VT, Ops, 0));
58740 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58741 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58742 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58743                                                : X86ISD::PSHUFD,
58744                            DL, VT, ConcatSubOperand(VT, Ops, 0),
58745 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58746 }
58747 break;
58748 }
58749 case X86ISD::MOVDDUP:
58750 case X86ISD::MOVSHDUP:
58751 case X86ISD::MOVSLDUP: {
58752 if (!IsSplat && (VT.is256BitVector() ||
58753 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58754 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58755 break;
58756 }
58757 case X86ISD::SHUFP: {
58758 if (!IsSplat &&
58759 (VT == MVT::v8f32 ||
58760 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58761 llvm::all_of(Ops, [Op0](SDValue Op) {
58762 return Op.getOperand(2) == Op0.getOperand(2);
58763 })) {
58764 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58765 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58766 if (Concat0 || Concat1)
58767 return DAG.getNode(Opcode, DL, VT,
58768 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58769 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58770 Op0.getOperand(2));
58771 }
58772 break;
58773 }
58774 case X86ISD::UNPCKH:
58775 case X86ISD::UNPCKL: {
58776 // TODO: UNPCK should use CombineSubOperand
58777 // Don't concatenate build_vector patterns.
58778 if (!IsSplat &&
58779 ((VT.is256BitVector() &&
58780 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58781 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58782 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58783 none_of(Ops, [](SDValue Op) {
58784 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58785                    ISD::BUILD_VECTOR ||
58786                peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58787                    ISD::BUILD_VECTOR;
58788 })) {
58789 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58790 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58791 if (Concat0 || Concat1 ||
58792 (Subtarget.hasInt256() && EltSizeInBits == 64))
58793 return DAG.getNode(Opcode, DL, VT,
58794 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58795 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58796 }
58797 break;
58798 }
58799 case X86ISD::PSHUFHW:
58800 case X86ISD::PSHUFLW:
58801 case X86ISD::PSHUFD:
58802 if (!IsSplat &&
58803 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58804 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58805 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58806 llvm::all_of(Ops, [Op0](SDValue Op) {
58807 return Op.getOperand(1) == Op0.getOperand(1);
58808 })) {
58809 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58810 Op0.getOperand(1));
58811 }
58812 [[fallthrough]];
58813 case X86ISD::VPERMILPI:
58814 if (!IsSplat && EltSizeInBits == 32 &&
58815 (VT.is256BitVector() ||
58816 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58817 all_of(Ops, [&Op0](SDValue Op) {
58818 return Op0.getOperand(1) == Op.getOperand(1);
58819 })) {
58820 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58821 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58822 Res =
58823 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58824 return DAG.getBitcast(VT, Res);
58825 }
58826 break;
58827 case X86ISD::VPERMILPV:
58828 if (!IsSplat && (VT.is256BitVector() ||
58829 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58830 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58831 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58832 if (Concat0 || Concat1)
58833 return DAG.getNode(Opcode, DL, VT,
58834 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58835 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58836 }
58837 break;
58838 case X86ISD::PSHUFB:
58839 case X86ISD::PSADBW:
58840 case X86ISD::VPMADDUBSW:
58841 case X86ISD::VPMADDWD:
58842 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58843 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58844 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58845 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58846 NumOps * SrcVT.getVectorNumElements());
58847 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58848 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58849 if (Concat0 || Concat1)
58850 return DAG.getNode(
58851 Opcode, DL, VT,
58852 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58853 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58854 }
58855 break;
58856 case X86ISD::VPERMV:
58857 // TODO: Handle 256-bit and NumOps == 4 cases.
58858 if (!IsSplat && NumOps == 2 &&
58859 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58860 MVT OpVT = Op0.getSimpleValueType();
58861 int NumSrcElts = OpVT.getVectorNumElements();
58862 SmallVector<int, 64> ConcatMask;
58863 for (unsigned i = 0; i != NumOps; ++i) {
58864 SmallVector<int, 64> SubMask;
58866 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58867 break;
58868 for (int M : SubMask) {
58869 if (0 <= M)
58870 M += i * NumSrcElts;
58871 ConcatMask.push_back(M);
58872 }
58873 }
58874 if (ConcatMask.size() == (NumOps * NumSrcElts))
58875 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58876 ConcatSubOperand(VT, Ops, 1),
58877 DAG.getUNDEF(VT), Subtarget, DAG);
58878 }
58879 break;
58880 case X86ISD::VPERMV3:
58881 // TODO: Handle 256-bit and NumOps == 4 cases.
58882 if (!IsSplat && NumOps == 2 &&
58883 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58884 MVT OpVT = Op0.getSimpleValueType();
58885 int NumSrcElts = OpVT.getVectorNumElements();
58886 SmallVector<int, 64> ConcatMask;
58887 for (unsigned i = 0; i != NumOps; ++i) {
58888 SmallVector<int, 64> SubMask;
58890 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58891 break;
58892 for (int M : SubMask) {
58893 if (0 <= M) {
58894 int Src = M < NumSrcElts ? 0 : 2;
58895 M += M < NumSrcElts ? 0 : NumSrcElts;
58896
58897 // Reference the lowest sub if the upper sub is the same.
58898 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58899 M += i * NumSrcElts;
58900 }
58901 ConcatMask.push_back(M);
58902 }
58903 }
58904 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58905 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58906 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58907 if (Concat0 || Concat1)
58908 return lowerShuffleWithPERMV(
58909 DL, VT, ConcatMask,
58910 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58911 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58912 DAG);
58913 }
58914 }
58915 break;
58916 case X86ISD::VPERM2X128: {
58917 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58918 assert(NumOps == 2 && "Bad concat_vectors operands");
58919 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58920 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58921 // TODO: Handle zero'd subvectors.
58922 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58923 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58924 (int)((Imm1 >> 4) & 0x3)};
58925 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58926 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58927 Ops[0].getOperand(1), DAG, DL);
58928 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58929 Ops[1].getOperand(1), DAG, DL);
58930 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58931 DAG.getBitcast(ShuffleVT, LHS),
58932 DAG.getBitcast(ShuffleVT, RHS),
58933 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58934 return DAG.getBitcast(VT, Res);
58935 }
58936 }
58937 break;
58938 }
58939 case X86ISD::SHUF128: {
58940 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58941 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58942 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58943 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58944 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58945 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58946 Ops[0].getOperand(1), DAG, DL);
58947 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58948 Ops[1].getOperand(1), DAG, DL);
58949 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58950 DAG.getTargetConstant(Imm, DL, MVT::i8));
58951 }
58952 break;
58953 }
58954 case ISD::TRUNCATE:
58955 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58956 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58957 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58958 SrcVT == Ops[1].getOperand(0).getValueType() &&
58959 Subtarget.useAVX512Regs() &&
58960 Subtarget.getPreferVectorWidth() >= 512 &&
58961 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58962 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58963 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58964 ConcatSubOperand(NewSrcVT, Ops, 0));
58965 }
58966 }
58967 break;
58968 case ISD::ANY_EXTEND:
58969 case ISD::SIGN_EXTEND:
58970 case ISD::ZERO_EXTEND:
58971 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58972 if (!IsSplat && NumOps == 2 &&
58973 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58974 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58975 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58976 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58977 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58978 SrcVT == Ops[1].getOperand(0).getValueType()) {
58979 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58980 return DAG.getNode(Opcode, DL, VT,
58981 ConcatSubOperand(NewSrcVT, Ops, 0));
58982 }
58983 }
58984 break;
58985   case ISD::ANY_EXTEND_VECTOR_INREG:
58986   case ISD::SIGN_EXTEND_VECTOR_INREG:
58987   case ISD::ZERO_EXTEND_VECTOR_INREG: {
58988     // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58989 if (!IsSplat && NumOps == 2 &&
58990 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58991 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58992 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58994 Op0.getOperand(0).getValueType() ==
58995 Ops[0].getOperand(0).getValueType()) {
58996 EVT SrcVT = Op0.getOperand(0).getValueType();
58997 unsigned NumElts = VT.getVectorNumElements();
58998 MVT UnpackSVT =
58999 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
59000 MVT UnpackVT =
59001 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
59002 SDValue Unpack =
59003 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
59004 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
59005 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
59006 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
59007 DAG.getBitcast(SrcVT, Unpack), DAG);
59008 }
59009 break;
59010 }
59011 case X86ISD::VSHLI:
59012 case X86ISD::VSRLI:
59013 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
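// Shifting a 64-bit element by 32 just moves one 32-bit half into the other
// and zeroes the remainder, which is exactly an interleave of the value
// (viewed as v8i32) with zero, as built below.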
59014 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
59015 llvm::all_of(Ops, [](SDValue Op) {
59016 return Op.getConstantOperandAPInt(1) == 32;
59017 })) {
59018 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
59019 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
59020 Res = DAG.getBitcast(MVT::v8i32, Res);
59021 if (Opcode == X86ISD::VSHLI) {
59022 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59023 {8, 0, 8, 2, 8, 4, 8, 6});
59024 } else {
59025 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59026 {1, 8, 3, 8, 5, 8, 7, 8});
59027 }
59028 return DAG.getBitcast(VT, Res);
59029 }
59030 }
59031 [[fallthrough]];
59032 case X86ISD::VSRAI:
59033 case X86ISD::VSHL:
59034 case X86ISD::VSRL:
59035 case X86ISD::VSRA:
59036 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
59037 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59038 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
59039 llvm::all_of(Ops, [Op0](SDValue Op) {
59040 return Op0.getOperand(1) == Op.getOperand(1);
59041 })) {
59042 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59043 Op0.getOperand(1));
59044 }
59045 break;
59046 case X86ISD::VPERMI:
59047 case X86ISD::VROTLI:
59048 case X86ISD::VROTRI:
59049 if (!IsSplat &&
59050 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59051 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59052 llvm::all_of(Ops, [Op0](SDValue Op) {
59053 return Op0.getOperand(1) == Op.getOperand(1);
59054 })) {
59055 assert(!(Opcode == X86ISD::VPERMI &&
59056 Op0.getValueType().is128BitVector()) &&
59057 "Illegal 128-bit X86ISD::VPERMI nodes");
59058 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59059 Op0.getOperand(1));
59060 }
59061 break;
59062 case ISD::AND:
59063 case ISD::OR:
59064 case ISD::XOR:
59065 case X86ISD::ANDNP:
59066 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59067 if (!IsSplat && (VT.is256BitVector() ||
59068 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59069 // Don't concatenate root AVX1 NOT patterns.
59070 // TODO: Allow NOT folding if Concat0 succeeds.
59071 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59072 llvm::all_of(Ops, [](SDValue X) {
59073 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59074 }))
59075 break;
59076 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59077 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59078 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59079 return DAG.getNode(Opcode, DL, VT,
59080 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59081 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59082 }
59083 break;
59084 case X86ISD::PCMPEQ:
59085 case X86ISD::PCMPGT:
59086 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59087 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59088 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59089 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59090 if (Concat0 || Concat1)
59091 return DAG.getNode(Opcode, DL, VT,
59092 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59093 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59094 break;
59095 }
59096
59097 if (!IsSplat && VT == MVT::v8i32) {
59098 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59099 // TODO: Handle v4f64 as well?
59100 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59101 for (unsigned I = 0; I != NumOps; ++I) {
59102 MaxSigBitsLHS =
59103 std::max(MaxSigBitsLHS,
59104 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59105 MaxSigBitsRHS =
59106 std::max(MaxSigBitsRHS,
59107 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59108 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59109 break;
59110 }
59111
59112 ISD::CondCode ICC =
59113 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59114 ISD::CondCode FCC =
59115           Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59116
59117 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59118 MVT FpVT = VT.changeVectorElementType(FpSVT);
59119
59120 if (std::optional<unsigned> CastOpc =
59121 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59122 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59123 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59124 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59125 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59126 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59127 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59128
59129 bool IsAlwaysSignaling;
59130 unsigned FSETCC =
59131 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59132 return DAG.getBitcast(
59133 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59134 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59135 }
59136 }
59137 break;
59138 case ISD::CTPOP:
59139 case ISD::CTTZ:
59140 case ISD::CTLZ:
59143 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59144 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59145 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59146 }
59147 break;
59148   case X86ISD::GF2P8AFFINEQB:
59149     // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59150 if (!IsSplat &&
59151 (VT.is256BitVector() ||
59152 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59153 llvm::all_of(Ops, [Op0](SDValue Op) {
59154 return Op0.getOperand(2) == Op.getOperand(2);
59155 })) {
59156 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59157 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59158 }
59159 break;
59160 case ISD::ADD:
59161 case ISD::SUB:
59162 case ISD::MUL:
59163 // TODO: Add more integer binops?
59164 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59165 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59166 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59167 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59168 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59169 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59170 return Op.getOperand(0) == Op.getOperand(1);
59171 }))
59172 return DAG.getNode(Opcode, DL, VT,
59173 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59174 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59175 }
59176 break;
59177   // VADD, VSUB and VMUL can execute on more ports than VINSERT and have
59178   // short latencies, so we only concatenate them when doing so does not
59179   // introduce extra VINSERTs.
59180 case ISD::FADD:
59181 case ISD::FSUB:
59182 case ISD::FMUL:
59183 if (!IsSplat && (VT.is256BitVector() ||
59184 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59185 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59186 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59187 if (Concat0 || Concat1)
59188 return DAG.getNode(Opcode, DL, VT,
59189 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59190 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59191 }
59192 break;
59193 // Always prefer to concatenate high latency FDIV instructions.
59194 case ISD::FDIV:
59195 if (!IsSplat && (VT.is256BitVector() ||
59196 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59197 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59198 ConcatSubOperand(VT, Ops, 1));
59199 }
59200 break;
59201 case X86ISD::HADD:
59202 case X86ISD::HSUB:
59203 case X86ISD::FHADD:
59204 case X86ISD::FHSUB:
59205 if (!IsSplat && VT.is256BitVector() &&
59206 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59207 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59208 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59209 if (Concat0 || Concat1)
59210 return DAG.getNode(Opcode, DL, VT,
59211 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59212 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59213 }
59214 break;
59215 case X86ISD::PACKSS:
59216 case X86ISD::PACKUS:
59217 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59218 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59219 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59220 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59221 NumOps * SrcVT.getVectorNumElements());
59222 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59223 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59224 if (Concat0 || Concat1)
59225 return DAG.getNode(
59226 Opcode, DL, VT,
59227 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59228 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59229 }
59230 break;
59231 case X86ISD::VSHLD:
59232 case X86ISD::VSHRD:
59233 case X86ISD::PALIGNR:
59234 if (!IsSplat &&
59235 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59236 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59237 llvm::all_of(Ops, [Op0](SDValue Op) {
59238 return Op0.getOperand(2) == Op.getOperand(2);
59239 })) {
59240 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59241 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59242 if (Concat0 || Concat1)
59243 return DAG.getNode(Opcode, DL, VT,
59244 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59245 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59246 Op0.getOperand(2));
59247 }
59248 break;
59249 case X86ISD::BLENDI:
59250 if (VT.is256BitVector() && NumOps == 2 &&
59251 (EltSizeInBits >= 32 ||
59252 (Subtarget.hasInt256() &&
59253 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59254 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59255 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59256 if (Concat0 || Concat1) {
59257 unsigned NumElts = VT.getVectorNumElements();
59258 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59259 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59260 Mask = Mask.zextOrTrunc(8);
59261 return DAG.getNode(Opcode, DL, VT,
59262 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59263 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59264 DAG.getTargetConstant(Mask, DL, MVT::i8));
59265 }
59266 }
59267 // TODO: BWI targets should only use CombineSubOperand.
59268 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59269 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59270 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59271 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59272 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59273 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59274 unsigned NumElts = VT.getVectorNumElements();
59275 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59276 for (unsigned I = 1; I != NumOps; ++I)
59277 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59278 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59279 Mask = Mask.zextOrTrunc(NumMaskBits);
59280 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59281 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59282 SDValue Sel =
59283 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59284 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59285 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59286 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59287 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59288 }
59289 }
59290 break;
59291 case ISD::VSELECT:
59292 // TODO: VSELECT should use CombineSubOperand.
59293 if (!IsSplat && Subtarget.hasAVX512() &&
59294 (VT.is256BitVector() ||
59295 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59296 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59297 EVT SelVT = Ops[0].getOperand(0).getValueType();
59298 if (SelVT.getVectorElementType() == MVT::i1) {
59299 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59300 NumOps * SelVT.getVectorNumElements());
59301 if (TLI.isTypeLegal(SelVT))
59302 return DAG.getNode(
59303 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59304 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59305 }
59306 }
59307 [[fallthrough]];
59308 case X86ISD::BLENDV:
59309 // TODO: BLENDV should use CombineSubOperand.
59310 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59311 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59312 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59313 EVT SelVT = Ops[0].getOperand(0).getValueType();
59314 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59315 if (TLI.isTypeLegal(SelVT))
59316 return DAG.getNode(
59317 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59318 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59319 }
59320 break;
59321 }
59322 }
59323
59324 // Fold subvector loads into one.
59325 // If needed, look through bitcasts to get to the load.
59326 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59327 unsigned Fast;
59328 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59329 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59330 *FirstLd->getMemOperand(), &Fast) &&
59331 Fast) {
59332 if (SDValue Ld =
59333 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59334 return Ld;
59335 }
59336 }
59337
59338 // Attempt to fold target constant loads.
59339 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59340 SmallVector<APInt> EltBits;
59341 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59342 for (unsigned I = 0; I != NumOps; ++I) {
59343 APInt OpUndefElts;
59344 SmallVector<APInt> OpEltBits;
59345 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59346 OpEltBits, /*AllowWholeUndefs*/ true,
59347 /*AllowPartialUndefs*/ false))
59348 break;
59349 EltBits.append(OpEltBits);
59350 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59351 }
59352 if (EltBits.size() == VT.getVectorNumElements()) {
59353 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59354 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59355 SDValue CV = DAG.getConstantPool(C, PVT);
59356       MachinePointerInfo MPI =
59357           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59358       SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59359 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59360       DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59361       return Ld;
59362 }
59363 }
59364
59365 // If this simple subvector or scalar/subvector broadcast_load is inserted
59366 // into both halves, use a larger broadcast_load. Update other uses to use
59367 // an extracted subvector.
59368 if (IsSplat &&
59369 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59370 if (ISD::isNormalLoad(Op0.getNode()) ||
59371         Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59372         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59373       auto *Mem = cast<MemSDNode>(Op0);
59374 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59375                          ? X86ISD::VBROADCAST_LOAD
59376                          : X86ISD::SUBV_BROADCAST_LOAD;
59377       if (SDValue BcastLd =
59378 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59379 SDValue BcastSrc =
59380 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59381 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59382 return BcastLd;
59383 }
59384 }
59385 }
59386
59387 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59388 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59389 Subtarget.useAVX512Regs()) {
59390 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59391 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59392 Res = DAG.getBitcast(ShuffleVT, Res);
59393 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59394 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59395 return DAG.getBitcast(VT, Res);
59396 }
59397
59398 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
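// SHUFPD takes one immediate bit per f64 element that selects element 0 or 1
// of its 128-bit lane, so any non-lane-crossing vXf64 shuffle can be encoded;
// the loop below packs those per-element bits into SHUFPDMask.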
59399 if (!IsSplat &&
59400 ((NumOps == 2 && VT == MVT::v4f64) ||
59401 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59402 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59403 // Collect the individual per-lane v2f64/v4f64 shuffles.
59404 MVT OpVT = Ops[0].getSimpleValueType();
59405 unsigned NumOpElts = OpVT.getVectorNumElements();
59408 if (all_of(seq<int>(NumOps), [&](int I) {
59409 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59410 Depth + 1) &&
59411 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59412 none_of(SrcMasks[I], isUndefOrZero) &&
59413 SrcMasks[I].size() == NumOpElts &&
59414 all_of(SrcOps[I], [&OpVT](SDValue V) {
59415 return V.getValueType() == OpVT;
59416 });
59417 })) {
59418 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59419 bool Unary = true;
59420 unsigned SHUFPDMask = 0;
59422 for (unsigned I = 0; I != NumOps; ++I) {
59423 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59424 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59425 Unary &= LHS[I] == RHS[I];
59426 for (unsigned J = 0; J != NumOpElts; ++J)
59427 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59428 }
59429 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59430 // PERMILPD mask and we can always profitably concatenate them.
59431 SDValue Concat0 =
59432 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59433 SDValue Concat1 =
59434 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59435 if (Unary || Concat0 || Concat1) {
59436 Concat0 =
59437 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59438 Concat1 =
59439 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59440 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59441 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59442 }
59443 }
59444 }
59445
59446 return SDValue();
59447}
59448
59449 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59450                                      TargetLowering::DAGCombinerInfo &DCI,
59451                                      const X86Subtarget &Subtarget) {
59452 EVT VT = N->getValueType(0);
59453 EVT SrcVT = N->getOperand(0).getValueType();
59454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59455   SmallVector<SDValue, 4> Ops(N->ops());
59456
59457 if (VT.getVectorElementType() == MVT::i1) {
59458 // Attempt to constant fold.
59459 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59460     APInt Constant = APInt::getZero(VT.getSizeInBits());
59461     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59462       auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59463       if (!C) break;
59464 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59465 if (I == (E - 1)) {
59466 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59467 if (TLI.isTypeLegal(IntVT))
59468 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59469 }
59470 }
59471
59472 // Don't do anything else for i1 vectors.
59473 return SDValue();
59474 }
59475
59476 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59477 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59478 Subtarget))
59479 return R;
59480 }
59481
59482 return SDValue();
59483}
59484
59485 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59486                                        TargetLowering::DAGCombinerInfo &DCI,
59487                                        const X86Subtarget &Subtarget) {
59488 if (DCI.isBeforeLegalizeOps())
59489 return SDValue();
59490
59491 MVT OpVT = N->getSimpleValueType(0);
59492
59493 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59494
59495 SDLoc dl(N);
59496 SDValue Vec = N->getOperand(0);
59497 SDValue SubVec = N->getOperand(1);
59498
59499 uint64_t IdxVal = N->getConstantOperandVal(2);
59500 MVT SubVecVT = SubVec.getSimpleValueType();
59501 int VecNumElts = OpVT.getVectorNumElements();
59502 int SubVecNumElts = SubVecVT.getVectorNumElements();
59503
59504 if (Vec.isUndef() && SubVec.isUndef())
59505 return DAG.getUNDEF(OpVT);
59506
59507 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59508 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59509 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59510 return getZeroVector(OpVT, Subtarget, DAG, dl);
59511
59512   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
59513     // If we're inserting into a zero vector and then into a larger zero vector,
59514 // just insert into the larger zero vector directly.
59515 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59516         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59517       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59518 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59519 getZeroVector(OpVT, Subtarget, DAG, dl),
59520 SubVec.getOperand(1),
59521 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59522 }
59523
59524 // If we're inserting into a zero vector and our input was extracted from an
59525 // insert into a zero vector of the same type and the extraction was at
59526 // least as large as the original insertion. Just insert the original
59527 // subvector into a zero vector.
59528 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59529 isNullConstant(SubVec.getOperand(1)) &&
59530       SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59531     SDValue Ins = SubVec.getOperand(0);
59532 if (isNullConstant(Ins.getOperand(2)) &&
59533 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59534 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59535 SubVecVT.getFixedSizeInBits())
59536 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59537 getZeroVector(OpVT, Subtarget, DAG, dl),
59538 Ins.getOperand(1), N->getOperand(2));
59539 }
59540 }
59541
59542 // Stop here if this is an i1 vector.
59543 if (IsI1Vector)
59544 return SDValue();
59545
59546 // Eliminate an intermediate vector widening:
59547 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59548 // insert_subvector X, Y, Idx
59549 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59550 // there?
59551 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59552 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59553 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59554 SubVec.getOperand(1), N->getOperand(2));
59555
59556 // If this is an insert of an extract, combine to a shuffle. Don't do this
59557 // if the insert or extract can be represented with a subregister operation.
59558 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59559 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59560 (IdxVal != 0 ||
59561 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59562 SDValue ExtSrc = SubVec.getOperand(0);
59563 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59564 // Create a shuffle mask matching the extraction and insertion.
59565 SmallVector<int, 64> Mask(VecNumElts);
59566 std::iota(Mask.begin(), Mask.end(), 0);
59567 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59568 ExtIdxVal + VecNumElts);
59569 if (ExtIdxVal != 0)
59570 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59571 // See if we can use a blend instead of extract/insert pair.
59572 SmallVector<int, 64> BlendMask(VecNumElts);
59573 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59574 std::iota(BlendMask.begin() + IdxVal,
59575 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59576 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59577 VecNumElts == (2 * SubVecNumElts)) {
59578 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59579 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59580 SDValue Blend = DAG.getNode(
59581 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59582 DAG.getBitcast(MVT::v8f32, ExtSrc),
59583 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59584 return DAG.getBitcast(OpVT, Blend);
59585 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59586 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59587 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59588 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59589 SDValue Shuffle =
59590 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59591 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59592 return DAG.getBitcast(OpVT, Shuffle);
59593 }
59594 }
59595 }
59596
59597 // Match concat_vector style patterns.
59598 SmallVector<SDValue, 2> SubVectorOps;
59599 if (collectConcatOps(N, SubVectorOps, DAG)) {
59600 if (SDValue Fold =
59601 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59602 return Fold;
59603
59604 // If we're inserting all zeros into the upper half, change this to
59605 // a concat with zero. We will match this to a move
59606 // with implicit upper bit zeroing during isel.
59607 // We do this here because we don't want combineConcatVectorOps to
59608 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59609 if (SubVectorOps.size() == 2 &&
59610 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59611 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59612 getZeroVector(OpVT, Subtarget, DAG, dl),
59613 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59614
59615 // Attempt to recursively combine to a shuffle.
59616 if (all_of(SubVectorOps, [](SDValue SubOp) {
59617 return isTargetShuffle(SubOp.getOpcode());
59618 })) {
59619 SDValue Op(N, 0);
59620 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59621 return Res;
59622 }
59623 }
59624
59625 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59626 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59627 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59628
59629 // If this is a broadcast load inserted into an upper undef, use a larger
59630 // broadcast load.
59631 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59632 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59633 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59634 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59635 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59636 }
59637
59638 // If we're splatting the lower half subvector of a full vector load into the
59639 // upper half, attempt to create a subvector broadcast.
59640 if ((int)IdxVal == (VecNumElts / 2) &&
59641 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59642 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59643 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59644 if (VecLd && SubLd &&
59645 DAG.areNonVolatileConsecutiveLoads(
59646 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59647 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl,
59648 SubVecVT, SubLd, 0, DAG);
59649 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59650 BcastLd, DAG.getVectorIdxConstant(0, dl));
59651 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59652 return BcastLd;
59653 }
59654 }
59655
59656 // Attempt to constant fold (if we're not widening).
59657 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59658 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59659 APInt VecUndefElts, SubUndefElts;
59660 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59661 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59662 VecEltBits) &&
59663 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59664 SubEltBits)) {
59665 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59666 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59667 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59668 }
59669 }
59670
59671 // Attempt to recursively combine to a shuffle.
59674 SDValue Op(N, 0);
59675 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59676 return Res;
59677 }
59678
59679 // Match insertion of subvector load that perfectly aliases a base load.
59680 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59681 ISD::isNormalLoad(SubVec.getNode()) &&
59682 DAG.areNonVolatileConsecutiveLoads(
59683 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59684 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59685 return Vec;
59686
59687 return SDValue();
59688}
59689
59690/// If we are extracting a subvector of a vector select and the select condition
59691/// is composed of concatenated vectors, try to narrow the select width. This
59692/// is a common pattern for AVX1 integer code because 256-bit selects may be
59693/// legal, but there is almost no integer math/logic available for 256-bit.
59694/// This function should only be called with legal types (otherwise, the calls
59695/// to get simple value types will assert).
59696 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59697 SelectionDAG &DAG) {
59698 SDValue Sel = Ext->getOperand(0);
59699 if (Sel.getOpcode() != ISD::VSELECT ||
59700 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59701 return SDValue();
59702
59703 // Note: We assume simple value types because this should only be called with
59704 // legal operations/types.
59705 // TODO: This can be extended to handle extraction to 256-bits.
59706 MVT VT = Ext->getSimpleValueType(0);
59707 if (!VT.is128BitVector())
59708 return SDValue();
59709
59710 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59711 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59712 return SDValue();
59713
59714 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59715 MVT SelVT = Sel.getSimpleValueType();
59716 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59717 "Unexpected vector type with legal operations");
59718
59719 unsigned SelElts = SelVT.getVectorNumElements();
59720 unsigned CastedElts = WideVT.getVectorNumElements();
59721 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59722 if (SelElts % CastedElts == 0) {
59723 // The select has the same or more (narrower) elements than the extract
59724 // operand. The extraction index gets scaled by that factor.
59725 ExtIdx *= (SelElts / CastedElts);
59726 } else if (CastedElts % SelElts == 0) {
59727 // The select has fewer (wider) elements than the extract operand. Make sure
59728 // that the extraction index can be divided evenly.
59729 unsigned IndexDivisor = CastedElts / SelElts;
59730 if (ExtIdx % IndexDivisor != 0)
59731 return SDValue();
59732 ExtIdx /= IndexDivisor;
59733 } else {
59734 llvm_unreachable("Element count of simple vector types are not divisible?");
59735 }
59736
59737 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59738 unsigned NarrowElts = SelElts / NarrowingFactor;
59739 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59740 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59741 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59742 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59743 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59744 return DAG.getBitcast(VT, NarrowSel);
59745}
59746
59747 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59748 TargetLowering::DAGCombinerInfo &DCI,
59749 const X86Subtarget &Subtarget) {
59750 if (!N->getValueType(0).isSimple())
59751 return SDValue();
59752
59753 MVT VT = N->getSimpleValueType(0);
59754 SDValue InVec = N->getOperand(0);
59755 unsigned IdxVal = N->getConstantOperandVal(1);
59756 EVT InVecVT = InVec.getValueType();
59757 unsigned SizeInBits = VT.getSizeInBits();
59758 unsigned InSizeInBits = InVecVT.getSizeInBits();
59759 unsigned NumSubElts = VT.getVectorNumElements();
59760 unsigned NumInElts = InVecVT.getVectorNumElements();
59761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59762 SDLoc DL(N);
59763
59764 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59765 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59766 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59767 // We let generic combining take over from there to simplify the
59768 // insert/extract and 'not'.
59769 // This pattern emerges during AVX1 legalization. We handle it before lowering
59770 // to avoid complications like splitting constant vector loads.
59771 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59772 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59773 auto isConcatenatedNot = [](SDValue V) {
59774 V = peekThroughBitcasts(V);
59775 if (!isBitwiseNot(V))
59776 return false;
59777 SDValue NotOp = V->getOperand(0);
59778 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59779 };
59780 if (isConcatenatedNot(InVec.getOperand(0)) ||
59781 isConcatenatedNot(InVec.getOperand(1))) {
59782 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59783 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59784 splitVectorIntBinary(InVec, DAG, DL),
59785 N->getOperand(1));
59786 }
59787 }
59788
59789 if (DCI.isBeforeLegalizeOps())
59790 return SDValue();
59791
59792 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59793 return V;
59794
59795 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59796 return getZeroVector(VT, Subtarget, DAG, DL);
59797
59798 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59799 if (VT.getScalarType() == MVT::i1)
59800 return DAG.getConstant(1, DL, VT);
59801 return getOnesVector(VT, DAG, DL);
59802 }
59803
59804 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59805 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59806
59807 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59808 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59809 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59810 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59811 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59812 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59813 }
59814
59815 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59816 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59817 // iff SUB is entirely contained in the extraction.
59818 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59819 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59820 SDValue Src = InVec.getOperand(0);
59821 SDValue Sub = InVec.getOperand(1);
59822 EVT SubVT = Sub.getValueType();
59823 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59824 if (IdxVal <= InsIdx &&
59825 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59826 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59827 DAG.getVectorIdxConstant(IdxVal, DL));
59828 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59829 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59830 }
59831 }
59832
59833 // If we're extracting an upper subvector, see if we'd get the same elements if
59834 // we extracted the lowest subvector instead, which should allow
59835 // SimplifyDemandedVectorElts to do more simplifications.
59836 if (IdxVal != 0) {
59837 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59838 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59839 });
59840 if (AllEquiv)
59841 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59842 }
59843
59844 // Check if we're extracting a whole broadcasted subvector.
59845 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59846 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59847 EVT MemVT = MemIntr->getMemoryVT();
59848 if (MemVT == VT) {
59849 // If this is the only use, we can replace with a regular load (this may
59850 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59851 // memory chain).
59852 if (InVec.hasOneUse()) {
59853 SDValue Ld =
59854 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59855 MemIntr->getMemOperand());
59856 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59857 return Ld;
59858 }
59859 }
59860 }
59861
59862 // Attempt to extract from the source of a shuffle vector.
59863 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59864 SmallVector<int, 32> ShuffleMask;
59865 SmallVector<int, 32> ScaledMask;
59866 SmallVector<SDValue, 2> ShuffleInputs;
59867 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59868 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59869 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59870 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59871 unsigned SubVecIdx = IdxVal / NumSubElts;
59872 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59873 return DAG.getUNDEF(VT);
59874 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59875 return getZeroVector(VT, Subtarget, DAG, DL);
59876 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59877 if (Src.getValueSizeInBits() == InSizeInBits) {
59878 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59879 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59880 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59881 DL, SizeInBits);
59882 }
59883 }
59884 }
59885
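// Extraction from V is treated as free when V is a one-use load, a build
// vector of constants, or undef - splitting those operands costs nothing.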
59886 auto IsExtractFree = [](SDValue V) {
59887 if (V.hasOneUse()) {
59888 V = peekThroughOneUseBitcasts(V);
59889 if (V.getOpcode() == ISD::LOAD)
59890 return true;
59891 }
59892 V = peekThroughBitcasts(V);
59893 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59894 return true;
59895 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59896 return true;
59897 return V.isUndef();
59898 };
59899
59900 // If we're extracting the lowest subvector and we're the only user,
59901 // we may be able to perform this with a smaller vector width.
59902 unsigned InOpcode = InVec.getOpcode();
59903 if (InVec.hasOneUse()) {
59904 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59905 // v2f64 CVTDQ2PD(v4i32).
59906 if (InOpcode == ISD::SINT_TO_FP &&
59907 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59908 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59909 }
59910 // v2f64 CVTUDQ2PD(v4i32).
59911 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59912 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59913 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59914 }
59915 // v2f64 CVTPS2PD(v4f32).
59916 if (InOpcode == ISD::FP_EXTEND &&
59917 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59918 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59919 }
59920 }
59921 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59922 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59923 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59924 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59925 Subtarget.hasVLX())) &&
59926 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59927 SDValue Src = InVec.getOperand(0);
59928 if (Src.getValueType().getScalarSizeInBits() == 32)
59929 return DAG.getNode(InOpcode, DL, VT,
59930 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59931 }
59932 if (IdxVal == 0 &&
59933 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59934 (SizeInBits == 128 || SizeInBits == 256) &&
59935 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59936 SDValue Ext = InVec.getOperand(0);
59937 if (Ext.getValueSizeInBits() > SizeInBits)
59938 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59939 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59940 return DAG.getNode(ExtOp, DL, VT, Ext);
59941 }
59942 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59943 InVec.getOperand(0).getValueType().is256BitVector() &&
59944 InVec.getOperand(1).getValueType().is256BitVector() &&
59945 InVec.getOperand(2).getValueType().is256BitVector()) {
59946 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59947 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59948 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59949 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59950 }
59951 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59952 (SizeInBits == 128 || SizeInBits == 256)) {
59953 SDValue InVecSrc = InVec.getOperand(0);
59954 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59955 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59956 return DAG.getNode(InOpcode, DL, VT, Ext);
59957 }
59958
59959 if (SizeInBits == 128 || SizeInBits == 256) {
59960 switch (InOpcode) {
59961 case X86ISD::MOVDDUP:
59962 return DAG.getNode(
59963 InOpcode, DL, VT,
59964 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59965 case X86ISD::PSHUFD:
59966 case X86ISD::VPERMILPI:
59967 if (InVec.getOperand(0).hasOneUse()) {
59968 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59969 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59970 return DAG.getNode(InOpcode, DL, VT,
59971 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59972 DL, SizeInBits),
59973 DAG.getTargetConstant(M, DL, MVT::i8));
59974 }
59975 break;
59976 case X86ISD::PCMPEQ:
59977 case X86ISD::PCMPGT:
59978 case X86ISD::UNPCKH:
59979 case X86ISD::UNPCKL:
59980 if (IsExtractFree(InVec.getOperand(0)) ||
59981 IsExtractFree(InVec.getOperand(1)))
59982 return DAG.getNode(InOpcode, DL, VT,
59983 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59984 DL, SizeInBits),
59985 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59986 DL, SizeInBits));
59987 break;
59988 case X86ISD::CMPP:
59989 if (IsExtractFree(InVec.getOperand(0)) ||
59990 IsExtractFree(InVec.getOperand(1)))
59991 return DAG.getNode(InOpcode, DL, VT,
59992 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59993 DL, SizeInBits),
59994 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59995 DL, SizeInBits),
59996 InVec.getOperand(2));
59997 break;
59998 case X86ISD::BLENDI:
59999 if (IsExtractFree(InVec.getOperand(0)) ||
60000 IsExtractFree(InVec.getOperand(1))) {
60001 uint64_t M = InVec.getConstantOperandVal(2) & 255;
60002 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
60003 return DAG.getNode(InOpcode, DL, VT,
60004 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
60005 DL, SizeInBits),
60006 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
60007 DL, SizeInBits),
60008 DAG.getTargetConstant(M, DL, MVT::i8));
60009 }
60010 break;
60011 case X86ISD::VPERMV:
60012 if (IdxVal != 0) {
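// Rather than extracting from the shuffle result, extract the demanded part
// of the mask, widen it back to the full width, and then take the low
// subvector of the new full-width shuffle.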
60013 SDValue Mask = InVec.getOperand(0);
60014 SDValue Src = InVec.getOperand(1);
60015 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60016 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60017 DL, InSizeInBits);
60018 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
60019 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60020 }
60021 break;
60022 case X86ISD::VPERMV3:
60023 if (IdxVal != 0) {
60024 SDValue Src0 = InVec.getOperand(0);
60025 SDValue Mask = InVec.getOperand(1);
60026 SDValue Src1 = InVec.getOperand(2);
60027 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60028 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60029 DL, InSizeInBits);
60030 SDValue Shuffle =
60031 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
60032 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60033 }
60034 break;
60035 }
60036 }
60037 }
60038
60039 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
60040 // as this is very likely to fold into a shuffle/truncation.
60041 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60042 InVecVT.getScalarSizeInBits() == 64 &&
60043 InVec.getConstantOperandAPInt(1) == 32) {
60044 SDValue Ext =
60045 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60046 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60047 }
60048
60049 return SDValue();
60050}
60051
60052 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60053 const X86Subtarget &Subtarget) {
60054 using namespace SDPatternMatch;
60055 EVT VT = N->getValueType(0);
60056 SDValue Src = N->getOperand(0);
60057 SDLoc DL(N);
60058
60059 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60060 // This occurs frequently in our masked scalar intrinsic code and our
60061 // floating point select lowering with AVX512.
60062 // TODO: SimplifyDemandedBits instead?
60063 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60064 isOneConstant(Src.getOperand(1)))
60065 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60066
60067 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60068 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60069 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60070 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60071 isNullConstant(Src.getOperand(1)))
60072 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60073 Src.getOperand(1));
60074
60075 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60076 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60077 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
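// Returns the value to put in a v4i32 scalar_to_vector: the narrow source of
// an any/zero-extend from 32 bits or less, an extending load of 32 bits or
// less, or Op itself when its upper 32 bits are known to be zero.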
60078 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60079 if (Op.getValueType() != MVT::i64)
60080 return SDValue();
60081 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60082 if (Op.getOpcode() == Opc &&
60083 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60084 return Op.getOperand(0);
60085 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60086 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60087 if (Ld->getExtensionType() == Ext &&
60088 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60089 return Op;
60090 if (IsZeroExt) {
60091 KnownBits Known = DAG.computeKnownBits(Op);
60092 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60093 return Op;
60094 }
60095 return SDValue();
60096 };
60097
60098 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60099 return DAG.getBitcast(
60100 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60101 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60102
60103 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60104 return DAG.getBitcast(
60105 VT,
60106 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60107 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60108 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60109 }
60110
60111 if (Src.getOpcode() == ISD::BITCAST) {
60112 SDValue SrcOp = Src.getOperand(0);
60113 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60114 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60115 return DAG.getBitcast(
60116 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60117 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60118 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60119 return DAG.getBitcast(
60120 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60121 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60122 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60123 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60124 }
60125
60126 if (VT == MVT::v4i32) {
60127 SDValue HalfSrc;
60128 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60129 // to remove XMM->GPR->XMM moves.
60130 if (sd_match(Src, m_AnyExt(m_BitCast(
60131 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60132 return DAG.getBitcast(
60133 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60134 }
60135
60136 // See if we're broadcasting the scalar value, in which case just reuse that.
60137 // Ensure the same SDValue from the SDNode use is being used.
60138 if (VT.getScalarType() == Src.getValueType())
60139 for (SDNode *User : Src->users())
60140 if (User->getOpcode() == X86ISD::VBROADCAST &&
60141 Src == User->getOperand(0)) {
60142 unsigned SizeInBits = VT.getFixedSizeInBits();
60143 unsigned BroadcastSizeInBits =
60144 User->getValueSizeInBits(0).getFixedValue();
60145 if (BroadcastSizeInBits == SizeInBits)
60146 return SDValue(User, 0);
60147 if (BroadcastSizeInBits > SizeInBits)
60148 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60149 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60150 // coverage.
60151 }
60152
60153 // Check for cases where we've ended up with a scalarized shift, typically
60154 // during type legalization.
60155 switch (Src.getOpcode()) {
60156 case ISD::SHL:
60157 case ISD::SRL:
60158 case ISD::SRA:
60159 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60160 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60161 Src.hasOneUse()) {
60162 SDValue SrcVec =
60163 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60164 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60165 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60166 Amt->getZExtValue(), DAG);
60167 }
60168 }
60169 break;
60170 case ISD::FSHL:
60171 case ISD::FSHR:
60172 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60173 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60174 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60175 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60176 Src.hasOneUse()) {
60177 uint64_t AmtVal =
60178 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60179 SDValue SrcVec0 =
60180 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60181 SDValue SrcVec1 =
60182 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60183 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60184 DAG.getConstant(AmtVal, DL, VT));
60185 }
60186 }
60187 break;
60188 }
60189
60190 return SDValue();
60191}
60192
60193// Simplify PMULDQ and PMULUDQ operations.
60194 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60195 TargetLowering::DAGCombinerInfo &DCI,
60196 const X86Subtarget &Subtarget) {
60197 SDValue LHS = N->getOperand(0);
60198 SDValue RHS = N->getOperand(1);
60199
60200 // Canonicalize constant to RHS.
60201 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60202 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60203 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60204
60205 // Multiply by zero.
60206 // Don't return RHS as it may contain UNDEFs.
60207 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60208 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60209
60210 // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
60211 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60212 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60213 return SDValue(N, 0);
60214
60215 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60216 // convert it to any_extend_invec, due to the LegalOperations check, do the
60217 // conversion directly to a vector shuffle manually. This exposes combine
60218 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60219 // combineX86ShufflesRecursively on SSE4.1 targets.
60220 // FIXME: This is basically a hack around several other issues related to
60221 // ANY_EXTEND_VECTOR_INREG.
60222 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60223 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60224 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60225 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60226 SDLoc dl(N);
60227 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60228 LHS.getOperand(0), { 0, -1, 1, -1 });
60229 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60230 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60231 }
60232 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60233 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60234 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60235 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60236 SDLoc dl(N);
60237 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60238 RHS.getOperand(0), { 0, -1, 1, -1 });
60239 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60240 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60241 }
60242
60243 return SDValue();
60244}
60245
60246// Simplify VPMADDUBSW/VPMADDWD operations.
60247 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60248 TargetLowering::DAGCombinerInfo &DCI) {
60249 MVT VT = N->getSimpleValueType(0);
60250 SDValue LHS = N->getOperand(0);
60251 SDValue RHS = N->getOperand(1);
60252 unsigned Opc = N->getOpcode();
60253 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60255 "Unexpected PMADD opcode");
60256
60257 // Multiply by zero.
60258 // Don't return LHS/RHS as it may contain UNDEFs.
60259 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60260 ISD::isBuildVectorAllZeros(RHS.getNode()))
60261 return DAG.getConstant(0, SDLoc(N), VT);
60262
60263 // Constant folding.
60264 APInt LHSUndefs, RHSUndefs;
60265 SmallVector<APInt> LHSBits, RHSBits;
60266 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60267 unsigned DstEltBits = VT.getScalarSizeInBits();
60268 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60269 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60270 SmallVector<APInt> Result;
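// Each result element combines a pair of adjacent source elements: widen the
// two products, then add them (VPMADDWD) or saturating-add them (VPMADDUBSW).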
60271 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60272 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60273 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60274 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60275 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60276 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60277 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60278 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60279 Result.push_back(Res);
60280 }
60281 return getConstVector(Result, VT, DAG, SDLoc(N));
60282 }
60283
60284 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60285 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60286 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60287 return SDValue(N, 0);
60288
60289 return SDValue();
60290}
60291
60292// Simplify VPMADD52L/VPMADD52H operations.
60293 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60294 TargetLowering::DAGCombinerInfo &DCI) {
60295 MVT VT = N->getSimpleValueType(0);
60296
60297 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60298 SDValue Op0 = N->getOperand(0);
60299 SDValue Op1 = N->getOperand(1);
60300 SDValue Op2 = N->getOperand(2);
60301 SDLoc DL(N);
60302
60303 APInt C0, C1;
60304 bool HasC0 = X86::isConstantSplat(Op0, C0),
60305 HasC1 = X86::isConstantSplat(Op1, C1);
60306
60307 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60308 if (HasC0 && !HasC1)
60309 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60310
60311 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60312 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
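// With at least 12 leading zero bits, Op0 fits entirely in 52 bits, so the
// low 52 bits of Op0 * 1 are just Op0 and the multiply can be dropped.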
60313 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60314 if (KnownOp0.countMinLeadingZeros() >= 12)
60315 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60316 }
60317
60318 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60319 unsigned NumEltBits = VT.getScalarSizeInBits();
60320 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60321 DCI))
60322 return SDValue(N, 0);
60323
60324 return SDValue();
60325}
60326
60327 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60328 TargetLowering::DAGCombinerInfo &DCI,
60329 const X86Subtarget &Subtarget) {
60330 EVT VT = N->getValueType(0);
60331 SDValue In = N->getOperand(0);
60332 unsigned Opcode = N->getOpcode();
60333 unsigned InOpcode = In.getOpcode();
60334 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60335 SDLoc DL(N);
60336
60337 // Try to merge vector loads and extend_inreg to an extload.
60338 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60339 In.hasOneUse()) {
60340 auto *Ld = cast<LoadSDNode>(In);
60341 if (Ld->isSimple()) {
60342 MVT SVT = In.getSimpleValueType().getVectorElementType();
60343 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60344 ? ISD::SEXTLOAD
60345 : ISD::ZEXTLOAD;
60346 EVT MemVT = VT.changeVectorElementType(SVT);
60347 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60348 SDValue Load = DAG.getExtLoad(
60349 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60350 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60351 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60352 return Load;
60353 }
60354 }
60355 }
60356
60357 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60358 if (Opcode == InOpcode)
60359 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60360
60361 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60362 // -> EXTEND_VECTOR_INREG(X).
60363 // TODO: Handle non-zero subvector indices.
60364 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60365 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60366 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60367 In.getValueSizeInBits())
60368 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60369
60370 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60371 // TODO: Move to DAGCombine?
60372 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60373 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60374 In.getValueSizeInBits() == VT.getSizeInBits()) {
60375 unsigned NumElts = VT.getVectorNumElements();
60376 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60377 EVT EltVT = In.getOperand(0).getValueType();
60378 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60379 for (unsigned I = 0; I != NumElts; ++I)
60380 Elts[I * Scale] = In.getOperand(I);
60381 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60382 }
60383
60384 // Attempt to combine as a shuffle on SSE41+ targets.
60385 if (Subtarget.hasSSE41()) {
60386 SDValue Op(N, 0);
60387 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60388 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60389 return Res;
60390 }
60391
60392 return SDValue();
60393}
60394
60395 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60396 TargetLowering::DAGCombinerInfo &DCI) {
60397 EVT VT = N->getValueType(0);
60398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60399 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60400 return DAG.getConstant(0, SDLoc(N), VT);
60401
60402 // Fold kshiftr(extract_subvector(X,C1),C2)
60403 // --> extract_subvector(kshiftr(X,C1+C2),0)
60404 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60405 if (N->getOpcode() == X86ISD::KSHIFTR) {
60406 SDLoc DL(N);
60407 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60408 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60409 SDValue Src = N->getOperand(0).getOperand(0);
60410 uint64_t Amt = N->getConstantOperandVal(1) +
60411 N->getOperand(0).getConstantOperandVal(1);
60412 EVT SrcVT = Src.getValueType();
60413 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60414 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60415 DAG.getTargetConstant(Amt, DL, MVT::i8));
60416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60417 DAG.getVectorIdxConstant(0, DL));
60418 }
60419 }
60420 }
60421
60422 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60423 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60424 return SDValue(N, 0);
60425
60426 return SDValue();
60427}
60428
60429// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60430 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
60431 // extra instructions between the conversions due to going to scalar and back.
60432 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60433 const X86Subtarget &Subtarget) {
60434 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60435 return SDValue();
60436
60437 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60438 return SDValue();
60439
60440 if (N->getValueType(0) != MVT::f32 ||
60441 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60442 return SDValue();
60443
60444 SDLoc dl(N);
60445 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60446 N->getOperand(0).getOperand(0));
60447 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60448 DAG.getTargetConstant(4, dl, MVT::i32));
60449 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60451 DAG.getVectorIdxConstant(0, dl));
60452}
60453
60454 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60455 TargetLowering::DAGCombinerInfo &DCI,
60456 const X86Subtarget &Subtarget) {
60457 EVT VT = N->getValueType(0);
60458 bool IsStrict = N->isStrictFPOpcode();
60459 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60460 EVT SrcVT = Src.getValueType();
60461
60462 SDLoc dl(N);
60463 if (SrcVT.getScalarType() == MVT::bf16) {
60464 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60465 !IsStrict && Src.getOperand(0).getValueType() == VT)
60466 return Src.getOperand(0);
60467
60468 if (!SrcVT.isVector())
60469 return SDValue();
60470
60471 assert(!IsStrict && "Strict FP doesn't support BF16");
60472 if (VT.getVectorElementType() == MVT::f64) {
60473 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60474 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60475 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60476 }
60477 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
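// bf16 occupies the upper half of an f32: zero-extend each i16 element to
// i32, shift it into the top 16 bits and bitcast the result to f32.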
60478 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60479 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60480 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60481 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60482 return DAG.getBitcast(VT, Src);
60483 }
60484
60485 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60486 return SDValue();
60487
60488 if (Subtarget.hasFP16())
60489 return SDValue();
60490
60491 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60492 return SDValue();
60493
60494 if (VT.getVectorElementType() != MVT::f32 &&
60495 VT.getVectorElementType() != MVT::f64)
60496 return SDValue();
60497
60498 unsigned NumElts = VT.getVectorNumElements();
60499 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60500 return SDValue();
60501
60502 // Convert the input to vXi16.
60503 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60504 Src = DAG.getBitcast(IntVT, Src);
60505
60506 // Widen to at least 8 input elements.
60507 if (NumElts < 8) {
60508 unsigned NumConcats = 8 / NumElts;
60509 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60510 : DAG.getConstant(0, dl, IntVT);
60511 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60512 Ops[0] = Src;
60513 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60514 }
60515
60516 // Destination is vXf32 with at least 4 elements.
60517 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60518 std::max(4U, NumElts));
60519 SDValue Cvt, Chain;
60520 if (IsStrict) {
60521 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60522 {N->getOperand(0), Src});
60523 Chain = Cvt.getValue(1);
60524 } else {
60525 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60526 }
60527
60528 if (NumElts < 4) {
60529 assert(NumElts == 2 && "Unexpected size");
60530 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60531 DAG.getVectorIdxConstant(0, dl));
60532 }
60533
60534 if (IsStrict) {
60535 // Extend to the original VT if necessary.
60536 if (Cvt.getValueType() != VT) {
60537 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60538 {Chain, Cvt});
60539 Chain = Cvt.getValue(1);
60540 }
60541 return DAG.getMergeValues({Cvt, Chain}, dl);
60542 }
60543
60544 // Extend to the original VT if necessary.
60545 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60546}
60547
60548// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60549 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60550 TargetLowering::DAGCombinerInfo &DCI) {
60551 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60552 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60553 "Unknown broadcast load type");
60554
60555 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60556 SDValue Ptr = MemIntrin->getBasePtr();
60557 SDValue Chain = MemIntrin->getChain();
60558 EVT VT = N->getSimpleValueType(0);
60559 EVT MemVT = MemIntrin->getMemoryVT();
60560
60561 // Look at other users of our base pointer and try to find a wider broadcast.
60562 // The input chain and the size of the memory VT must match.
60563 for (SDNode *User : Ptr->users())
60564 if (User != N && User->getOpcode() == N->getOpcode() &&
60565 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60566 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60567 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60568 MemVT.getSizeInBits() &&
60569 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60570 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60571 MemIntrin->isSimple() && "Illegal broadcast load type");
60573 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60574 VT.getSizeInBits());
60575 Extract = DAG.getBitcast(VT, Extract);
60576 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60577 return Extract;
60578 }
60579
60580 return SDValue();
60581}
60582
60583 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60584 const X86Subtarget &Subtarget) {
60585 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60586 return SDValue();
60587
60588 bool IsStrict = N->isStrictFPOpcode();
60589 EVT VT = N->getValueType(0);
60590 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60591 EVT SrcVT = Src.getValueType();
60592
60593 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60594 SrcVT.getVectorElementType() != MVT::f32)
60595 return SDValue();
60596
60597 SDLoc dl(N);
60598
60599 SDValue Cvt, Chain;
60600 unsigned NumElts = VT.getVectorNumElements();
60601 if (Subtarget.hasFP16()) {
60602 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60603 // v4f32 (xint_to_fp v4i64))))
60604 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60605 // v8f16 (CVTXI2P v4i64)))
60606 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60607 Src.getNumOperands() == 2) {
60608 SDValue Cvt0, Cvt1;
60609 SDValue Op0 = Src.getOperand(0);
60610 SDValue Op1 = Src.getOperand(1);
60611 bool IsOp0Strict = Op0->isStrictFPOpcode();
60612 if (Op0.getOpcode() != Op1.getOpcode() ||
60613 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60614 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60615 return SDValue();
60616 }
60617 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60618 if (IsStrict) {
60619 assert(IsOp0Strict && "Op0 must be strict node");
60620 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60621 ? X86ISD::STRICT_CVTSI2P
60622 : X86ISD::STRICT_CVTUI2P;
60623 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60624 {Op0.getOperand(0), Op0.getOperand(1)});
60625 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60626 {Op1.getOperand(0), Op1.getOperand(1)});
60627 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60628 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60629 }
60630 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60631 : X86ISD::CVTUI2P;
60632 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60633 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60634 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60635 }
60636 return SDValue();
60637 }
60638
60639 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60640 return SDValue();
60641
60642 // Widen to at least 4 input elements.
60643 if (NumElts < 4)
60644 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60645 DAG.getConstantFP(0.0, dl, SrcVT));
60646
60647 // Destination is v8i16 with at least 8 elements.
60648 EVT CvtVT =
60649 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60650 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60651 if (IsStrict) {
60652 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60653 {N->getOperand(0), Src, Rnd});
60654 Chain = Cvt.getValue(1);
60655 } else {
60656 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60657 }
60658
60659 // Extract down to real number of elements.
60660 if (NumElts < 8) {
60661 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
60662 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60663 DAG.getVectorIdxConstant(0, dl));
60664 }
60665
60666 Cvt = DAG.getBitcast(VT, Cvt);
60667
60668 if (IsStrict)
60669 return DAG.getMergeValues({Cvt, Chain}, dl);
60670
60671 return Cvt;
60672}
60673
60674 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60675 SDValue Src = N->getOperand(0);
60676
60677 // Turn MOVDQ2Q+simple_load into an mmx load.
60678 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60679 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60680
60681 if (LN->isSimple()) {
60682 SDValue NewLd =
60683 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60684 LN->getPointerInfo(), LN->getBaseAlign(),
60685 LN->getMemOperand()->getFlags());
60686 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60687 return NewLd;
60688 }
60689 }
60690
60691 return SDValue();
60692}
60693
60694 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60695 TargetLowering::DAGCombinerInfo &DCI) {
60696 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60698 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60699 return SDValue(N, 0);
60700
60701 return SDValue();
60702}
60703
60704// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60705// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60706// use x86mmx instead.
60707 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60708 SDLoc dl(N);
60709
60710 bool MadeChange = false, CastReturnVal = false;
60711 SmallVector<SDValue> Args;
60712 for (const SDValue &Arg : N->op_values()) {
60713 if (Arg.getValueType() == MVT::v1i64) {
60714 MadeChange = true;
60715 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60716 } else
60717 Args.push_back(Arg);
60718 }
60719 SDVTList VTs = N->getVTList();
60720 SDVTList NewVTs = VTs;
60721 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60722 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60723 NewVTArr[0] = MVT::x86mmx;
60724 NewVTs = DAG.getVTList(NewVTArr);
60725 MadeChange = true;
60726 CastReturnVal = true;
60727 }
60728
60729 if (MadeChange) {
60730 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60731 if (CastReturnVal) {
60732 SmallVector<SDValue> Returns;
60733 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60734 Returns.push_back(Result.getValue(i));
60735 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60736 return DAG.getMergeValues(Returns, dl);
60737 }
60738 return Result;
60739 }
60740 return SDValue();
60741}
60742 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60743 TargetLowering::DAGCombinerInfo &DCI) {
60744 if (!DCI.isBeforeLegalize())
60745 return SDValue();
60746
60747 unsigned IntNo = N->getConstantOperandVal(0);
60748 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60749
60750 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60751 return FixupMMXIntrinsicTypes(N, DAG);
60752
60753 return SDValue();
60754}
60755
60756 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60757 TargetLowering::DAGCombinerInfo &DCI) {
60758 if (!DCI.isBeforeLegalize())
60759 return SDValue();
60760
60761 unsigned IntNo = N->getConstantOperandVal(1);
60762 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60763
60764 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60765 return FixupMMXIntrinsicTypes(N, DAG);
60766
60767 return SDValue();
60768}
60769
60770 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60771 TargetLowering::DAGCombinerInfo &DCI) {
60772 if (!DCI.isBeforeLegalize())
60773 return SDValue();
60774
60775 unsigned IntNo = N->getConstantOperandVal(1);
60776 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60777
60778 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60779 return FixupMMXIntrinsicTypes(N, DAG);
60780
60781 return SDValue();
60782}
60783
60784 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60785 DAGCombinerInfo &DCI) const {
60786 SelectionDAG &DAG = DCI.DAG;
60787 switch (N->getOpcode()) {
60788 // clang-format off
60789 default: break;
60790 case ISD::SCALAR_TO_VECTOR:
60791 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60792 case ISD::EXTRACT_VECTOR_ELT:
60793 case X86ISD::PEXTRW:
60794 case X86ISD::PEXTRB:
60795 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60796 case ISD::CONCAT_VECTORS:
60797 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60798 case ISD::INSERT_SUBVECTOR:
60799 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60800 case ISD::EXTRACT_SUBVECTOR:
60801 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60802 case ISD::VSELECT:
60803 case ISD::SELECT:
60804 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60805 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60806 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60807 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60808 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60809 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60810 case X86ISD::ADD:
60811 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60812 case X86ISD::CLOAD:
60813 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60814 case X86ISD::SBB: return combineSBB(N, DAG);
60815 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60816 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60817 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60818 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60819 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60820 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60821 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60822 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60823 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60824 case ISD::AVGCEILS:
60825 case ISD::AVGCEILU:
60826 case ISD::AVGFLOORS:
60827 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60828 case X86ISD::BEXTR:
60829 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60830 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60831 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60832 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60833 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60834 case X86ISD::VEXTRACT_STORE:
60835 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60836 case ISD::SINT_TO_FP:
60837 case ISD::STRICT_SINT_TO_FP:
60838 return combineSIntToFP(N, DAG, DCI, Subtarget);
60839 case ISD::UINT_TO_FP:
60840 case ISD::STRICT_UINT_TO_FP:
60841 return combineUIntToFP(N, DAG, Subtarget);
60842 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60843 case ISD::LRINT:
60844 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60845 case ISD::FADD:
60846 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60847 case X86ISD::VFCMULC:
60848 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60849 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60850 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60851 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60852 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60853 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60854 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60855 case X86ISD::FXOR:
60856 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60857 case X86ISD::FMIN:
60858 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60859 case ISD::FMINNUM:
60860 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60861 case X86ISD::CVTSI2P:
60862 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60863 case X86ISD::CVTP2SI:
60864 case X86ISD::CVTP2UI:
60865 case X86ISD::STRICT_CVTTP2SI:
60866 case X86ISD::CVTTP2SI:
60867 case X86ISD::STRICT_CVTTP2UI:
60868 case X86ISD::CVTTP2UI:
60869 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60870 case X86ISD::STRICT_CVTPH2PS:
60871 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60872 case X86ISD::BT: return combineBT(N, DAG, DCI);
60873 case ISD::ANY_EXTEND:
60874 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60875 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60876 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60877 case ISD::ANY_EXTEND_VECTOR_INREG:
60878 case ISD::SIGN_EXTEND_VECTOR_INREG:
60879 case ISD::ZERO_EXTEND_VECTOR_INREG:
60880 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60881 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60882 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60883 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60884 case X86ISD::PACKSS:
60885 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60886 case X86ISD::HADD:
60887 case X86ISD::HSUB:
60888 case X86ISD::FHADD:
60889 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60890 case X86ISD::VSHL:
60891 case X86ISD::VSRA:
60892 case X86ISD::VSRL:
60893 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60894 case X86ISD::VSHLI:
60895 case X86ISD::VSRAI:
60896 case X86ISD::VSRLI:
60897 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60898 case ISD::INSERT_VECTOR_ELT:
60899 case X86ISD::PINSRB:
60900 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60901 case X86ISD::SHUFP: // Handle all target specific shuffles
60902 case X86ISD::INSERTPS:
60903 case X86ISD::EXTRQI:
60904 case X86ISD::INSERTQI:
60905 case X86ISD::VALIGN:
60906 case X86ISD::PALIGNR:
60907 case X86ISD::VSHLDQ:
60908 case X86ISD::VSRLDQ:
60909 case X86ISD::BLENDI:
60910 case X86ISD::UNPCKH:
60911 case X86ISD::UNPCKL:
60912 case X86ISD::MOVHLPS:
60913 case X86ISD::MOVLHPS:
60914 case X86ISD::PSHUFB:
60915 case X86ISD::PSHUFD:
60916 case X86ISD::PSHUFHW:
60917 case X86ISD::PSHUFLW:
60918 case X86ISD::MOVSHDUP:
60919 case X86ISD::MOVSLDUP:
60920 case X86ISD::MOVDDUP:
60921 case X86ISD::MOVSS:
60922 case X86ISD::MOVSD:
60923 case X86ISD::MOVSH:
60924 case X86ISD::VBROADCAST:
60925 case X86ISD::VPPERM:
60926 case X86ISD::VPERMI:
60927 case X86ISD::VPERMV:
60928 case X86ISD::VPERMV3:
60929 case X86ISD::VPERMIL2:
60930 case X86ISD::VPERMILPI:
60931 case X86ISD::VPERMILPV:
60932 case X86ISD::VPERM2X128:
60933 case X86ISD::SHUF128:
60934 case X86ISD::VZEXT_MOVL:
60935 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60936 case X86ISD::FMADD_RND:
60937 case X86ISD::FMSUB:
60938 case X86ISD::STRICT_FMSUB:
60939 case X86ISD::FMSUB_RND:
60940 case X86ISD::FNMADD:
60941 case X86ISD::STRICT_FNMADD:
60942 case X86ISD::FNMADD_RND:
60943 case X86ISD::FNMSUB:
60944 case X86ISD::STRICT_FNMSUB:
60945 case X86ISD::FNMSUB_RND:
60946 case ISD::FMA:
60947 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60948 case X86ISD::FMADDSUB_RND:
60949 case X86ISD::FMSUBADD_RND:
60950 case X86ISD::FMADDSUB:
60951 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60952 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60953 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60954 case X86ISD::MGATHER:
60955 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60956 case ISD::MGATHER:
60957 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60958 case X86ISD::PCMPEQ:
60959 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60960 case X86ISD::PMULDQ:
60961 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60962 case X86ISD::VPMADDUBSW:
60963 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60964 case X86ISD::VPMADD52L:
60965 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60966 case X86ISD::KSHIFTL:
60967 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60968 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60969 case ISD::STRICT_FP_EXTEND:
60970 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60971 case ISD::STRICT_FP_ROUND:
60972 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60973 case X86ISD::VBROADCAST_LOAD:
60974 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60975 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60976 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60977 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60978 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60979 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60980 case ISD::FP_TO_SINT_SAT:
60981 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60982 // clang-format on
60983 }
60984
60985 return SDValue();
60986}
60987
60988 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
60989 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60990}
60991
60992// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60993 bool X86TargetLowering::preferSextInRegOfTruncate(SDValue Trunc, EVT VT,
60994 EVT ExtVT) const {
60995 return Subtarget.hasAVX512() || !VT.isVector();
60996}
60997
60998 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60999 if (!isTypeLegal(VT))
61000 return false;
61001
61002 // There are no vXi8 shifts.
61003 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
61004 return false;
61005
61006 // TODO: Almost no 8-bit ops are desirable because they have no actual
61007 // size/speed advantages vs. 32-bit ops, but they do have a major
61008 // potential disadvantage by causing partial register stalls.
61009 //
61010 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
61011 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
61012 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
61013 // check for a constant operand to the multiply.
61014 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
61015 return false;
61016
61017 // i16 instruction encodings are longer and some i16 instructions are slow,
61018 // so those are not desirable.
61019 if (VT == MVT::i16) {
61020 switch (Opc) {
61021 default:
61022 break;
61023 case ISD::LOAD:
61024 case ISD::SIGN_EXTEND:
61025 case ISD::ZERO_EXTEND:
61026 case ISD::ANY_EXTEND:
61027 case ISD::MUL:
61028 return false;
61029 case ISD::SHL:
61030 case ISD::SRA:
61031 case ISD::SRL:
61032 case ISD::SUB:
61033 case ISD::ADD:
61034 case ISD::AND:
61035 case ISD::OR:
61036 case ISD::XOR:
61037       // NDD instructions never have the "partial register write" issue because
61038       // the destination register's upper bits [63:OSIZE] are zeroed even when
61039       // OSIZE=8/16.
61040 return Subtarget.hasNDD();
61041 }
61042 }
61043
61044 // Any legal type not explicitly accounted for above here is desirable.
61045 return true;
61046}
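
// Illustrative example (not part of this file; function name is hypothetical):
// under the rules above, a plain 16-bit multiply such as
//   unsigned short mul3(unsigned short x) { return x * 3; }
// is typically promoted and emitted as a 32-bit LEA/IMUL rather than a 16-bit
// IMUL, avoiding the operand-size prefix and partial-register stalls.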
61047
61049 SDValue Value, SDValue Addr,
61050 int JTI,
61051 SelectionDAG &DAG) const {
61052 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61053 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61054 if (IsCFProtectionSupported) {
61055     // When control-flow branch protection is enabled, we need to add the
61056     // notrack prefix to the indirect branch. To do that we create an NT_BRIND
61057     // SDNode; upon ISel, the pattern converts it to a jmp with the NoTrack
61058     // prefix.
61059 SDValue Chain = Value;
61060 // Jump table debug info is only needed if CodeView is enabled.
61062 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61063 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61064 }
61065
61066 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61067}
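
// Illustrative example (not part of this file; names are hypothetical): when a
// module is built with control-flow branch protection (e.g. clang's
// -fcf-protection=branch sets the "cf-protection-branch" module flag), a dense
// C switch that is lowered through a jump table, such as
//   int dispatch(int v) {
//     switch (v) {
//     case 0: return 10; case 1: return 11; case 2: return 12;
//     case 3: return 13; case 4: return 14; default: return -1;
//     }
//   }
// is expected to emit its indirect table branch as "notrack jmp", so the table
// entries do not need ENDBR landing pads.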
61068
61071 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61073 EVT VT = LogicOp->getValueType(0);
61074 EVT OpVT = SETCC0->getOperand(0).getValueType();
61075 if (!VT.isInteger())
61077
61078 if (VT.isVector())
61083
61084   // Don't use `NotAnd`: even though `not` is generally shorter code size than
61085   // `add`, `add` can lower to LEA, which can save moves / spills. In any case
61086   // where `NotAnd` applies, `AddAnd` does as well.
61087   // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61088   // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61090}
61091
61093 EVT VT = Op.getValueType();
61094 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61095 isa<ConstantSDNode>(Op.getOperand(1));
61096
61097 // i16 is legal, but undesirable since i16 instruction encodings are longer
61098 // and some i16 instructions are slow.
61099 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61100 // using LEA and/or other ALU ops.
61101 if (VT != MVT::i16 && !Is8BitMulByConstant)
61102 return false;
61103
61104 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61105 if (!Op.hasOneUse())
61106 return false;
61107     SDNode *User = *Op->user_begin();
61108     if (User->getOpcode() != ISD::STORE)
61109       return false;
61110 auto *Ld = cast<LoadSDNode>(Load);
61111 auto *St = cast<StoreSDNode>(User);
61112 return Ld->getBasePtr() == St->getBasePtr();
61113 };
61114
61115 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61116 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61117 return false;
61118 if (!Op.hasOneUse())
61119 return false;
61120 SDNode *User = *Op->user_begin();
61121 if (User->getOpcode() != ISD::ATOMIC_STORE)
61122 return false;
61123 auto *Ld = cast<AtomicSDNode>(Load);
61124 auto *St = cast<AtomicSDNode>(User);
61125 return Ld->getBasePtr() == St->getBasePtr();
61126 };
61127
61128 auto IsFoldableZext = [](SDValue Op) {
61129 if (!Op.hasOneUse())
61130 return false;
61131 SDNode *User = *Op->user_begin();
61132 EVT VT = User->getValueType(0);
61133 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61134 (VT == MVT::i32 || VT == MVT::i64));
61135 };
61136
61137 bool Commute = false;
61138 switch (Op.getOpcode()) {
61139 default: return false;
61140 case ISD::SIGN_EXTEND:
61141 case ISD::ZERO_EXTEND:
61142 case ISD::ANY_EXTEND:
61143 break;
61144 case ISD::SHL:
61145 case ISD::SRA:
61146 case ISD::SRL: {
61147 SDValue N0 = Op.getOperand(0);
61148 // Look out for (store (shl (load), x)).
61149 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61150 return false;
61151 break;
61152 }
61153 case ISD::MUL:
61154     // When ZU is enabled, we prefer not to promote a MUL by a constant
61155     // when there is an opportunity to fold a zext with imulzu.
61156 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61157 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61158 isa<ConstantSDNode>(Op.getOperand(1))))
61159 return false;
61160 [[fallthrough]];
61161 case ISD::ADD:
61162 case ISD::AND:
61163 case ISD::OR:
61164 case ISD::XOR:
61165 Commute = true;
61166 [[fallthrough]];
61167 case ISD::SUB: {
61168 SDValue N0 = Op.getOperand(0);
61169 SDValue N1 = Op.getOperand(1);
61170 // Avoid disabling potential load folding opportunities.
61171 if (X86::mayFoldLoad(N1, Subtarget) &&
61172 (!Commute || !isa<ConstantSDNode>(N0) ||
61173 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61174 return false;
61175 if (X86::mayFoldLoad(N0, Subtarget) &&
61176 ((Commute && !isa<ConstantSDNode>(N1)) ||
61177 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61178 return false;
61179 if (IsFoldableAtomicRMW(N0, Op) ||
61180 (Commute && IsFoldableAtomicRMW(N1, Op)))
61181 return false;
61182 }
61183 }
61184
61185 PVT = MVT::i32;
61186 return true;
61187}
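
// Illustrative example (not part of this file; function name is hypothetical):
// the load-folding checks above keep a narrow memory read-modify-write
// unpromoted, e.g. for
//   void bump(short *p) { *p += 42; }
// the i16 add is typically left as-is so it can fold into a single
// "addw $42, (%rdi)" (x86-64 SysV) instead of a 32-bit load/add/store sequence.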
61188
61189//===----------------------------------------------------------------------===//
61190// X86 Inline Assembly Support
61191//===----------------------------------------------------------------------===//
61192
61195 .Case("{@cca}", X86::COND_A)
61196 .Case("{@ccae}", X86::COND_AE)
61197 .Case("{@ccb}", X86::COND_B)
61198 .Case("{@ccbe}", X86::COND_BE)
61199 .Case("{@ccc}", X86::COND_B)
61200 .Case("{@cce}", X86::COND_E)
61201 .Case("{@ccz}", X86::COND_E)
61202 .Case("{@ccg}", X86::COND_G)
61203 .Case("{@ccge}", X86::COND_GE)
61204 .Case("{@ccl}", X86::COND_L)
61205 .Case("{@ccle}", X86::COND_LE)
61206 .Case("{@ccna}", X86::COND_BE)
61207 .Case("{@ccnae}", X86::COND_B)
61208 .Case("{@ccnb}", X86::COND_AE)
61209 .Case("{@ccnbe}", X86::COND_A)
61210 .Case("{@ccnc}", X86::COND_AE)
61211 .Case("{@ccne}", X86::COND_NE)
61212 .Case("{@ccnz}", X86::COND_NE)
61213 .Case("{@ccng}", X86::COND_LE)
61214 .Case("{@ccnge}", X86::COND_L)
61215 .Case("{@ccnl}", X86::COND_GE)
61216 .Case("{@ccnle}", X86::COND_G)
61217 .Case("{@ccno}", X86::COND_NO)
61218 .Case("{@ccnp}", X86::COND_NP)
61219 .Case("{@ccns}", X86::COND_NS)
61220 .Case("{@cco}", X86::COND_O)
61221 .Case("{@ccp}", X86::COND_P)
61222 .Case("{@ccs}", X86::COND_S)
61224 return Cond;
61225}
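
// Illustrative example (not part of this file; function name is hypothetical):
// the "{@cc<cond>}" strings above come from GCC-style flag-output constraints:
//   int same(unsigned a, unsigned b) {
//     int zf;
//     __asm__("cmpl %2, %1" : "=@ccz"(zf) : "r"(a), "r"(b));
//     return zf; // nonzero iff ZF was set, i.e. a == b (X86::COND_E above)
//   }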
61226
61227/// Given a constraint letter, return the type of constraint for this target.
61230 if (Constraint.size() == 1) {
61231 switch (Constraint[0]) {
61232 case 'R':
61233 case 'q':
61234 case 'Q':
61235 case 'f':
61236 case 't':
61237 case 'u':
61238 case 'y':
61239 case 'x':
61240 case 'v':
61241 case 'l':
61242 case 'k': // AVX512 masking registers.
61243 return C_RegisterClass;
61244 case 'a':
61245 case 'b':
61246 case 'c':
61247 case 'd':
61248 case 'S':
61249 case 'D':
61250 case 'A':
61251 return C_Register;
61252 case 'I':
61253 case 'J':
61254 case 'K':
61255 case 'N':
61256 case 'G':
61257 case 'L':
61258 case 'M':
61259 return C_Immediate;
61260 case 'C':
61261 case 'e':
61262 case 'Z':
61263 return C_Other;
61264 default:
61265 break;
61266 }
61267 }
61268 else if (Constraint.size() == 2) {
61269 switch (Constraint[0]) {
61270 default:
61271 break;
61272 case 'W':
61273 if (Constraint[1] != 's')
61274 break;
61275 return C_Other;
61276 case 'Y':
61277 switch (Constraint[1]) {
61278 default:
61279 break;
61280 case 'z':
61281 return C_Register;
61282 case 'i':
61283 case 'm':
61284 case 'k':
61285 case 't':
61286 case '2':
61287 return C_RegisterClass;
61288 }
61289 break;
61290 case 'j':
61291 switch (Constraint[1]) {
61292 default:
61293 break;
61294 case 'r':
61295 case 'R':
61296 return C_RegisterClass;
61297 }
61298 }
61299 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61300 return C_Other;
61301 return TargetLowering::getConstraintType(Constraint);
61302}
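
// Illustrative example (not part of this file): in the classic port-I/O idiom
//   __asm__ volatile("outb %b0, %w1" : : "a"(val), "Nd"(port));
// 'a' is classified as C_Register (the AL/AX/EAX family), 'N' as C_Immediate
// (an 8-bit unsigned constant) and 'd' as C_Register (the DX family), so the
// port operand may be either a small constant or the DX register.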
61303
61304/// Examine constraint type and operand type and determine a weight value.
61305/// This object must already have been set up with the operand type
61306/// and the current alternative constraint selected.
61309 AsmOperandInfo &Info, const char *Constraint) const {
61310   ConstraintWeight Wt = CW_Invalid;
61311   Value *CallOperandVal = Info.CallOperandVal;
61312 // If we don't have a value, we can't do a match,
61313 // but allow it at the lowest weight.
61314 if (!CallOperandVal)
61315 return CW_Default;
61316 Type *Ty = CallOperandVal->getType();
61317 // Look at the constraint type.
61318 switch (*Constraint) {
61319 default:
61320     Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61321     [[fallthrough]];
61322 case 'R':
61323 case 'q':
61324 case 'Q':
61325 case 'a':
61326 case 'b':
61327 case 'c':
61328 case 'd':
61329 case 'S':
61330 case 'D':
61331 case 'A':
61332 if (CallOperandVal->getType()->isIntegerTy())
61333 Wt = CW_SpecificReg;
61334 break;
61335 case 'f':
61336 case 't':
61337 case 'u':
61338 if (Ty->isFloatingPointTy())
61339 Wt = CW_SpecificReg;
61340 break;
61341 case 'y':
61342 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61343 Wt = CW_SpecificReg;
61344 break;
61345 case 'Y':
61346 if (StringRef(Constraint).size() != 2)
61347 break;
61348 switch (Constraint[1]) {
61349 default:
61350 return CW_Invalid;
61351 // XMM0
61352 case 'z':
61353 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61354 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61355 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61356 return CW_SpecificReg;
61357 return CW_Invalid;
61358 // Conditional OpMask regs (AVX512)
61359 case 'k':
61360 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61361 return CW_Register;
61362 return CW_Invalid;
61363 // Any MMX reg
61364 case 'm':
61365 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61366 return CW_SpecificReg;
61367 return CW_Invalid;
61368 // Any SSE reg when ISA >= SSE2, same as 'x'
61369 case 'i':
61370 case 't':
61371 case '2':
61372 if (!Subtarget.hasSSE2())
61373 return CW_Invalid;
61374 break;
61375 }
61376 break;
61377 case 'j':
61378 if (StringRef(Constraint).size() != 2)
61379 break;
61380 switch (Constraint[1]) {
61381 default:
61382 return CW_Invalid;
61383 case 'r':
61384 case 'R':
61385 if (CallOperandVal->getType()->isIntegerTy())
61386 Wt = CW_SpecificReg;
61387 break;
61388 }
61389 break;
61390 case 'v':
61391 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61392 Wt = CW_Register;
61393 [[fallthrough]];
61394 case 'x':
61395 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61396 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61397 Wt = CW_Register;
61398 break;
61399 case 'k':
61400 // Enable conditional vector operations using %k<#> registers.
61401 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61402 Wt = CW_Register;
61403 break;
61404 case 'I':
61405 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61406 if (C->getZExtValue() <= 31)
61407 Wt = CW_Constant;
61408 break;
61409 case 'J':
61410 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61411 if (C->getZExtValue() <= 63)
61412 Wt = CW_Constant;
61413 break;
61414 case 'K':
61415 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61416 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61417 Wt = CW_Constant;
61418 break;
61419 case 'L':
61420 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61421 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61422 Wt = CW_Constant;
61423 break;
61424 case 'M':
61425 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61426 if (C->getZExtValue() <= 3)
61427 Wt = CW_Constant;
61428 break;
61429 case 'N':
61430 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61431 if (C->getZExtValue() <= 0xff)
61432 Wt = CW_Constant;
61433 break;
61434 case 'G':
61435 case 'C':
61436 if (isa<ConstantFP>(CallOperandVal))
61437 Wt = CW_Constant;
61438 break;
61439 case 'e':
61440 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61441 if ((C->getSExtValue() >= -0x80000000LL) &&
61442 (C->getSExtValue() <= 0x7fffffffLL))
61443 Wt = CW_Constant;
61444 break;
61445 case 'Z':
61446 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61447 if (C->getZExtValue() <= 0xffffffff)
61448 Wt = CW_Constant;
61449 break;
61450 }
61451 return Wt;
61452}
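
// Illustrative example (not part of this file): for a multi-alternative
// operand such as "Ir"(n) in
//   __asm__("addl %1, %0" : "+r"(x) : "Ir"(n));
// a compile-time constant n in 0..31 scores CW_Constant for 'I' but only a
// register weight for 'r', so the immediate alternative is preferred, while a
// non-constant n can only match 'r'.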
61453
61454/// Try to replace an X constraint, which matches anything, with another that
61455/// has more specific requirements based on the type of the corresponding
61456/// operand.
61458LowerXConstraint(EVT ConstraintVT) const {
61459 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61460 // 'f' like normal targets.
61461 if (ConstraintVT.isFloatingPoint()) {
61462 if (Subtarget.hasSSE1())
61463 return "x";
61464 }
61465
61466 return TargetLowering::LowerXConstraint(ConstraintVT);
61467}
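
// Usage sketch (illustrative, not from this file): for
//   double d = 1.0;
//   __asm__ volatile("" : "+X"(d));
// the wildcard 'X' is rewritten to 'x' when SSE is available, so the operand
// is kept in an XMM register; without SSE it falls back to the default
// handling.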
61468
61469// Lower @cc targets via setcc.
61471 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61472 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61473 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61474 if (Cond == X86::COND_INVALID)
61475 return SDValue();
61476 // Check that return type is valid.
61477 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61478 OpInfo.ConstraintVT.getSizeInBits() < 8)
61479 report_fatal_error("Glue output operand is of invalid type");
61480
61481 // Get EFLAGS register. Only update chain when copyfrom is glued.
61482 if (Glue.getNode()) {
61483 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61484 Chain = Glue.getValue(1);
61485 } else
61486 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61487 // Extract CC code.
61488 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61489 // Extend to 32-bits
61490 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61491
61492 return Result;
61493}
61494
61495/// Lower the specified operand into the Ops vector.
61496/// If it is invalid, don't add anything to Ops.
61498 StringRef Constraint,
61499 std::vector<SDValue> &Ops,
61500 SelectionDAG &DAG) const {
61501 SDValue Result;
61502 char ConstraintLetter = Constraint[0];
61503 switch (ConstraintLetter) {
61504 default: break;
61505 case 'I':
61506 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61507 if (C->getZExtValue() <= 31) {
61508 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61509 Op.getValueType());
61510 break;
61511 }
61512 }
61513 return;
61514 case 'J':
61515 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61516 if (C->getZExtValue() <= 63) {
61517 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61518 Op.getValueType());
61519 break;
61520 }
61521 }
61522 return;
61523 case 'K':
61524 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61525 if (isInt<8>(C->getSExtValue())) {
61526 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61527 Op.getValueType());
61528 break;
61529 }
61530 }
61531 return;
61532 case 'L':
61533 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61534 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61535 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61536 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61537 Op.getValueType());
61538 break;
61539 }
61540 }
61541 return;
61542 case 'M':
61543 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61544 if (C->getZExtValue() <= 3) {
61545 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61546 Op.getValueType());
61547 break;
61548 }
61549 }
61550 return;
61551 case 'N':
61552 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61553 if (C->getZExtValue() <= 255) {
61554 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61555 Op.getValueType());
61556 break;
61557 }
61558 }
61559 return;
61560 case 'O':
61561 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61562 if (C->getZExtValue() <= 127) {
61563 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61564 Op.getValueType());
61565 break;
61566 }
61567 }
61568 return;
61569 case 'e': {
61570 // 32-bit signed value
61571 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61573 C->getSExtValue())) {
61574 // Widen to 64 bits here to get it sign extended.
61575 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61576 break;
61577 }
61578 // FIXME gcc accepts some relocatable values here too, but only in certain
61579 // memory models; it's complicated.
61580 }
61581 return;
61582 }
61583 case 'W': {
61584 assert(Constraint[1] == 's');
61585 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61586 // offset.
61587 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61588 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61589 BA->getValueType(0)));
61590 } else {
61591 int64_t Offset = 0;
61592 if (Op->getOpcode() == ISD::ADD &&
61593 isa<ConstantSDNode>(Op->getOperand(1))) {
61594 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61595 Op = Op->getOperand(0);
61596 }
61597 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61598 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61599 GA->getValueType(0), Offset));
61600 }
61601 return;
61602 }
61603 case 'Z': {
61604 // 32-bit unsigned value
61605 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61607 C->getZExtValue())) {
61608 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61609 Op.getValueType());
61610 break;
61611 }
61612 }
61613 // FIXME gcc accepts some relocatable values here too, but only in certain
61614 // memory models; it's complicated.
61615 return;
61616 }
61617 case 'i': {
61618 // Literal immediates are always ok.
61619 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61620 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61621 BooleanContent BCont = getBooleanContents(MVT::i64);
61622 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61624 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61625 : CST->getSExtValue();
61626 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61627 break;
61628 }
61629
61630     // In any sort of PIC mode, addresses need to be computed at runtime by
61631     // adding in a register or doing some sort of table lookup. These can't
61632     // be used as immediates. BlockAddresses and BasicBlocks are fine, though.
61633 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61635 return;
61636
61637 // If we are in non-pic codegen mode, we allow the address of a global (with
61638 // an optional displacement) to be used with 'i'.
61639 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61640 // If we require an extra load to get this address, as in PIC mode, we
61641 // can't accept it.
61643 Subtarget.classifyGlobalReference(GA->getGlobal())))
61644 return;
61645 break;
61646 }
61647 }
61648
61649 if (Result.getNode()) {
61650 Ops.push_back(Result);
61651 return;
61652 }
61653 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61654}
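
// Illustrative example (not part of this file): the immediate-range checks
// above decide which constants are accepted as asm operands, e.g.
//   __asm__("roll %1, %0" : "+r"(x) : "I"(3));  // OK: 3 fits 'I' (0..31)
//   __asm__("roll %1, %0" : "+r"(x) : "I"(40)); // rejected: 40 is out of
//                                               // range, so no operand is
//                                               // added and an error follows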
61655
61656/// Check if \p RC is a general purpose register class.
61657/// I.e., GR* or one of their variant.
61658static bool isGRClass(const TargetRegisterClass &RC) {
61659 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61660 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61661 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61662 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61663 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61664}
61665
61666/// Check if \p RC is a vector register class.
61667/// I.e., FR* / VR* or one of their variant.
61668static bool isFRClass(const TargetRegisterClass &RC) {
61669 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61670 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61671 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61672 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61673 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61674 RC.hasSuperClassEq(&X86::VR512RegClass);
61675}
61676
61677/// Check if \p RC is a mask register class.
61678/// I.e., VK* or one of their variant.
61679static bool isVKClass(const TargetRegisterClass &RC) {
61680 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61681 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61682 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61683 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61684 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61685 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61686 RC.hasSuperClassEq(&X86::VK64RegClass);
61687}
61688
61689static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61690 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61691}
61692
61693std::pair<unsigned, const TargetRegisterClass *>
61695 StringRef Constraint,
61696 MVT VT) const {
61697 // First, see if this is a constraint that directly corresponds to an LLVM
61698 // register class.
61699 if (Constraint.size() == 1) {
61700 // GCC Constraint Letters
61701 switch (Constraint[0]) {
61702 default: break;
61703 // 'A' means [ER]AX + [ER]DX.
61704 case 'A':
61705 if (Subtarget.is64Bit())
61706 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61707 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61708 "Expecting 64, 32 or 16 bit subtarget");
61709 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61710
61711 // TODO: Slight differences here in allocation order and leaving
61712 // RIP in the class. Do they matter any more here than they do
61713 // in the normal allocation?
61714 case 'k':
61715 if (Subtarget.hasAVX512()) {
61716 if (VT == MVT::v1i1 || VT == MVT::i1)
61717 return std::make_pair(0U, &X86::VK1RegClass);
61718 if (VT == MVT::v8i1 || VT == MVT::i8)
61719 return std::make_pair(0U, &X86::VK8RegClass);
61720 if (VT == MVT::v16i1 || VT == MVT::i16)
61721 return std::make_pair(0U, &X86::VK16RegClass);
61722 }
61723 if (Subtarget.hasBWI()) {
61724 if (VT == MVT::v32i1 || VT == MVT::i32)
61725 return std::make_pair(0U, &X86::VK32RegClass);
61726 if (VT == MVT::v64i1 || VT == MVT::i64)
61727 return std::make_pair(0U, &X86::VK64RegClass);
61728 }
61729 break;
61730 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61731 if (Subtarget.is64Bit()) {
61732 if (VT == MVT::i8 || VT == MVT::i1)
61733 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61734 ? &X86::GR8RegClass
61735 : &X86::GR8_NOREX2RegClass);
61736 if (VT == MVT::i16)
61737 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61738 ? &X86::GR16RegClass
61739 : &X86::GR16_NOREX2RegClass);
61740 if (VT == MVT::i32 || VT == MVT::f32)
61741 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61742 ? &X86::GR32RegClass
61743 : &X86::GR32_NOREX2RegClass);
61744 if (VT != MVT::f80 && !VT.isVector())
61745 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61746 ? &X86::GR64RegClass
61747 : &X86::GR64_NOREX2RegClass);
61748 break;
61749 }
61750 [[fallthrough]];
61751 // 32-bit fallthrough
61752 case 'Q': // Q_REGS
61753 if (VT == MVT::i8 || VT == MVT::i1)
61754 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61755 if (VT == MVT::i16)
61756 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61757 if (VT == MVT::i32 || VT == MVT::f32 ||
61758 (!VT.isVector() && !Subtarget.is64Bit()))
61759 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61760 if (VT != MVT::f80 && !VT.isVector())
61761 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61762 break;
61763 case 'r': // GENERAL_REGS
61764 case 'l': // INDEX_REGS
61765 if (VT == MVT::i8 || VT == MVT::i1)
61766 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61767 ? &X86::GR8RegClass
61768 : &X86::GR8_NOREX2RegClass);
61769 if (VT == MVT::i16)
61770 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61771 ? &X86::GR16RegClass
61772 : &X86::GR16_NOREX2RegClass);
61773 if (VT == MVT::i32 || VT == MVT::f32 ||
61774 (!VT.isVector() && !Subtarget.is64Bit()))
61775 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61776 ? &X86::GR32RegClass
61777 : &X86::GR32_NOREX2RegClass);
61778 if (VT != MVT::f80 && !VT.isVector())
61779 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61780 ? &X86::GR64RegClass
61781 : &X86::GR64_NOREX2RegClass);
61782 break;
61783 case 'R': // LEGACY_REGS
61784 if (VT == MVT::i8 || VT == MVT::i1)
61785 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61786 if (VT == MVT::i16)
61787 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61788 if (VT == MVT::i32 || VT == MVT::f32 ||
61789 (!VT.isVector() && !Subtarget.is64Bit()))
61790 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61791 if (VT != MVT::f80 && !VT.isVector())
61792 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61793 break;
61794 case 'f': // FP Stack registers.
61795 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61796 // value to the correct fpstack register class.
61797 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61798 return std::make_pair(0U, &X86::RFP32RegClass);
61799 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61800 return std::make_pair(0U, &X86::RFP64RegClass);
61801 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61802 return std::make_pair(0U, &X86::RFP80RegClass);
61803 break;
61804 case 'y': // MMX_REGS if MMX allowed.
61805 if (!Subtarget.hasMMX()) break;
61806 return std::make_pair(0U, &X86::VR64RegClass);
61807 case 'v':
61808 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61809 if (!Subtarget.hasSSE1()) break;
61810 bool VConstraint = (Constraint[0] == 'v');
61811
61812 switch (VT.SimpleTy) {
61813 default: break;
61814 // Scalar SSE types.
61815 case MVT::f16:
61816 if (VConstraint && Subtarget.hasFP16())
61817 return std::make_pair(0U, &X86::FR16XRegClass);
61818 break;
61819 case MVT::f32:
61820 case MVT::i32:
61821 if (VConstraint && Subtarget.hasVLX())
61822 return std::make_pair(0U, &X86::FR32XRegClass);
61823 return std::make_pair(0U, &X86::FR32RegClass);
61824 case MVT::f64:
61825 case MVT::i64:
61826 if (VConstraint && Subtarget.hasVLX())
61827 return std::make_pair(0U, &X86::FR64XRegClass);
61828 return std::make_pair(0U, &X86::FR64RegClass);
61829 case MVT::i128:
61830 if (Subtarget.is64Bit()) {
61831 if (VConstraint && Subtarget.hasVLX())
61832 return std::make_pair(0U, &X86::VR128XRegClass);
61833 return std::make_pair(0U, &X86::VR128RegClass);
61834 }
61835 break;
61836 // Vector types and fp128.
61837 case MVT::v8f16:
61838 if (!Subtarget.hasFP16())
61839 break;
61840 if (VConstraint)
61841 return std::make_pair(0U, &X86::VR128XRegClass);
61842 return std::make_pair(0U, &X86::VR128RegClass);
61843 case MVT::v8bf16:
61844 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61845 break;
61846 if (VConstraint)
61847 return std::make_pair(0U, &X86::VR128XRegClass);
61848 return std::make_pair(0U, &X86::VR128RegClass);
61849 case MVT::f128:
61850 if (!Subtarget.is64Bit())
61851 break;
61852 [[fallthrough]];
61853 case MVT::v16i8:
61854 case MVT::v8i16:
61855 case MVT::v4i32:
61856 case MVT::v2i64:
61857 case MVT::v4f32:
61858 case MVT::v2f64:
61859 if (VConstraint && Subtarget.hasVLX())
61860 return std::make_pair(0U, &X86::VR128XRegClass);
61861 return std::make_pair(0U, &X86::VR128RegClass);
61862 // AVX types.
61863 case MVT::v16f16:
61864 if (!Subtarget.hasFP16())
61865 break;
61866 if (VConstraint)
61867 return std::make_pair(0U, &X86::VR256XRegClass);
61868 return std::make_pair(0U, &X86::VR256RegClass);
61869 case MVT::v16bf16:
61870 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61871 break;
61872 if (VConstraint)
61873 return std::make_pair(0U, &X86::VR256XRegClass);
61874 return std::make_pair(0U, &X86::VR256RegClass);
61875 case MVT::v32i8:
61876 case MVT::v16i16:
61877 case MVT::v8i32:
61878 case MVT::v4i64:
61879 case MVT::v8f32:
61880 case MVT::v4f64:
61881 if (VConstraint && Subtarget.hasVLX())
61882 return std::make_pair(0U, &X86::VR256XRegClass);
61883 if (Subtarget.hasAVX())
61884 return std::make_pair(0U, &X86::VR256RegClass);
61885 break;
61886 case MVT::v32f16:
61887 if (!Subtarget.hasFP16())
61888 break;
61889 if (VConstraint)
61890 return std::make_pair(0U, &X86::VR512RegClass);
61891 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61892 case MVT::v32bf16:
61893 if (!Subtarget.hasBF16())
61894 break;
61895 if (VConstraint)
61896 return std::make_pair(0U, &X86::VR512RegClass);
61897 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61898 case MVT::v64i8:
61899 case MVT::v32i16:
61900 case MVT::v8f64:
61901 case MVT::v16f32:
61902 case MVT::v16i32:
61903 case MVT::v8i64:
61904 if (!Subtarget.hasAVX512()) break;
61905 if (VConstraint)
61906 return std::make_pair(0U, &X86::VR512RegClass);
61907 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61908 }
61909 break;
61910 }
61911 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61912 switch (Constraint[1]) {
61913 default:
61914 break;
61915 case 'i':
61916 case 't':
61917 case '2':
61918 return getRegForInlineAsmConstraint(TRI, "x", VT);
61919 case 'm':
61920 if (!Subtarget.hasMMX()) break;
61921 return std::make_pair(0U, &X86::VR64RegClass);
61922 case 'z':
61923 if (!Subtarget.hasSSE1()) break;
61924 switch (VT.SimpleTy) {
61925 default: break;
61926 // Scalar SSE types.
61927 case MVT::f16:
61928 if (!Subtarget.hasFP16())
61929 break;
61930 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61931 case MVT::f32:
61932 case MVT::i32:
61933 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61934 case MVT::f64:
61935 case MVT::i64:
61936 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61937 case MVT::v8f16:
61938 if (!Subtarget.hasFP16())
61939 break;
61940 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61941 case MVT::v8bf16:
61942 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61943 break;
61944 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61945 case MVT::f128:
61946 case MVT::v16i8:
61947 case MVT::v8i16:
61948 case MVT::v4i32:
61949 case MVT::v2i64:
61950 case MVT::v4f32:
61951 case MVT::v2f64:
61952 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61953 // AVX types.
61954 case MVT::v16f16:
61955 if (!Subtarget.hasFP16())
61956 break;
61957 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61958 case MVT::v16bf16:
61959 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61960 break;
61961 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61962 case MVT::v32i8:
61963 case MVT::v16i16:
61964 case MVT::v8i32:
61965 case MVT::v4i64:
61966 case MVT::v8f32:
61967 case MVT::v4f64:
61968 if (Subtarget.hasAVX())
61969 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61970 break;
61971 case MVT::v32f16:
61972 if (!Subtarget.hasFP16())
61973 break;
61974 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61975 case MVT::v32bf16:
61976 if (!Subtarget.hasBF16())
61977 break;
61978 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61979 case MVT::v64i8:
61980 case MVT::v32i16:
61981 case MVT::v8f64:
61982 case MVT::v16f32:
61983 case MVT::v16i32:
61984 case MVT::v8i64:
61985 if (Subtarget.hasAVX512())
61986 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61987 break;
61988 }
61989 break;
61990 case 'k':
61991      // This register class doesn't allocate k0 for masked vector operations.
61992 if (Subtarget.hasAVX512()) {
61993 if (VT == MVT::v1i1 || VT == MVT::i1)
61994 return std::make_pair(0U, &X86::VK1WMRegClass);
61995 if (VT == MVT::v8i1 || VT == MVT::i8)
61996 return std::make_pair(0U, &X86::VK8WMRegClass);
61997 if (VT == MVT::v16i1 || VT == MVT::i16)
61998 return std::make_pair(0U, &X86::VK16WMRegClass);
61999 }
62000 if (Subtarget.hasBWI()) {
62001 if (VT == MVT::v32i1 || VT == MVT::i32)
62002 return std::make_pair(0U, &X86::VK32WMRegClass);
62003 if (VT == MVT::v64i1 || VT == MVT::i64)
62004 return std::make_pair(0U, &X86::VK64WMRegClass);
62005 }
62006 break;
62007 }
62008 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
62009 switch (Constraint[1]) {
62010 default:
62011 break;
62012 case 'r':
62013 if (VT == MVT::i8 || VT == MVT::i1)
62014 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
62015 if (VT == MVT::i16)
62016 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
62017 if (VT == MVT::i32 || VT == MVT::f32)
62018 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
62019 if (VT != MVT::f80 && !VT.isVector())
62020 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
62021 break;
62022 case 'R':
62023 if (VT == MVT::i8 || VT == MVT::i1)
62024 return std::make_pair(0U, &X86::GR8RegClass);
62025 if (VT == MVT::i16)
62026 return std::make_pair(0U, &X86::GR16RegClass);
62027 if (VT == MVT::i32 || VT == MVT::f32)
62028 return std::make_pair(0U, &X86::GR32RegClass);
62029 if (VT != MVT::f80 && !VT.isVector())
62030 return std::make_pair(0U, &X86::GR64RegClass);
62031 break;
62032 }
62033 }
62034
62035 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
62036 return std::make_pair(0U, &X86::GR32RegClass);
62037
62038 // Use the default implementation in TargetLowering to convert the register
62039 // constraint into a member of a register class.
62040 std::pair<Register, const TargetRegisterClass*> Res;
62041   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
62042
62043 // Not found as a standard register?
62044 if (!Res.second) {
62045 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62046 // to/from f80.
62047 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
62048 // Map st(0) -> st(7) -> ST0
62049 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62050 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62051 Constraint[3] == '(' &&
62052 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62053 Constraint[5] == ')' && Constraint[6] == '}') {
62054 // st(7) is not allocatable and thus not a member of RFP80. Return
62055 // singleton class in cases where we have a reference to it.
62056 if (Constraint[4] == '7')
62057 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62058 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62059 &X86::RFP80RegClass);
62060 }
62061
62062 // GCC allows "st(0)" to be called just plain "st".
62063 if (StringRef("{st}").equals_insensitive(Constraint))
62064 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62065 }
62066
62067 // flags -> EFLAGS
62068 if (StringRef("{flags}").equals_insensitive(Constraint))
62069 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62070
62071 // dirflag -> DF
62072 // Only allow for clobber.
62073 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62074 VT == MVT::Other)
62075 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62076
62077 // fpsr -> FPSW
62078 // Only allow for clobber.
62079 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62080 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62081
62082 return Res;
62083 }
62084
62085 // Make sure it isn't a register that requires 64-bit mode.
62086 if (!Subtarget.is64Bit() &&
62087 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62088 TRI->getEncodingValue(Res.first) >= 8) {
62089 // Register requires REX prefix, but we're in 32-bit mode.
62090 return std::make_pair(0, nullptr);
62091 }
62092
62093 // Make sure it isn't a register that requires AVX512.
62094 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62095 TRI->getEncodingValue(Res.first) & 0x10) {
62096 // Register requires EVEX prefix.
62097 return std::make_pair(0, nullptr);
62098 }
62099
62100 // Otherwise, check to see if this is a register class of the wrong value
62101 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62102 // turn into {ax},{dx}.
62103 // MVT::Other is used to specify clobber names.
62104 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62105 return Res; // Correct type already, nothing to do.
62106
62107   // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
62108   // return "eax". This should even work for things like getting 64-bit integer
62109   // registers when given an f64 type.
62110 const TargetRegisterClass *Class = Res.second;
62111 // The generic code will match the first register class that contains the
62112 // given register. Thus, based on the ordering of the tablegened file,
62113 // the "plain" GR classes might not come first.
62114 // Therefore, use a helper method.
62115 if (isGRClass(*Class)) {
62116 unsigned Size = VT.getSizeInBits();
62117 if (Size == 1) Size = 8;
62118 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62119 return std::make_pair(0, nullptr);
62120 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62121 if (DestReg.isValid()) {
62122 bool is64Bit = Subtarget.is64Bit();
62123 const TargetRegisterClass *RC =
62124 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62125 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62126 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62127 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62128 if (Size == 64 && !is64Bit) {
62129 // Model GCC's behavior here and select a fixed pair of 32-bit
62130 // registers.
62131 switch (DestReg) {
62132 case X86::RAX:
62133 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62134 case X86::RDX:
62135 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62136 case X86::RCX:
62137 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62138 case X86::RBX:
62139 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62140 case X86::RSI:
62141 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62142 case X86::RDI:
62143 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62144 case X86::RBP:
62145 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62146 default:
62147 return std::make_pair(0, nullptr);
62148 }
62149 }
62150 if (RC && RC->contains(DestReg))
62151 return std::make_pair(DestReg, RC);
62152 return Res;
62153 }
62154 // No register found/type mismatch.
62155 return std::make_pair(0, nullptr);
62156 } else if (isFRClass(*Class)) {
62157 // Handle references to XMM physical registers that got mapped into the
62158 // wrong class. This can happen with constraints like {xmm0} where the
62159 // target independent register mapper will just pick the first match it can
62160 // find, ignoring the required type.
62161
62162 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62163 if (VT == MVT::f16)
62164 Res.second = &X86::FR16XRegClass;
62165 else if (VT == MVT::f32 || VT == MVT::i32)
62166 Res.second = &X86::FR32XRegClass;
62167 else if (VT == MVT::f64 || VT == MVT::i64)
62168 Res.second = &X86::FR64XRegClass;
62169 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62170 Res.second = &X86::VR128XRegClass;
62171 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62172 Res.second = &X86::VR256XRegClass;
62173 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62174 Res.second = &X86::VR512RegClass;
62175 else {
62176 // Type mismatch and not a clobber: Return an error;
62177 Res.first = 0;
62178 Res.second = nullptr;
62179 }
62180 } else if (isVKClass(*Class)) {
62181 if (VT == MVT::v1i1 || VT == MVT::i1)
62182 Res.second = &X86::VK1RegClass;
62183 else if (VT == MVT::v8i1 || VT == MVT::i8)
62184 Res.second = &X86::VK8RegClass;
62185 else if (VT == MVT::v16i1 || VT == MVT::i16)
62186 Res.second = &X86::VK16RegClass;
62187 else if (VT == MVT::v32i1 || VT == MVT::i32)
62188 Res.second = &X86::VK32RegClass;
62189 else if (VT == MVT::v64i1 || VT == MVT::i64)
62190 Res.second = &X86::VK64RegClass;
62191 else {
62192 // Type mismatch and not a clobber: Return an error;
62193 Res.first = 0;
62194 Res.second = nullptr;
62195 }
62196 }
62197
62198 return Res;
62199}
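
// Illustrative example (not part of this file; function name is hypothetical):
// on i386 the 'A' constraint handled above ties a 64-bit value to the EDX:EAX
// pair:
//   unsigned long long read_tsc(void) {
//     unsigned long long v;
//     __asm__ volatile("rdtsc" : "=A"(v));
//     return v;
//   }
// On x86-64 the code above hands back RAX rather than a register pair, so this
// idiom is only correct for 32-bit targets.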
62200
62201bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62202 // Integer division on x86 is expensive. However, when aggressively optimizing
62203 // for code size, we prefer to use a div instruction, as it is usually smaller
62204 // than the alternative sequence.
62205 // The exception to this is vector division. Since x86 doesn't have vector
62206 // integer division, leaving the division as-is is a loss even in terms of
62207 // size, because it will have to be scalarized, while the alternative code
62208 // sequence can be performed in vector form.
62209 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62210 return OptSize && !VT.isVector();
62211}
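
// Illustrative example (not part of this file; function name is hypothetical):
//   unsigned div10(unsigned x) { return x / 10; }
// normally expands into a multiply-by-magic-constant sequence, but a minsize
// build (e.g. -Oz) keeps a single DIV because it is smaller; vector divisions
// are still expanded either way.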
62212
62213void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62214 if (!Subtarget.is64Bit())
62215 return;
62216
62217 // Update IsSplitCSR in X86MachineFunctionInfo.
62218   X86MachineFunctionInfo *AFI =
62219       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62220 AFI->setIsSplitCSR(true);
62221}
62222
62223void X86TargetLowering::insertCopiesSplitCSR(
62224 MachineBasicBlock *Entry,
62225 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62226 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62227 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62228 if (!IStart)
62229 return;
62230
62231 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62232 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62233 MachineBasicBlock::iterator MBBI = Entry->begin();
62234 for (const MCPhysReg *I = IStart; *I; ++I) {
62235 const TargetRegisterClass *RC = nullptr;
62236 if (X86::GR64RegClass.contains(*I))
62237 RC = &X86::GR64RegClass;
62238 else
62239 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62240
62241 Register NewVR = MRI->createVirtualRegister(RC);
62242 // Create copy from CSR to a virtual register.
62243 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62244 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62245 // nounwind. If we want to generalize this later, we may need to emit
62246 // CFI pseudo-instructions.
62247 assert(
62248 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62249 "Function should be nounwind in insertCopiesSplitCSR!");
62250 Entry->addLiveIn(*I);
62251 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62252 .addReg(*I);
62253
62254 // Insert the copy-back instructions right before the terminator.
62255 for (auto *Exit : Exits)
62256 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62257 TII->get(TargetOpcode::COPY), *I)
62258 .addReg(NewVR);
62259 }
62260}
62261
62263 return Subtarget.is64Bit();
62264}
62265
62269 const TargetInstrInfo *TII) const {
62270 assert(MBBI->isCall() && MBBI->getCFIType() &&
62271 "Invalid call instruction for a KCFI check");
62272
62273 MachineFunction &MF = *MBB.getParent();
62274 // If the call target is a memory operand, unfold it and use R11 for the
62275 // call, so KCFI_CHECK won't have to recompute the address.
62276 switch (MBBI->getOpcode()) {
62277 case X86::CALL64m:
62278 case X86::CALL64m_NT:
62279 case X86::TAILJMPm64:
62280 case X86::TAILJMPm64_REX: {
62283 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62284 /*UnfoldStore=*/false, NewMIs))
62285 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62286 for (auto *NewMI : NewMIs)
62287 MBBI = MBB.insert(OrigCall, NewMI);
62288 assert(MBBI->isCall() &&
62289 "Unexpected instruction after memory operand unfolding");
62290 if (OrigCall->shouldUpdateAdditionalCallInfo())
62291 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62292 MBBI->setCFIType(MF, OrigCall->getCFIType());
62293 OrigCall->eraseFromParent();
62294 break;
62295 }
62296 default:
62297 break;
62298 }
62299
62300 MachineOperand &Target = MBBI->getOperand(0);
62301 Register TargetReg;
62302 switch (MBBI->getOpcode()) {
62303 case X86::CALL64r:
62304 case X86::CALL64r_ImpCall:
62305 case X86::CALL64r_NT:
62306 case X86::TAILJMPr64:
62307 case X86::TAILJMPr64_REX:
62308 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62309 Target.setIsRenamable(false);
62310 TargetReg = Target.getReg();
62311 break;
62312 case X86::CALL64pcrel32:
62313 case X86::TAILJMPd64:
62314 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62315 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62316 // 64-bit indirect thunk calls.
62317 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62318 "Unexpected register for an indirect thunk call");
62319 TargetReg = X86::R11;
62320 break;
62321 default:
62322 llvm_unreachable("Unexpected CFI call opcode");
62323 break;
62324 }
62325
62326 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62327 .addReg(TargetReg)
62328 .addImm(MBBI->getCFIType())
62329 .getInstr();
62330}
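
// Illustrative example (not part of this file; names are hypothetical): KCFI
// checks come from code built with -fsanitize=kcfi, e.g.
//   typedef int (*op_fn)(int);
//   int apply(op_fn f, int v) { return f(v); } // indirect call site
// The indirect call carries a CFI type id, and the KCFI_CHECK emitted above is
// expected to compare that id against the hash stored ahead of the callee's
// entry and trap on mismatch; memory-operand calls are first unfolded through
// R11 as shown.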
62331
62332/// Returns true if stack probing through a function call is requested.
62336
62337/// Returns true if stack probing through inline assembly is requested.
62339
62340   // No inline stack probes for Windows; they have their own mechanism.
62341 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62342 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62343 return false;
62344
62345 // If the function specifically requests inline stack probes, emit them.
62346 if (MF.getFunction().hasFnAttribute("probe-stack"))
62347 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62348 "inline-asm";
62349
62350 return false;
62351}
62352
62353/// Returns the name of the symbol used to emit stack probes or the empty
62354/// string if not applicable.
62357   // Inline stack probes disable the stack probe call.
62358 if (hasInlineStackProbe(MF))
62359 return "";
62360
62361 // If the function specifically requests stack probes, emit them.
62362 if (MF.getFunction().hasFnAttribute("probe-stack"))
62363 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62364
62365 // Generally, if we aren't on Windows, the platform ABI does not include
62366 // support for stack probes, so don't emit them.
62367 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62368 Subtarget.isTargetMachO() ||
62369 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62370 return "";
62371
62372 // We need a stack probe to conform to the Windows ABI. Choose the right
62373 // symbol.
62374 if (Subtarget.is64Bit())
62375 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62376 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62377}
62378
62379unsigned
62381 // The default stack probe size is 4096 if the function has no stackprobesize
62382 // attribute.
62383 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62384 4096);
62385}
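
// Illustrative example (not part of this file; function name is hypothetical):
// on Windows targets a frame larger than the probe size triggers a probe call:
//   void big_frame(void) {
//     volatile char buf[32768]; // larger than the default 4096-byte probe size
//     buf[0] = 1;               // prologue calls __chkstk before touching buf
//   }
// Adding the "probe-stack"="inline-asm" function attribute switches to the
// inline probe sequence instead of the call.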
62386
62388 if (ML && ML->isInnermost() &&
62389 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62390     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62391   return TargetLowering::getPrefLoopAlignment();
62392}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
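The fold described here leans on the x86 carry flag: an unsigned compare sets CF exactly when the left operand is smaller, so a 0/1 setcc result feeding an add or subtract can instead ride in as the carry input of ADC/SBB. A minimal scalar sketch of the identity (illustrative names, not the DAG combine itself):

#include <cassert>
#include <cstdint>

// Scalar model: x + (a < b) is what "cmp a, b; adc x, 0" computes,
// because an unsigned compare a < b sets the carry flag.
static uint32_t addCompareResult(uint32_t X, uint32_t A, uint32_t B) {
  uint32_t Carry = (A < B) ? 1u : 0u; // CF after "cmp A, B"
  return X + Carry;                   // "adc X, 0"
}

int main() {
  assert(addCompareResult(10, 1, 2) == 11); // carry set
  assert(addCompareResult(10, 2, 1) == 10); // carry clear
  return 0;
}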
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
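For reference, the legacy SSE CMPPS/CMPPD immediate is a 3-bit predicate; greater-than forms are obtained by swapping operands, which is why the helper above also takes Op0/Op1 by reference. A hedged sketch of that mapping (the enum and helper name below are made up for illustration):

#include <cstdint>

// Hypothetical helper: map a simple FP predicate to the SSE cmpps/cmppd
// 3-bit immediate (0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD).
enum class FPPred { EQ, LT, LE, UNORD, NEQ, NLT, NLE, ORD };

constexpr uint8_t sseCmpImm(FPPred P) {
  switch (P) {
  case FPPred::EQ:    return 0;
  case FPPred::LT:    return 1;
  case FPPred::LE:    return 2;
  case FPPred::UNORD: return 3;
  case FPPred::NEQ:   return 4;
  case FPPred::NLT:   return 5;
  case FPPred::NLE:   return 6;
  case FPPred::ORD:   return 7;
  }
  return 0;
}

static_assert(sseCmpImm(FPPred::UNORD) == 3, "unordered predicate");

int main() { return 0; }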
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
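The classic SSE2-era expansion plants each 32-bit half of the integer into the mantissa of a double whose exponent field is pre-biased (2^52 for the low half, 2^84 for the high half), then removes the biases and adds the exact halves. A standalone scalar rendering of that bit trick, not the exact DAG sequence emitted here:

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of the u64 -> f64 expansion: bias each 32-bit half into the
// mantissa of a double, strip the combined bias, and add the exact halves.
static double u64ToF64(uint64_t X) {
  uint64_t LoBits = (X & 0xFFFFFFFFu) | 0x4330000000000000ull; // 2^52 + lo
  uint64_t HiBits = (X >> 32)         | 0x4530000000000000ull; // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // (2^84 + hi*2^32) - (2^84 + 2^52) is exact; adding (2^52 + lo) rounds once.
  return (Hi - (0x1p84 + 0x1p52)) + Lo; // C++17 hex-float literals
}

int main() {
  assert(u64ToF64(0) == 0.0);
  assert(u64ToF64(1) == 1.0);
  assert(u64ToF64(0x100000000ull) == 4294967296.0);
  return 0;
}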
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a sequence of 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
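PACKSS narrows with signed saturation, so it only behaves as a plain truncation when every input already fits the destination's signed range; the in-register sign extension mentioned above guarantees exactly that. A scalar model of one lane, for illustration only:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one PACKSS lane: narrow i16 -> i8 with signed saturation.
static int8_t packssLane(int16_t V) {
  return static_cast<int8_t>(std::clamp<int16_t>(V, INT8_MIN, INT8_MAX));
}

int main() {
  int16_t Raw = 0x1280; // low byte is 0x80
  // Saturation alone would not give the truncated low byte...
  assert(packssLane(Raw) == INT8_MAX);
  // ...but after an in-register sign extension of the low 8 bits, every value
  // is in [-128, 127] and PACKSS degenerates to a truncation.
  int16_t InReg = static_cast<int16_t>(static_cast<int8_t>(Raw));
  assert(packssLane(InReg) == static_cast<int8_t>(Raw & 0xFF));
  return 0;
}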
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
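As a reminder of the semantics being assembled here, a 128-bit horizontal add pairs adjacent elements of each operand, placing the first operand's sums in the low half of the result and the second operand's in the high half. A scalar model of v4f32 HADDPS (names are illustrative):

#include <array>
#include <cassert>

// Scalar model of HADDPS on two v4f32 operands:
// result = { a0+a1, a2+a3, b0+b1, b2+b3 }.
static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  assert((haddps(A, B) == std::array<float, 4>{3, 7, 30, 70}));
  return 0;
}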
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
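The mask algebra behind collapsing such chains: if Y = shuffle(X, Inner) and Z = shuffle(Y, Outer), then Z = shuffle(X, Composed) with Composed[i] = Inner[Outer[i]]. A small sketch of that composition for 4-element dword masks (not the combine itself):

#include <array>
#include <cassert>

// If Y[i] = X[Inner[i]] and Z[i] = Y[Outer[i]], then Z[i] = X[Inner[Outer[i]]].
static std::array<int, 4> composeMasks(const std::array<int, 4> &Inner,
                                       const std::array<int, 4> &Outer) {
  std::array<int, 4> Composed{};
  for (int I = 0; I != 4; ++I)
    Composed[I] = Inner[Outer[I]];
  return Composed;
}

int main() {
  // Reversing twice composes to the identity shuffle {0,1,2,3}.
  std::array<int, 4> Rev{3, 2, 1, 0};
  assert((composeMasks(Rev, Rev) == std::array<int, 4>{0, 1, 2, 3}));
  return 0;
}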
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
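The scalar identity behind this fold: X ^ (0 - X) is the bitwise complement of BLSMSK(X) = X ^ (X - 1), so masking Y with it is exactly ANDN(BLSMSK(X), Y). A quick check of the identity under unsigned wraparound (illustrative helpers, not the DAG code):

#include <cassert>
#include <cstdint>

static uint32_t blsmsk(uint32_t X) { return X ^ (X - 1); } // mask up to lowest set bit
static uint32_t andn(uint32_t A, uint32_t B) { return ~A & B; }

int main() {
  for (uint32_t X : {1u, 4u, 0x50u, 0x80000000u})
    for (uint32_t Y : {0xFFFFFFFFu, 0x12345678u})
      assert((Y & (X ^ (0u - X))) == andn(blsmsk(X), Y));
  return 0;
}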
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
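A bit blend selects each element with a full-width lane mask: result = (V1 & M) | (V2 & ~M), where M is all-ones in lanes taken from V1. A scalar sketch of one 64-bit lane (names are illustrative):

#include <cassert>
#include <cstdint>

// Blend two values bit-by-bit: take V1 where Mask bits are set, else V2.
static uint64_t bitBlend(uint64_t V1, uint64_t V2, uint64_t Mask) {
  return (V1 & Mask) | (V2 & ~Mask);
}

int main() {
  assert(bitBlend(0xAAAAAAAAAAAAAAAAull, 0x5555555555555555ull,
                  0xFFFFFFFF00000000ull) == 0xAAAAAAAA55555555ull);
  return 0;
}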
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
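Intrinsics such as RDTSC return their 64-bit result split across EDX:EAX, so the expansion ends by gluing the two 32-bit register copies back together with a shift and an or. A trivial scalar sketch of that reassembly:

#include <cassert>
#include <cstdint>

// Combine the EDX:EAX register pair into one 64-bit value.
static uint64_t combineEdxEax(uint32_t Edx, uint32_t Eax) {
  return (static_cast<uint64_t>(Edx) << 32) | Eax;
}

int main() {
  assert(combineEdxEax(0x00000001u, 0x00000002u) == 0x0000000100000002ull);
  return 0;
}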
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
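PSADBW against an all-zero operand reduces each group of eight bytes to their plain sum, which is why it is the workhorse for horizontal byte sums. A scalar model of one 8-byte group (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>

// Scalar model of one PSADBW group: sum of absolute differences of 8 bytes.
// With B == 0 this is simply the sum of the bytes in A.
static uint16_t psadbwGroup(const std::array<uint8_t, 8> &A,
                            const std::array<uint8_t, 8> &B) {
  uint16_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += static_cast<uint16_t>(std::abs(int(A[I]) - int(B[I])));
  return Sum;
}

int main() {
  std::array<uint8_t, 8> Bytes{1, 2, 3, 4, 5, 6, 7, 8}, Zero{};
  assert(psadbwGroup(Bytes, Zero) == 36);
  return 0;
}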
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
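With LZCNT available, an i32 test for zero can be computed branch-free as lzcnt(x) >> 5, since only x == 0 produces a count of 32. A scalar sketch of the identity using a portable count-leading-zeros stand-in:

#include <cassert>
#include <cstdint>

// Portable stand-in for 32-bit LZCNT (defined to return 32 for zero).
static unsigned lzcnt32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

// (x == 0) lowered as ctlz + shift: only a count of 32 has bit 5 set.
static uint32_t isZeroViaCtlzSrl(uint32_t X) { return lzcnt32(X) >> 5; }

int main() {
  assert(isZeroViaCtlzSrl(0) == 1);
  assert(isZeroViaCtlzSrl(1) == 0);
  assert(isZeroViaCtlzSrl(0xFFFFFFFFu) == 0);
  return 0;
}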
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
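The 4-lane immediate packs each 2-bit source index into the imm8, lowest destination lane first, so the identity shuffle encodes as 0xE4. A small sketch of the encoding (the helper name is made up):

#include <array>
#include <cassert>
#include <cstdint>

// Pack a 4-element mask (values 0..3) into a PSHUFD/SHUFPS-style imm8:
// bits [1:0] select lane 0, [3:2] lane 1, [5:4] lane 2, [7:6] lane 3.
static uint8_t shuffleImm8(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int I = 0; I != 4; ++I) {
    assert(Mask[I] >= 0 && Mask[I] < 4 && "expected a 2-bit lane index");
    Imm |= static_cast<uint8_t>(Mask[I]) << (2 * I);
  }
  return Imm;
}

int main() {
  assert(shuffleImm8({0, 1, 2, 3}) == 0xE4); // identity shuffle
  assert(shuffleImm8({3, 2, 1, 0}) == 0x1B); // reverse
  return 0;
}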
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
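The smin/smax sandwich this matcher looks for is the ordinary scalar saturation clamp applied before truncation. A sketch for i32 to i8, not the DAG pattern matcher itself:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Signed saturation to i8: truncate(smin(smax(x, -128), 127)).
static int8_t truncSSat8(int32_t X) {
  return static_cast<int8_t>(std::min(std::max(X, -128), 127));
}

int main() {
  assert(truncSSat8(1000) == 127);
  assert(truncSSat8(-1000) == -128);
  assert(truncSSat8(42) == 42);
  return 0;
}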
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
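A minimal sketch of what such a check amounts to (the helper name is illustrative):
#include "llvm/ADT/ArrayRef.h"
// A mask is a no-op when every non-sentinel element selects its own position.
static bool isNoopMaskSketch(llvm::ArrayRef<int> Mask) {
  for (int I = 0, E = int(Mask.size()); I != E; ++I)
    if (Mask[I] >= 0 && Mask[I] != I)
      return false;
  return true;
}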
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
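For intuition only, a scalar formulation of round-half-away-from-zero (not the vector lowering itself; note the caveat in the comment):
#include <cmath>
// Scalar sketch: trunc(x + copysign(0.5, x)) rounds ties away from zero.
// Caveat: for inputs just below 0.5 the addition itself can round up, so a
// robust lowering adds the largest representable value below 0.5 instead.
double roundHalfAwayFromZero(double X) {
  return std::trunc(X + std::copysign(0.5, X));
}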
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
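The scalar behaviour being matched is an unsigned saturating truncate, e.g. i32 -> i8 (illustrative helper name):
#include <algorithm>
#include <cstdint>
// Values above the destination type's maximum clamp to that maximum.
uint8_t truncUSat8(uint32_t X) {
  return static_cast<uint8_t>(std::min<uint32_t>(X, 0xFFu));
}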
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
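A hedged sketch of the kind of integer condition-code mapping such a translation performs (assumes this translation unit's existing includes; a sketch, not the verbatim function):
// Signed comparisons map to G/GE/L/LE, unsigned ones to A/AE/B/BE.
static X86::CondCode translateIntCCSketch(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETLE:  return X86::COND_LE;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  default: llvm_unreachable("unexpected integer condition code");
  }
}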
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
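For intuition, this is what a horizontal-op float add reduction looks like at the SSE3 intrinsics level (illustrative only, not this combine's output):
#include <pmmintrin.h> // SSE3: HADDPS
// Sum the four lanes of a __m128 with two horizontal adds.
float hsum_ps(__m128 V) {
  __m128 T = _mm_hadd_ps(V, V); // {a+b, c+d, a+b, c+d}
  T = _mm_hadd_ps(T, T);        // {a+b+c+d, ...}
  return _mm_cvtss_f32(T);
}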
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
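The underlying boolean identity, shown in scalar form (illustrative check):
#include <cassert>
// xor-ing a comparison result with 1 is the same as testing the inverted
// condition: (a < b) ^ 1 == (a >= b).
void checkXorSetccIdentity(int A, int B) {
  assert((int(A < B) ^ 1) == int(A >= B));
}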
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
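A hedged sketch of such a sentinel scan; the value -2 is assumed to mirror the SM_SentinelZero constant declared in X86ISelLowering.h:
#include "llvm/ADT/ArrayRef.h"
// -2 is assumed to match X86's SM_SentinelZero zero-sentinel value.
static bool anyZeroSketch(llvm::ArrayRef<int> Mask) {
  for (int M : Mask)
    if (M == -2 /* SM_SentinelZero, assumed */)
      return true;
  return false;
}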
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
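An illustrative SSE2 intrinsics equivalent of that AND-then-PACKUS step for a v16i16 -> v16i8 truncation (not this helper's code):
#include <emmintrin.h> // SSE2
// Masking with 0x00FF first means PACKUSWB's unsigned saturation cannot
// clamp anything, so the pack becomes a plain truncation.
__m128i truncV16I16ToV16I8(__m128i Lo, __m128i Hi) {
  const __m128i ByteMask = _mm_set1_epi16(0x00FF);
  return _mm_packus_epi16(_mm_and_si128(Lo, ByteMask),
                          _mm_and_si128(Hi, ByteMask));
}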
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
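The rewrite is just De Morgan's law, ~(~Y & Z) == (Y | ~Z); a quick exhaustive check over 8-bit values (illustrative):
#include <cassert>
#include <cstdint>
// Verify (X & (Y | ~Z)) == (X & ~(~Y & Z)) for all 8-bit X, Y, Z.
void checkAndNotOrIdentity() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      for (unsigned Z = 0; Z < 256; ++Z)
        assert(uint8_t(X & (Y | ~Z)) == uint8_t(X & ~(~Y & Z)));
}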
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
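At the instruction level this corresponds to extracting a 128-bit lane; an illustrative AVX2 intrinsics equivalent:
#include <immintrin.h> // AVX2
// Grab the upper 128-bit lane of a 256-bit integer vector.
__m128i upperLane(__m256i V) {
  return _mm256_extracti128_si256(V, 1);
}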
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if it's known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
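In scalar terms the pattern is a sign-bit test: shifting the sign bit down and xor-ing with 1 is a compare against zero (illustrative check):
#include <cassert>
#include <cstdint>
// (x >> 31) is the sign bit of a 32-bit value; xor with 1 tests x >= 0.
void checkSignBitTest(int32_t X) {
  uint32_t SignBit = static_cast<uint32_t>(X) >> 31;
  assert((SignBit ^ 1u) == uint32_t(X >= 0));
}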
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
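Several of the APInt helpers above are used throughout this file for mask and constant manipulation; a small standalone usage sketch (assert-based, for illustration only):
#include "llvm/ADT/APInt.h"
#include <cassert>
using llvm::APInt;
void apintExamples() {
  APInt Mask = APInt::getLowBitsSet(32, 8); // 0x000000FF
  assert(Mask.popcount() == 8 && Mask.isMask(8));
  APInt Wide = Mask.zext(64);               // zero-extend to 64 bits
  assert(Wide.getBitWidth() == 64);
  APInt Sign = APInt::getSignMask(32);      // 0x80000000
  assert(Sign.isSignMask() && Sign.isPowerOf2());
}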
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
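ArrayRef<int> is how shuffle masks are passed around in this file; a brief usage sketch of the members listed above:
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
void arrayRefExamples() {
  int Storage[] = {0, 1, 4, 5};
  llvm::ArrayRef<int> Mask(Storage);
  assert(Mask.size() == 4 && !Mask.empty());
  llvm::ArrayRef<int> Lo = Mask.slice(0, 2);     // {0, 1}
  llvm::ArrayRef<int> NoTail = Mask.drop_back(); // {0, 1, 4}
  assert(Lo.equals({0, 1}) && NoTail.size() == 3);
}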
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
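A small, hedged example of querying an allocation size through DataLayout; the layout string is just a typical x86-64 one chosen for illustration:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>
// Alloc size of a <4 x i32> vector under an illustrative x86-64 layout.
uint64_t v4i32AllocSize() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  llvm::Type *V4I32 =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  return DL.getTypeAllocSize(V4I32).getFixedValue(); // 16 bytes
}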
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
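A small sketch of how a lowering routine reserves stack space through the MachineFrameInfo calls above; frameObjectsSketch and the sizes/offsets are made up for illustration.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;
// Reserve an 8-byte aligned spill slot plus an incoming-argument slot at a
// fixed 16-byte offset from the stack pointer on function entry.
static void frameObjectsSketch(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int SpillFI = MFI.CreateStackObject(/*Size=*/8, Align(8),
                                      /*isSpillSlot=*/true);
  int ArgFI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/16,
                                    /*IsImmutable=*/true);
  assert(MFI.isFixedObjectIndex(ArgFI) && !MFI.isFixedObjectIndex(SpillFI));
  (void)SpillFI;
  (void)ArgFI;
}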
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
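A hedged sketch of the BuildMI idiom these builder methods support; emitBinOpWithImm, Opcode, and RC are placeholders supplied by the caller, not anything defined in this file.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
using namespace llvm;
// Emit "DstReg = <Opcode> SrcReg, 42" before the insertion point I, where
// Opcode names some two-operand reg/imm instruction of the target.
static Register emitBinOpWithImm(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 const DebugLoc &DL, unsigned Opcode,
                                 Register SrcReg,
                                 const TargetRegisterClass *RC) {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  Register DstReg = MF.getRegInfo().createVirtualRegister(RC);
  BuildMI(MBB, I, DL, TII->get(Opcode), DstReg).addReg(SrcReg).addImm(42);
  return DstReg;
}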
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUses uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
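A minimal sketch combining getSetCC with getSelect to build an unsigned-min pattern; emitUMin is a hypothetical helper, and the setcc result type is asked from the target so the example stays target-neutral.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
// Return (A ult B) ? A : B using generic nodes only.
static SDValue emitUMin(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                        SDValue B) {
  EVT VT = A.getValueType();
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, A, B, ISD::SETULT);
  return DAG.getSelect(DL, VT, Cond, A, B);
}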
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
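A short sketch of the getConstant/getNode building pattern; roundUpToEven is an invented helper that rounds an integer value up to the next even number using only generic ISD nodes.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Compute (X + 1) & -2, i.e. X rounded up to an even value.
static SDValue roundUpToEven(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Inc = DAG.getNode(ISD::ADD, DL, VT, X, One);
  SDValue Mask = DAG.getSignedConstant(-2, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, Inc, Mask);
}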
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
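A minimal sketch of the split-in-half idiom built on GetSplitDestVTs and SplitVector, e.g. the first step when a wide vector operation must be lowered on a narrower execution unit; splitInHalves is a hypothetical name.
#include "llvm/CodeGen/SelectionDAG.h"
#include <tuple>
#include <utility>
using namespace llvm;
// Return the low and high halves of V as two EXTRACT_SUBVECTOR results.
static std::pair<SDValue, SDValue>
splitInHalves(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(V.getValueType());
  return DAG.SplitVector(V, DL, LoVT, HiVT);
}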
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
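A hedged sketch pairing a full known-bits query with MaskedValueIsZero; lowBitsKnownZero is an invented helper, and the two checks are deliberately redundant so both entry points are shown.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// Return true if the low Bits bits of Op are provably zero.
static bool lowBitsKnownZero(SelectionDAG &DAG, SDValue Op, unsigned Bits) {
  KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.countMinTrailingZeros() >= Bits)
    return true;
  APInt LowMask = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(), Bits);
  return DAG.MaskedValueIsZero(Op, LowMask);
}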
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
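A small sketch showing how a shuffle mask is assembled and handed to getVectorShuffle; reverseVector is a hypothetical helper that reverses the lanes of a vector value.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Build the mask (N-1, N-2, ..., 0) and shuffle against undef.
static SDValue reverseVector(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  EVT VT = V.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back((int)(NumElts - 1 - i));
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask);
}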
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
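A trivial sketch of the SmallVector calls listed above; the inline capacity of 4 is arbitrary.
#include "llvm/ADT/SmallVector.h"
#include <iterator>
using namespace llvm;
static void smallVectorSketch() {
  SmallVector<int, 4> V;   // first 4 elements live inline, no allocation
  V.assign(3, 0);          // {0, 0, 0}
  V.push_back(7);          // {0, 0, 0, 7}
  int More[] = {1, 2};
  V.append(std::begin(More), std::end(More)); // growth spills to the heap
  V.erase(V.begin());      // drop the leading 0
  V.resize(2);             // truncate to the first two elements
}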
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
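A minimal StringSwitch sketch; classifyName and the register-name strings are purely illustrative.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Map a handful of known names onto small codes, with -1 as the fallback.
static int classifyName(StringRef Name) {
  return StringSwitch<int>(Name)
      .Case("eax", 0)
      .Case("ebx", 1)
      .Case("ecx", 2)
      .Default(-1);
}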
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
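A hedged sketch of how these legalization hooks are typically driven from a target's TargetLowering constructor; DemoTargetLowering and the chosen operations are invented for illustration (the real X86 setup lives in the constructor earlier in this file), and it also uses setOperationPromotedToType, setTruncStoreAction and setLoadExtAction from further down this list.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
namespace {
// A toy subclass, never instantiated here; it only shows where the calls go.
class DemoTargetLowering : public TargetLowering {
public:
  explicit DemoTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    // Imaginary target with no native CTPOP: expand it on i16, custom-lower
    // it on i32, and widen the i8 form to i32.
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
    // No f64->f32 truncating stores and no sign-extending i16->i32 loads.
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Expand);
  }
};
} // namespace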
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has an icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
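The immediate-legality hooks above (isLegalAddImmediate, isLegalICmpImmediate, isLegalStoreImmediate) all answer the same question: can this constant be encoded directly in the instruction rather than materialized into a register? A minimal sketch of the shape such a check commonly takes on an x86-like target, using only llvm::isInt from MathExtras; this is an illustration, not the implementation in this file:

  #include "llvm/Support/MathExtras.h"

  // Hedged sketch: x86 add/cmp immediates are signed 32-bit fields, so a
  // legality predicate of this shape is typical (illustrative only).
  static bool fitsInSignedImm32(int64_t Imm) {
    return llvm::isInt<32>(Imm);
  }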
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
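The ISD:: helpers listed here are usually combined inside DAG combines to guard a transform. A small illustrative fragment, assuming N is the node under combine and CC/VT come from a surrounding SETCC (all three names are assumptions):

  // Only rewrite plain loads (unindexed, non-extending).
  if (ISD::isNormalLoad(N)) {
    // ... safe to fold or narrow the load here ...
  }
  // Derive related condition codes instead of switching over every value.
  ISD::CondCode Inv = ISD::getSetCCInverse(CC, VT);         // !(X op Y)
  ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(CC); // (Y op X)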
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
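These IR-level matchers from llvm/IR/PatternMatch.h compose into declarative checks against Value trees. A hedged sketch that recognizes V as a bitwise not (xor with all-ones) and captures the inverted operand; V is an assumed Value*:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Match V == (xor X, -1) and bind X.
  Value *X;
  if (match(V, m_Xor(m_Value(X), m_AllOnes()))) {
    // X is the operand being inverted.
  }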
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
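The RTLIB getters map a source/result type pair to a runtime library routine and return UNKNOWN_LIBCALL when none exists. A minimal sketch of the usual query-then-bail pattern; the concrete types are made up for illustration:

  // Ask for an f80 -> u64 conversion libcall (types are illustrative).
  RTLIB::Libcall LC = RTLIB::getFPTOUINT(MVT::f80, MVT::i64);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("no libcall for this fp-to-uint conversion");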
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
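The SDPatternMatch matchers mirror the IR-level ones but run on SelectionDAG nodes through sd_match. A hedged sketch that recognizes an ADD whose first operand is a left shift and whose second operand is any integer constant; N and DAG are assumed to be the node and DAG under combine:

  using namespace llvm::SDPatternMatch;

  // add (shl ..., ...), C  -- structural match only, no captures.
  if (sd_match(N, &DAG, m_Node(ISD::ADD, m_Opc(ISD::SHL), m_ConstInt()))) {
    // ... candidate for an LEA-style rewrite, for example ...
  }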
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and an FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
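X86ISD opcodes such as CMP or SETCC are ordinary SelectionDAG opcodes and are created with the usual SelectionDAG::getNode calls during custom lowering. A one-line hedged sketch; DL, LHS and RHS are assumptions, and MVT::i32 is used here as the EFLAGS-style result type:

  // Emit an X86 compare producing a flags value for CMOV/BRCOND/SETCC to consume.
  SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, LHS, RHS);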
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
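The X86:: helpers above package common profitability and foldability questions. A small sketch of how lowering code might consult them; Op and Subtarget are assumptions:

  // Prefer the reg/mem form when an operand is a foldable load.
  if (X86::mayFoldLoad(Op.getOperand(1), Subtarget)) {
    // ... fold the load as a memory operand instead of materializing it ...
  }
  // Splat constants can be handled once SplatVal is extracted.
  APInt SplatVal;
  if (X86::isConstantSplat(Op.getOperand(0), SplatVal, /*AllowPartialUndefs=*/false)) {
    // All defined elements are the same constant; SplatVal holds it.
  }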
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
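The Decode*Mask helpers translate an instruction immediate or constant-pool mask into the generic index-based shuffle-mask form used by the DAG. A hedged sketch for the BLEND immediate decoder; the element count and immediate are made up, and the expected output assumes bit i of the immediate selects the second source for lane i:

  SmallVector<int, 8> Mask;
  DecodeBLENDMask(/*NumElts=*/8, /*Imm=*/0xB1, Mask);
  // Under that convention: {8, 1, 2, 3, 12, 13, 6, 15}.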
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
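Several of the bit-math helpers pair naturally, e.g. a power-of-two test followed by the matching log2. A tiny illustrative fragment; Amt is an assumed unsigned multiplier:

  // Turn a multiply by a power of two into a shift.
  if (llvm::isPowerOf2_32(Amt)) {
    unsigned ShAmt = llvm::Log2_32(Amt);
    // ... emit a left shift by ShAmt instead of the multiply ...
  }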
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
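A small, hedged example of the isa<>/cast<> idiom on an IR value (illustrative only; the helper is not part of this file):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Value.h"
  #include <cstdint>

  // Returns the constant's zero-extended value, or 0 if V is not a ConstantInt.
  uint64_t getConstOrZero(const llvm::Value *V) {
    if (llvm::isa<llvm::ConstantInt>(V))                        // type test only
      return llvm::cast<llvm::ConstantInt>(V)->getZExtValue();  // checked cast
    return 0;
  }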
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
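A brief sketch combining the range helpers listed above (count_if, find_if, is_contained); the mask convention (-1 as undef) follows common LLVM usage, but the helper itself is made up:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  bool hasDefinedZeroElement(llvm::ArrayRef<int> Mask) {
    // Count undef (-1) sentinels and find the first defined entry.
    auto NumUndef = llvm::count_if(Mask, [](int M) { return M < 0; });
    auto FirstDef = llvm::find_if(Mask, [](int M) { return M >= 0; });
    (void)NumUndef;
    // is_contained answers simple membership queries over the whole range.
    return FirstDef != Mask.end() && llvm::is_contained(Mask, 0);
  }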
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
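A hedged sketch of how such a decode helper is typically driven; the immediate 0x1B is just an example, and the include assumes the X86ShuffleDecode.h header under lib/Target/X86/MCTargetDesc:

  #include "llvm/ADT/SmallVector.h"
  #include "X86ShuffleDecode.h" // lib/Target/X86/MCTargetDesc (assumed path)

  void decodePshufdExample() {
    llvm::SmallVector<int, 4> Mask;
    // pshufd $0x1B reverses the four 32-bit lanes, yielding the mask 3,2,1,0.
    llvm::DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
  }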
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
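A hedged sketch of how the SDValue constant predicates above are commonly combined in DAG combines; the function is illustrative, not this file's code, and the declarations are assumed to come from SelectionDAGNodes.h:

  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // True if V (looking through a one-use bitcast) is the constant one,
  // an all-ones constant, or a (possibly undef-containing) constant splat.
  bool isInterestingConstant(llvm::SDValue V) {
    V = llvm::peekThroughOneUseBitcasts(V);
    if (llvm::isOneConstant(V) || llvm::isAllOnesConstant(V))
      return true;
    return llvm::isConstOrConstSplat(V, /*AllowUndefs=*/true) != nullptr;
  }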
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
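A small sketch (made-up mask values) of the shuffle-mask scaling helper and getSplatIndex listed above, assuming their declarations in llvm/Analysis/VectorUtils.h:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"

  int widenAndFindSplat() {
    // A 4-element mask that repeats elements 2,3; widening to 2 gives {1, 1}.
    int Narrow[] = {2, 3, 2, 3};
    llvm::SmallVector<int, 8> Scaled;
    if (!llvm::scaleShuffleMaskElts(/*NumDstElts=*/2, Narrow, Scaled))
      return -1;
    return llvm::getSplatIndex(Scaled); // 1: every defined element is 1
  }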
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
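A minimal example of converting between the float semantics listed above (purely illustrative):

  #include "llvm/ADT/APFloat.h"

  bool roundTripsToHalfExactly(float F) {
    llvm::APFloat Val(F); // starts out in IEEEsingle semantics
    bool LosesInfo = false;
    // Convert to IEEE half, rounding to nearest-even.
    llvm::APFloat::opStatus St = Val.convert(
        llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven,
        &LosesInfo);
    (void)St;
    return !LosesInfo;
  }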
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
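A small sketch of the Align type together with the commonAlignment helper listed earlier (values are illustrative):

  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  uint64_t alignmentAfterOffset() {
    llvm::Align Base(16); // must be a non-zero power of two
    // A 16-byte aligned base plus an offset of 4 is only 4-byte aligned.
    llvm::Align Adjusted = llvm::commonAlignment(Base, 4);
    return Adjusted.value(); // 4
  }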
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
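A hedged sketch of the EVT queries summarized above; it only needs an LLVMContext, and the chosen types are arbitrary:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"

  void evtDemo(llvm::LLVMContext &Ctx) {
    // Build a v4f32 and inspect it.
    llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
    bool Is128 = VT.is128BitVector();             // true: 4 x 32 bits
    unsigned NumElts = VT.getVectorNumElements(); // 4
    // Same shape with integer elements: v4i32.
    llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();
    (void)Is128; (void)NumElts; (void)IntVT;
  }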
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
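A compact, hedged illustration of the KnownBits interface summarized above (not this file's code):

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"

  unsigned knownBitsDemo() {
    // Two fully-known 8-bit values: 12 and 4.
    llvm::KnownBits A = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
    llvm::KnownBits B = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
    // Their sum (16 == 0b10000) is also fully known.
    llvm::KnownBits Sum = llvm::KnownBits::add(A, B);
    return Sum.countMinTrailingZeros(); // 4
  }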
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
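A hedged sketch of how these MachinePointerInfo factories are typically used to describe a fixed stack slot (the helper itself is made up):

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"

  llvm::MachinePointerInfo stackSlotInfo(llvm::MachineFunction &MF, int FI) {
    // Describe the frame-index slot, then refer to its second dword.
    llvm::MachinePointerInfo PtrInfo =
        llvm::MachinePointerInfo::getFixedStack(MF, FI);
    return PtrInfo.getWithOffset(4);
  }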
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
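A hedged sketch of the usual CallLoweringInfo setup for a libcall-style lowering; DAG, dl, Chain, Callee, RetTy and Args are assumed to be in scope inside a TargetLowering subclass:

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
  // CallResult.first is the call's return value, .second the output chain.
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);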
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
X86AddressMode - This struct holds a generalized full x86 address mode.