1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
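 // In effect, addBypassSlowDiv(32, 8) asks CodeGenPrepare to guard each 32-bit
 // division/remainder with a runtime check and use the much cheaper 8-bit DIV
 // when both operands fit in 8 bits; addBypassSlowDiv(64, 32) does the same
 // for 64-bit divides on CPUs with a slow 64-bit divider.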
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
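 // Expand for a truncating store means the legalizer rewrites it as an
 // explicit truncate (or FP_ROUND for f64->f32) followed by an ordinary store
 // of the narrower type.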
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
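 // On x86 a single DIV/IDIV already yields both results (quotient in
 // AL/AX/EAX/RAX, remainder in AH/DX/EDX/RDX), so exposing the two-result
 // SDIVREM/UDIVREM and SMUL_LOHI/UMUL_LOHI forms lets x/y and x%y share one
 // instruction.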
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
443 setOperationAction(
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
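 // Expand for these half conversions turns each one into a runtime library
 // call; with F16C the f32 case is custom-lowered to VCVTPH2PS/VCVTPS2PH
 // instead.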
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
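 // The (elided) Custom entries in these loops route SELECT/SETCC through X86
 // lowering, which builds X86ISD::CMOV / X86ISD::SETCC nodes driven by EFLAGS
 // rather than using the generic expansion.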
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
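 // Custom lowering of these RMW atomics mainly helps when the loaded result
 // is unused: they can then be emitted as a single LOCK ADD/SUB/AND/OR/XOR
 // (or as LOCK XADD when the old value of an add/sub is needed) instead of a
 // CMPXCHG loop.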
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
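 // Helper used below: applies one legalize action to the common set of FP
 // operations for a half-precision type. The scalar f16 type gets Promote
 // (the work is done in f32), while the v8f16/v16f16/v32f16 vector types get
 // Expand.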
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
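 // The Custom FABS/FNEG lowering loads a sign-bit mask from the constant pool
 // and applies ANDPS/ANDPD or XORPS/XORPD; FCOPYSIGN combines both masks.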
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
825 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
832 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types; we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
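 // Baseline SSE2 only provides PMAXSW/PMINSW (signed i16) and PMAXUB/PMINUB
 // (unsigned i8), which is why only those combinations are Legal; the other
 // min/max forms are custom-lowered as compare+select until SSE4.1 fills in
 // the missing instructions.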
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
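 // Plain AVX has no 256-bit integer ALU instructions, so when AVX2
 // (HasInt256) is unavailable the Custom handlers split these 256-bit integer
 // ops into two 128-bit halves; the same Legal-vs-Custom pattern repeats for
 // the mul, min/max and saturation ops below.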
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
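 // With F16C but no AVX512-FP16, f16 arithmetic is never done natively: the
 // promoted operations convert up with VCVTPH2PS, compute in f32, and convert
 // back with VCVTPS2PH.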
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
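 // KMOVB (byte-sized moves of mask registers) is an AVX512DQ instruction, so
 // without DQI these small mask loads/stores are custom-lowered, in practice
 // going through a 16-bit KMOVW or a GPR move instead.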
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
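 // AVX512DQ (with VL for the 128/256-bit forms) adds direct FP to 64-bit
 // integer vector converts such as VCVTPS2QQ/VCVTPD2QQ, which is what lets
 // lrint/llrint be Legal on these types.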
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194   // This block controls legalization of v32i1/v64i1, which are available with
2195   // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419     // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420     // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2421     // their operation action to Custom and do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
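  // For example (illustrative): when the sincos_stret libcalls checked above
  // are available, separate calls to sinf(x) and cosf(x) on the same value can
  // be combined into a single call that returns both results.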
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598   // is. We should promote the value to 64 bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622          // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
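  // For example (illustrative): a call to ldexpf(x, n) on these targets is
  // promoted, extending the f32 argument to f64, calling the f64 ldexp
  // libcall, and truncating the result back to f32.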
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711   // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into 32 bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931   // For the kernel code model we know that all objects reside in the negative
2932   // half of the 32-bit address space, so we must not accept negative offsets
2933   // (they could fall outside that range), but pretty large positive ones are OK.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937   // For other non-large code models we assume that the last small object ends
2938   // at least 16MB before the end of the 31-bit address boundary. We may also
2939   // accept pretty large negative offsets, knowing that all objects are in the
2940   // positive half of the address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
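// Worked example (illustrative): with a symbolic displacement under the small
// code model, an offset of 8 MiB is accepted (below the 16MB limit above),
// while 32 MiB is rejected; under the kernel code model an offset of -8 is
// rejected because objects live in the negative half of the address space.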
2943
2944 /// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
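// For example (illustrative): ISD::SETOLT is first handled by swapping LHS and
// RHS above and then mapped to X86::COND_A, so "x <olt y" is tested as
// "y >ogt x" using the unsigned-above flag pattern (CF == 0 and ZF == 0) from
// the table above.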
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// Current x86 isa includes the following FP cmov instructions:
3067 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258   // relocation targets a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264   // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265   // of those uses are extracted directly into stores, so each extract + store
3266   // can be store-folded, or (4) some use is a legal full-width instruction,
3267   // then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume its legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301     // If we have a user that uses the full vector width, then splitting this
3302     // load is only worthwhile if the offset isn't 0 (to avoid an
3303     // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
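// For example (illustrative): a 256-bit load whose only uses are two
// EXTRACT_SUBVECTORs that each feed a store is kept at full width, since each
// extract can be folded into its store (e.g. a vextractf128 to memory) rather
// than splitting the load into two narrower loads first.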
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349   // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
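// Worked example (illustrative): when the multiply itself isn't already
// considered fast above, a splat constant of 9 qualifies because MulC - 1 == 8
// is a power of two (lowering to (x << 3) + x), and 7 qualifies via
// MulC + 1 == 8 ((x << 3) - x), while 10 matches none of the patterns and is
// kept as a multiply.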
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380   // extract half of the vector.
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433   // shrink long double fp constants since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462   // Do not merge stores up to float/vector value size (128 bits) if the
3463   // NoImplicitFloat attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529   // Does the baseline recommend not performing the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536   // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540   // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555     // For vectors, if we have rotate instruction support, then it's definitely
3556     // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560     // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576     // For vectors we don't really get much benefit from swapping constants around.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582     // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584       // If the current setup has an imm64 mask, then the inverse will have
3585       // at least an imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590       // We only benefit if the mask requires at least 7 bits. We don't
3591       // want to replace shl by 1, 2, or 3 as those can be implemented
3592       // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605   // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
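// For example (illustrative): for a scalar i32 value shifted by 24, the
// remaining mask is 8 bits, which is a natural zero-extension size, so rotate
// is not preferred and an SRL plus movzbl-style sequence can be used; with
// BMI2, rotate is preferred instead so the pattern can use rorx.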
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
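// For example (illustrative): for a branch on (a == b && a == c), the two
// integer equality compares share the operand 'a', so the pattern above adds
// 1 to the base threshold (and BrMergingCcmpBias on CCMP-capable targets),
// making it more likely that both tests are merged into a single branch.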
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
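// For example (illustrative): a 16-byte equality test such as
// memcmp(a, b, 16) == 0 can be lowered with two 128-bit loads, a pcmpeqb, a
// pmovmskb, and a compare of the resulting mask against 0xFFFF, instead of a
// chain of scalar compares.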
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750/// Return true if Val falls within the specified range (L, H].
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756 /// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767 /// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773 /// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780 /// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786 /// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask, is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851 /// a zeroed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859     // If both elements are undef, it's trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
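// Worked example (illustrative): the v4i32 mask <0, 1, 6, 7> widens to the
// v2i64 mask <0, 3>, since each pair is adjacent and starts on an even
// element; the mask <1, 2, 4, 5> cannot be widened because its first pair
// straddles an element boundary.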
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919  SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
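// e.g. <0, 1, 6, 7> widens to <0, 3>, and <0, 3> narrows back to <0, 1, 6, 7>.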
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This is different to scaleShuffleElements which is a same size type.
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mas");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
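  // Indices into the first operand are unchanged; indices into the second
  // operand are rebased so they point at the start of the grown second operand.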
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978  return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in the 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985                              const SDLoc &dl, bool IsMask = false) {
3986
3987  SmallVector<SDValue, 32> Ops;
3988  bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018  SmallVector<SDValue, 32> Ops;
4019  bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051                              SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071             TLI.isTypeLegal(VT.getVectorElementType())) {
4072    Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are both subvectors extracted from a
4085// single source. If we allow commute they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148  assert((Vec.getValueType().is256BitVector() ||
4149          Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166  // Inserting an UNDEF subvector is a no-op; just return Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4199  assert(VecVT.getFixedSizeInBits() <= VT.getFixedSizeInBits() &&
4200         VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
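  // Without DQI the narrowest legal mask operations are 16-bit, so v16i1 is the
  // minimum; with DQI, 8-bit mask instructions make v8i1 usable.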
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254                             SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319    SmallVector<SDValue> SrcOps;
4320    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338                                     SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4371  SmallVector<SDValue, 4> SubOps;
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395  unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4399  SmallVector<SDValue, 2> LoOps(NumOps, SDValue());
4400  SmallVector<SDValue, 2> HiOps(NumOps, SDValue());
4401  for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break an unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420                                   const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436                                    const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
4452template <typename F>
4453SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true,
4456 bool AllowAVX512 = true) {
4457 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4458 unsigned NumSubs = 1;
4459 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4460 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4461 if (VT.getSizeInBits() > 512) {
4462 NumSubs = VT.getSizeInBits() / 512;
4463 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4464 }
4465 } else if (Subtarget.hasAVX2()) {
4466 if (VT.getSizeInBits() > 256) {
4467 NumSubs = VT.getSizeInBits() / 256;
4468 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4469 }
4470 } else {
4471 if (VT.getSizeInBits() > 128) {
4472 NumSubs = VT.getSizeInBits() / 128;
4473 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4474 }
4475 }
4476
4477 if (NumSubs == 1)
4478 return Builder(DAG, DL, Ops);
4480  SmallVector<SDValue, 4> Subs;
4481 for (unsigned i = 0; i != NumSubs; ++i) {
4482    SmallVector<SDValue, 2> SubOps;
4483    for (SDValue Op : Ops) {
4484 EVT OpVT = Op.getValueType();
4485 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4486 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4487 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4488 }
4489 Subs.push_back(Builder(DAG, DL, SubOps));
4490 }
4491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4492}
4493
4494// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4495// targets.
4496static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4497                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4498                             const X86Subtarget &Subtarget) {
4499 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4500 MVT SVT = VT.getScalarType();
4501
4502 // If we have a 32/64 splatted constant, splat it to DstTy to
4503 // encourage a foldable broadcast'd operand.
4504 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4505 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4506 // AVX512 broadcasts 32/64-bit operands.
4507 // TODO: Support float once getAVX512Node is used by fp-ops.
4508 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509        !DAG.getTargetLoweringInfo().isTypeLegal(OpVT))
4510      return SDValue();
4511 // If we're not widening, don't bother if we're not bitcasting.
4512 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4513 return SDValue();
4514    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4515      APInt SplatValue, SplatUndef;
4516 unsigned SplatBitSize;
4517 bool HasAnyUndefs;
4518 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4519 HasAnyUndefs, OpEltSizeInBits) &&
4520 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4521 return DAG.getConstant(SplatValue, DL, DstVT);
4522 }
4523 return SDValue();
4524 };
4525
4526 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4527
4528 MVT DstVT = VT;
4529 if (Widen)
4530 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4531
4532 // Canonicalize src operands.
4533 SmallVector<SDValue> SrcOps(Ops);
4534 for (SDValue &Op : SrcOps) {
4535 MVT OpVT = Op.getSimpleValueType();
4536 // Just pass through scalar operands.
4537 if (!OpVT.isVector())
4538 continue;
4539 assert(OpVT == VT && "Vector type mismatch");
4540
4541 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4542 Op = BroadcastOp;
4543 continue;
4544 }
4545
4546 // Just widen the subvector by inserting into an undef wide vector.
4547 if (Widen)
4548 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4549 }
4550
4551 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4552
4553 // Perform the 512-bit op then extract the bottom subvector.
4554 if (Widen)
4555 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4556 return Res;
4557}
4558
4559/// Insert i1-subvector to i1-vector.
4560static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4561                                const X86Subtarget &Subtarget) {
4562
4563 SDLoc dl(Op);
4564 SDValue Vec = Op.getOperand(0);
4565 SDValue SubVec = Op.getOperand(1);
4566 SDValue Idx = Op.getOperand(2);
4567 unsigned IdxVal = Op.getConstantOperandVal(2);
4568
4569 // Inserting undef is a nop. We can just return the original vector.
4570 if (SubVec.isUndef())
4571 return Vec;
4572
4573 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4574 return Op;
4575
4576 MVT OpVT = Op.getSimpleValueType();
4577 unsigned NumElems = OpVT.getVectorNumElements();
4578 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4579
4580 // Extend to natively supported kshift.
4581 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4582
4583 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4584 // if necessary.
4585 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4586 // May need to promote to a legal type.
4587 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4588 DAG.getConstant(0, dl, WideOpVT),
4589 SubVec, Idx);
4590 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4591 }
4592
4593 MVT SubVecVT = SubVec.getSimpleValueType();
4594 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4595 assert(IdxVal + SubVecNumElems <= NumElems &&
4596 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4597 "Unexpected index value in INSERT_SUBVECTOR");
4598
4599 SDValue Undef = DAG.getUNDEF(WideOpVT);
4600
4601 if (IdxVal == 0) {
4602 // Zero lower bits of the Vec
4603 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4605 ZeroIdx);
4606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4607 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4608 // Merge them together, SubVec should be zero extended.
4609 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4610 DAG.getConstant(0, dl, WideOpVT),
4611 SubVec, ZeroIdx);
4612 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4614 }
4615
4616 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4617 Undef, SubVec, ZeroIdx);
4618
4619 if (Vec.isUndef()) {
4620 assert(IdxVal != 0 && "Unexpected index");
4621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4624 }
4625
4626  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4627 assert(IdxVal != 0 && "Unexpected index");
4628 // If upper elements of Vec are known undef, then just shift into place.
4629 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4630 [](SDValue V) { return V.isUndef(); })) {
4631 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4633 } else {
4634 NumElems = WideOpVT.getVectorNumElements();
4635 unsigned ShiftLeft = NumElems - SubVecNumElems;
4636 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4639 if (ShiftRight != 0)
4640 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4642 }
4643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4644 }
4645
4646 // Simple case when we put subvector in the upper part
4647 if (IdxVal + SubVecNumElems == NumElems) {
4648 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4649 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4650 if (SubVecNumElems * 2 == NumElems) {
4651 // Special case, use legal zero extending insert_subvector. This allows
4652 // isel to optimize when bits are known zero.
4653 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4654 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4655 DAG.getConstant(0, dl, WideOpVT),
4656 Vec, ZeroIdx);
4657 } else {
4658 // Otherwise use explicit shifts to zero the bits.
4659 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4660 Undef, Vec, ZeroIdx);
4661 NumElems = WideOpVT.getVectorNumElements();
4662 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4663 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4664 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4665 }
4666 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4668 }
4669
4670 // Inserting into the middle is more complicated.
4671
4672 NumElems = WideOpVT.getVectorNumElements();
4673
4674 // Widen the vector if needed.
4675 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4676
4677 unsigned ShiftLeft = NumElems - SubVecNumElems;
4678 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4679
4680 // Do an optimization for the most frequently used types.
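  // Clear the insertion slot in Vec with an integer AND mask, shift SubVec into
  // place (left then right, zeroing the surrounding bits), and OR the results.
  // Skipped for v64i1 without 64-bit support, where the i64 mask constant
  // itself would be awkward to materialize.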
4681 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4682 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4683 Mask0.flipAllBits();
4684 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4685 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4686 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4687 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4688 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4689 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4690 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4691 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4692
4693 // Reduce to original width if needed.
4694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4695 }
4696
4697 // Clear the upper bits of the subvector and move it to its insert position.
4698 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4699 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4700 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4701 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4702
4703 // Isolate the bits below the insertion point.
4704 unsigned LowShift = NumElems - IdxVal;
4705 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4706 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4707 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4708 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4709
4710 // Isolate the bits after the last inserted bit.
4711 unsigned HighShift = IdxVal + SubVecNumElems;
4712 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4713 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4714 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4715 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4716
4717 // Now OR all 3 pieces together.
4718 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4719 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4720
4721 // Reduce to original width if needed.
4722 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4723}
4724
4725static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4726                                const SDLoc &dl) {
4727 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4728 EVT SubVT = V1.getValueType();
4729 EVT SubSVT = SubVT.getScalarType();
4730 unsigned SubNumElts = SubVT.getVectorNumElements();
4731 unsigned SubVectorWidth = SubVT.getSizeInBits();
4732 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4733 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4734 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4735}
4736
4737/// Returns a vector of specified type with all bits set.
4738/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4739/// Then bitcast to their original type, ensuring they get CSE'd.
4740static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4741 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4742 "Expected a 128/256/512-bit vector type");
4743 unsigned NumElts = VT.getSizeInBits() / 32;
4744 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4745 return DAG.getBitcast(VT, Vec);
4746}
4747
4748static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4749 SDValue In, SelectionDAG &DAG) {
4750 EVT InVT = In.getValueType();
4751 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4752
4753 // Canonicalize Opcode to general extension version.
4754 switch (Opcode) {
4755 case ISD::ANY_EXTEND:
4756  case ISD::ANY_EXTEND_VECTOR_INREG:
4757    Opcode = ISD::ANY_EXTEND;
4758 break;
4759 case ISD::SIGN_EXTEND:
4760  case ISD::SIGN_EXTEND_VECTOR_INREG:
4761    Opcode = ISD::SIGN_EXTEND;
4762 break;
4763 case ISD::ZERO_EXTEND:
4764  case ISD::ZERO_EXTEND_VECTOR_INREG:
4765    Opcode = ISD::ZERO_EXTEND;
4766 break;
4767 default:
4768 llvm_unreachable("Unknown extension opcode");
4769 }
4770
4771 // For 256-bit vectors, we only need the lower (128-bit) input half.
4772 // For 512-bit vectors, we only need the lower input half or quarter.
4773 if (InVT.getSizeInBits() > 128) {
4774 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4775 "Expected VTs to be the same size!");
4776 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4777 In = extractSubVector(In, 0, DAG, DL,
4778 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4779 InVT = In.getValueType();
4780 }
4781
4782 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4783 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4784
4785 return DAG.getNode(Opcode, DL, VT, In);
4786}
4787
4788// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4789static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4790                            SDValue Mask, SelectionDAG &DAG) {
4791 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4792 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4793 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4794}
4795
4796static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4797                                    bool Lo, bool Unary) {
4798 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4799 "Illegal vector type to unpack");
4800 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4801 int NumElts = VT.getVectorNumElements();
4802 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
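  // e.g. v8i16 with Lo && !Unary produces <0, 8, 1, 9, 2, 10, 3, 11>.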
4803 for (int i = 0; i < NumElts; ++i) {
4804 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4805 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4806 Pos += (Unary ? 0 : NumElts * (i % 2));
4807 Pos += (Lo ? 0 : NumEltsInLane / 2);
4808 Mask.push_back(Pos);
4809 }
4810}
4811
4812/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4813/// imposed by AVX and specific to the unary pattern. Example:
4814/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4815/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4816static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4817                                    bool Lo) {
4818 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4819 int NumElts = VT.getVectorNumElements();
4820 for (int i = 0; i < NumElts; ++i) {
4821 int Pos = i / 2;
4822 Pos += (Lo ? 0 : NumElts / 2);
4823 Mask.push_back(Pos);
4824 }
4825}
4826
4827// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4828static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4829 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4830  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4831      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4832    SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4833 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4834 int M = Mask[I];
4835 if (M < 0)
4836 continue;
4837 SDValue V = (M < NumElts) ? V1 : V2;
4838 if (V.isUndef())
4839 continue;
4840 Ops[I] = V.getOperand(M % NumElts);
4841 }
4842 return DAG.getBuildVector(VT, dl, Ops);
4843 }
4844
4845 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4846}
4847
4848/// Returns a vector_shuffle node for an unpackl operation.
4849static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4850 SDValue V1, SDValue V2) {
4851  SmallVector<int, 8> Mask;
4852  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4853 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4854}
4855
4856/// Returns a vector_shuffle node for an unpackh operation.
4857static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4858 SDValue V1, SDValue V2) {
4859  SmallVector<int, 8> Mask;
4860  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4861 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4862}
4863
4864/// Returns a node that packs the LHS + RHS nodes together at half width.
4865/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4866/// TODO: Add subvector splitting if/when we have a need for it.
4867static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4868 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4869 bool PackHiHalf = false) {
4870 MVT OpVT = LHS.getSimpleValueType();
4871 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4872 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4873 assert(OpVT == RHS.getSimpleValueType() &&
4874 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4875 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4876 "Unexpected PACK operand types");
4877 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4878 "Unexpected PACK result type");
4879
4880 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4881 if (EltSizeInBits == 32) {
4882 SmallVector<int> PackMask;
4883 int Offset = PackHiHalf ? 1 : 0;
4884 int NumElts = VT.getVectorNumElements();
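    // e.g. a v4i32 result uses <0, 2, 4, 6> (lo halves) or <1, 3, 5, 7> (hi halves).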
4885 for (int I = 0; I != NumElts; I += 4) {
4886 PackMask.push_back(I + Offset);
4887 PackMask.push_back(I + Offset + 2);
4888 PackMask.push_back(I + Offset + NumElts);
4889 PackMask.push_back(I + Offset + NumElts + 2);
4890 }
4891 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4892 DAG.getBitcast(VT, RHS), PackMask);
4893 }
4894
4895 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4896 if (!PackHiHalf) {
4897 if (UsePackUS &&
4898 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4899 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4901
4902 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4903 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4904 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4905 }
4906
4907 // Fallback to sign/zero extending the requested half and pack.
4908 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4909 if (UsePackUS) {
4910 if (PackHiHalf) {
4911 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4912 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4913 } else {
4914 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4915 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4916 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4917 };
4918 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4919 };
4920
4921 if (!PackHiHalf) {
4922 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4924 }
4925 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4927 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4928}
4929
4930/// Return a vector_shuffle of the specified vector of zero or undef vector.
4931/// This produces a shuffle where the low element of V2 is swizzled into the
4932/// zero/undef vector, landing at element Idx.
4933/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4934static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4935                                           bool IsZero,
4936 const X86Subtarget &Subtarget,
4937 SelectionDAG &DAG) {
4938 MVT VT = V2.getSimpleValueType();
4939 SDValue V1 = IsZero
4940 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4941 int NumElems = VT.getVectorNumElements();
4942 SmallVector<int, 16> MaskVec(NumElems);
4943 for (int i = 0; i != NumElems; ++i)
4944 // If this is the insertion idx, put the low elt of V2 here.
4945 MaskVec[i] = (i == Idx) ? NumElems : i;
4946 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4947}
4948
4949static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4950  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4951 Ptr.getOpcode() == X86ISD::WrapperRIP)
4952 Ptr = Ptr.getOperand(0);
4953  return dyn_cast<ConstantPoolSDNode>(Ptr);
4954}
4955
4956// TODO: Add support for non-zero offsets.
4957static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4958  ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4959  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4960 return nullptr;
4961 return CNode->getConstVal();
4962}
4963
4964static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4965  if (!Load || !ISD::isNormalLoad(Load))
4966 return nullptr;
4967 return getTargetConstantFromBasePtr(Load->getBasePtr());
4968}
4969
4969
4970static const Constant *getTargetConstantFromNode(SDValue Op) {
4971  Op = peekThroughBitcasts(Op);
4972  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4973}
4974
4975const Constant *
4977 assert(LD && "Unexpected null LoadSDNode");
4978 return getTargetConstantFromNode(LD);
4979}
4980
4982  // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X)
4983 SDValue Cond = N->getOperand(0);
4984 SDValue RHS = N->getOperand(2);
4985 EVT CondVT = Cond.getValueType();
4986 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4987 CondVT.getVectorElementType() == MVT::i1 &&
4988 ISD::isBuildVectorAllZeros(RHS.getNode());
4989}
4990
4991// Extract raw constant bits from constant pools.
4992static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4993 APInt &UndefElts,
4994 SmallVectorImpl<APInt> &EltBits,
4995 bool AllowWholeUndefs = true,
4996 bool AllowPartialUndefs = false) {
4997 assert(EltBits.empty() && "Expected an empty EltBits vector");
4998
4999  Op = peekThroughBitcasts(Op);
5000
5001 EVT VT = Op.getValueType();
5002 unsigned SizeInBits = VT.getSizeInBits();
5003 unsigned NumElts = SizeInBits / EltSizeInBits;
5004
5005 // Can't split constant.
5006 if ((SizeInBits % EltSizeInBits) != 0)
5007 return false;
5008
5009 // Bitcast a source array of element bits to the target size.
5010 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5011 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5012 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5013 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5014 "Constant bit sizes don't match");
5015
5016 // Don't split if we don't allow undef bits.
5017 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5018 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5019 return false;
5020
5021 // If we're already the right size, don't bother bitcasting.
5022 if (NumSrcElts == NumElts) {
5023 UndefElts = UndefSrcElts;
5024 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5025 return true;
5026 }
5027
5028 // Extract all the undef/constant element data and pack into single bitsets.
5029 APInt UndefBits(SizeInBits, 0);
5030 APInt MaskBits(SizeInBits, 0);
5031
5032 for (unsigned i = 0; i != NumSrcElts; ++i) {
5033 unsigned BitOffset = i * SrcEltSizeInBits;
5034 if (UndefSrcElts[i])
5035 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5036 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5037 }
5038
5039 // Split the undef/constant single bitset data into the target elements.
5040 UndefElts = APInt(NumElts, 0);
5041 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5042
5043 for (unsigned i = 0; i != NumElts; ++i) {
5044 unsigned BitOffset = i * EltSizeInBits;
5045 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5046
5047 // Only treat an element as UNDEF if all bits are UNDEF.
5048 if (UndefEltBits.isAllOnes()) {
5049 if (!AllowWholeUndefs)
5050 return false;
5051 UndefElts.setBit(i);
5052 continue;
5053 }
5054
5055 // If only some bits are UNDEF then treat them as zero (or bail if not
5056 // supported).
5057 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5058 return false;
5059
5060 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5061 }
5062 return true;
5063 };
5064
5065 // Collect constant bits and insert into mask/undef bit masks.
5066 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5067 unsigned UndefBitIndex) {
5068 if (!Cst)
5069 return false;
5070 if (isa<UndefValue>(Cst)) {
5071 Undefs.setBit(UndefBitIndex);
5072 return true;
5073 }
5074 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5075 Mask = CInt->getValue();
5076 return true;
5077 }
5078 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5079 Mask = CFP->getValueAPF().bitcastToAPInt();
5080 return true;
5081 }
5082 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5083 Type *Ty = CDS->getType();
5084 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5085 Type *EltTy = CDS->getElementType();
5086 bool IsInteger = EltTy->isIntegerTy();
5087 bool IsFP =
5088 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5089 if (!IsInteger && !IsFP)
5090 return false;
5091 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5092 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5093 if (IsInteger)
5094 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5095 else
5096 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5097 I * EltBits);
5098 return true;
5099 }
5100 return false;
5101 };
5102
5103 // Handle UNDEFs.
5104 if (Op.isUndef()) {
5105 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5106 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5107 return CastBitData(UndefSrcElts, SrcEltBits);
5108 }
5109
5110 // Extract scalar constant bits.
5111 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5112 APInt UndefSrcElts = APInt::getZero(1);
5113 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5114 return CastBitData(UndefSrcElts, SrcEltBits);
5115 }
5116 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5117 APInt UndefSrcElts = APInt::getZero(1);
5118 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5119 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122
5123 // Extract constant bits from build vector.
5124 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5125 BitVector Undefs;
5126 SmallVector<APInt> SrcEltBits;
5127 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5128 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5129 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5130 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5131 if (Undefs[I])
5132 UndefSrcElts.setBit(I);
5133 return CastBitData(UndefSrcElts, SrcEltBits);
5134 }
5135 }
5136
5137 // Extract constant bits from constant pool vector.
5138 if (auto *Cst = getTargetConstantFromNode(Op)) {
5139 Type *CstTy = Cst->getType();
5140 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5141 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5142 return false;
5143
5144 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5145 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5146 if ((SizeInBits % SrcEltSizeInBits) != 0)
5147 return false;
5148
5149 APInt UndefSrcElts(NumSrcElts, 0);
5150 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5151 for (unsigned i = 0; i != NumSrcElts; ++i)
5152 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5153 UndefSrcElts, i))
5154 return false;
5155
5156 return CastBitData(UndefSrcElts, SrcEltBits);
5157 }
5158
5159 // Extract constant bits from a broadcasted constant pool scalar.
5160 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5161 EltSizeInBits <= VT.getScalarSizeInBits()) {
5162 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5163 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5164 return false;
5165
5166 SDValue Ptr = MemIntr->getBasePtr();
5167    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5168      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5169 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5170
5171 APInt UndefSrcElts(NumSrcElts, 0);
5172 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5173 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5174 if (UndefSrcElts[0])
5175 UndefSrcElts.setBits(0, NumSrcElts);
5176 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5177 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5178 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5179 return CastBitData(UndefSrcElts, SrcEltBits);
5180 }
5181 }
5182 }
5183
5184 // Extract constant bits from a subvector broadcast.
5185 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5186 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5187 SDValue Ptr = MemIntr->getBasePtr();
5188 // The source constant may be larger than the subvector broadcast,
5189 // ensure we extract the correct subvector constants.
5190 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5191 Type *CstTy = Cst->getType();
5192 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5193 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5194 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5195 (SizeInBits % SubVecSizeInBits) != 0)
5196 return false;
5197 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5198 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5199 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5200 APInt UndefSubElts(NumSubElts, 0);
5201 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5202 APInt(CstEltSizeInBits, 0));
5203 for (unsigned i = 0; i != NumSubElts; ++i) {
5204 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5205 UndefSubElts, i))
5206 return false;
5207 for (unsigned j = 1; j != NumSubVecs; ++j)
5208 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5209 }
5210 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5211 UndefSubElts);
5212 return CastBitData(UndefSubElts, SubEltBits);
5213 }
5214 }
5215
5216 // Extract a rematerialized scalar constant insertion.
5217 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5218 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5219 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5220 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5222
5223 APInt UndefSrcElts(NumSrcElts, 0);
5224 SmallVector<APInt, 64> SrcEltBits;
5225 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5226 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5227 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5228 return CastBitData(UndefSrcElts, SrcEltBits);
5229 }
5230
5231 // Insert constant bits from a base and sub vector sources.
5232 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5233    // If we bitcast to larger elements we might lose track of undefs, so to be
5234    // safe don't allow any.
5235 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5236 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5237
5238 APInt UndefSrcElts, UndefSubElts;
5239 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5240 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5241 UndefSubElts, EltSubBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs) &&
5244 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5245 UndefSrcElts, EltSrcBits,
5246 AllowWholeUndefs && AllowUndefs,
5247 AllowPartialUndefs && AllowUndefs)) {
5248 unsigned BaseIdx = Op.getConstantOperandVal(2);
5249 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5250 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5251 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5252 return CastBitData(UndefSrcElts, EltSrcBits);
5253 }
5254 }
5255
5256 // Extract constant bits from a subvector's source.
5257 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5258 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5259 EltBits, AllowWholeUndefs,
5260 AllowPartialUndefs)) {
5261 EVT SrcVT = Op.getOperand(0).getValueType();
5262 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5263 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5264 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5265 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5266 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5268 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5269
5270 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5271 if ((BaseIdx + NumSubElts) != NumSrcElts)
5272 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5273 if (BaseIdx != 0)
5274 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5275 return true;
5276 }
5277
5278 // Extract constant bits from shuffle node sources.
5279 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5280 // TODO - support shuffle through bitcasts.
5281 if (EltSizeInBits != VT.getScalarSizeInBits())
5282 return false;
5283
5284 ArrayRef<int> Mask = SVN->getMask();
5285 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5286 llvm::any_of(Mask, [](int M) { return M < 0; }))
5287 return false;
5288
5289 APInt UndefElts0, UndefElts1;
5290 SmallVector<APInt, 32> EltBits0, EltBits1;
5291 if (isAnyInRange(Mask, 0, NumElts) &&
5292 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5293 UndefElts0, EltBits0, AllowWholeUndefs,
5294 AllowPartialUndefs))
5295 return false;
5296 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5297 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5298 UndefElts1, EltBits1, AllowWholeUndefs,
5299 AllowPartialUndefs))
5300 return false;
5301
5302 UndefElts = APInt::getZero(NumElts);
5303 for (int i = 0; i != (int)NumElts; ++i) {
5304 int M = Mask[i];
5305 if (M < 0) {
5306 UndefElts.setBit(i);
5307 EltBits.push_back(APInt::getZero(EltSizeInBits));
5308 } else if (M < (int)NumElts) {
5309 if (UndefElts0[M])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits0[M]);
5312 } else {
5313 if (UndefElts1[M - NumElts])
5314 UndefElts.setBit(i);
5315 EltBits.push_back(EltBits1[M - NumElts]);
5316 }
5317 }
5318 return true;
5319 }
5320
5321 return false;
5322}
5323
5324namespace llvm {
5325namespace X86 {
5326bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5327 APInt UndefElts;
5328 SmallVector<APInt, 16> EltBits;
5329  if (getTargetConstantBitsFromNode(
5330          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5331 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5332 int SplatIndex = -1;
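    // Every defined (non-undef) element must share one value; an all-undef
    // vector is not treated as a splat.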
5333 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5334 if (UndefElts[i])
5335 continue;
5336 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5337 SplatIndex = -1;
5338 break;
5339 }
5340 SplatIndex = i;
5341 }
5342 if (0 <= SplatIndex) {
5343 SplatVal = EltBits[SplatIndex];
5344 return true;
5345 }
5346 }
5347
5348 return false;
5349}
5350
5351int getRoundingModeX86(unsigned RM) {
5352 switch (static_cast<::llvm::RoundingMode>(RM)) {
5353 // clang-format off
5354 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
5355 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
5356 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
5357 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
5358 default:
5359 return X86::rmInvalid; // Invalid rounding mode
5360 }
5361}
5362
5363} // namespace X86
5364} // namespace llvm
5365
5366static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5367                                        unsigned MaskEltSizeInBits,
5368                                        SmallVectorImpl<uint64_t> &RawMask,
5369                                        APInt &UndefElts) {
5370 // Extract the raw target constant bits.
5371 SmallVector<APInt, 64> EltBits;
5372 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5373 EltBits, /* AllowWholeUndefs */ true,
5374 /* AllowPartialUndefs */ false))
5375 return false;
5376
5377 // Insert the extracted elements into the mask.
5378 for (const APInt &Elt : EltBits)
5379 RawMask.push_back(Elt.getZExtValue());
5380
5381 return true;
5382}
5383
5384static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5385 bool AllowUndefs) {
5386 APInt UndefElts;
5387 SmallVector<APInt, 64> EltBits;
5388 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5389 /*AllowWholeUndefs*/ AllowUndefs,
5390 /*AllowPartialUndefs*/ false))
5391 return false;
5392
5393 bool IsPow2OrUndef = true;
5394 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5395 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5396 return IsPow2OrUndef;
5397}
5398
5399// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5400 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5401 // TODO: don't always ignore oneuse constraints.
5402 V = peekThroughBitcasts(V);
5403 EVT VT = V.getValueType();
5404
5405 // Match not(xor X, -1) -> X.
5406 if (V.getOpcode() == ISD::XOR &&
5407 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5408 isAllOnesConstant(V.getOperand(1))))
5409 return V.getOperand(0);
5410
5411 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5412 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5413 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5414 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5415 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5417 V.getOperand(1));
5418 }
5419 }
5420
5421 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5422 if (V.getOpcode() == X86ISD::PCMPGT &&
5423 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5424 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5425 V.getOperand(0).hasOneUse()) {
5426 APInt UndefElts;
5427 SmallVector<APInt> EltBits;
5428 if (getTargetConstantBitsFromNode(V.getOperand(0),
5429 V.getScalarValueSizeInBits(), UndefElts,
5430 EltBits) &&
5431 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5432 // Don't fold min_signed_value -> (min_signed_value - 1)
5433 bool MinSigned = false;
5434 for (APInt &Elt : EltBits) {
5435 MinSigned |= Elt.isMinSignedValue();
5436 Elt -= 1;
5437 }
5438 if (!MinSigned) {
5439 SDLoc DL(V);
5440 MVT VT = V.getSimpleValueType();
5441 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5442 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5443 }
5444 }
5445 }
5446
5447 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5448 SmallVector<SDValue, 4> CatOps;
5449 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5450 for (SDValue &CatOp : CatOps) {
5451 SDValue NotCat = IsNOT(CatOp, DAG);
5452 if (!NotCat)
5453 return SDValue();
5454 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5455 }
5456 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5457 }
5458
5459 // Match not(or(not(X),not(Y))) -> and(X, Y).
5460 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5461 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5462 // TODO: Handle cases with single NOT operand -> ANDNP
5463 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5464 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5465 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5466 DAG.getBitcast(VT, Op1));
5467 }
5468
5469 return SDValue();
5470}
5471
5472/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5473/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5474/// Note: This ignores saturation, so inputs must be checked first.
5475 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5476 bool Unary, unsigned NumStages = 1) {
5477 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5478 unsigned NumElts = VT.getVectorNumElements();
5479 unsigned NumLanes = VT.getSizeInBits() / 128;
5480 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5481 unsigned Offset = Unary ? 0 : NumElts;
5482 unsigned Repetitions = 1u << (NumStages - 1);
5483 unsigned Increment = 1u << NumStages;
5484 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5485
5486 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5487 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5488 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5489 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5490 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5491 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5492 }
5493 }
5494}
5495
5496// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5497static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5498 APInt &DemandedLHS, APInt &DemandedRHS) {
5499 int NumLanes = VT.getSizeInBits() / 128;
5500 int NumElts = DemandedElts.getBitWidth();
5501 int NumInnerElts = NumElts / 2;
5502 int NumEltsPerLane = NumElts / NumLanes;
5503 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5504
5505 DemandedLHS = APInt::getZero(NumInnerElts);
5506 DemandedRHS = APInt::getZero(NumInnerElts);
5507
5508 // Map DemandedElts to the packed operands.
5509 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5510 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5511 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5512 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5513 if (DemandedElts[OuterIdx])
5514 DemandedLHS.setBit(InnerIdx);
5515 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5516 DemandedRHS.setBit(InnerIdx);
5517 }
5518 }
5519}
5520
5521// Split the demanded elts of a HADD/HSUB node between its operands.
5522static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5523 APInt &DemandedLHS, APInt &DemandedRHS) {
5524 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5525 DemandedLHS, DemandedRHS);
5526 DemandedLHS |= DemandedLHS << 1;
5527 DemandedRHS |= DemandedRHS << 1;
5528}
5529
5530/// Calculates the shuffle mask corresponding to the target-specific opcode.
5531/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5532/// operands in \p Ops, and returns true.
5533/// Sets \p IsUnary to true if only one source is used. Note that this will set
5534/// IsUnary for shuffles which use a single input multiple times, and in those
5535/// cases it will adjust the mask to only have indices within that single input.
5536/// It is an error to call this with non-empty Mask/Ops vectors.
5537static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5538 SmallVectorImpl<SDValue> &Ops,
5539 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5540 if (!isTargetShuffle(N.getOpcode()))
5541 return false;
5542
5543 MVT VT = N.getSimpleValueType();
5544 unsigned NumElems = VT.getVectorNumElements();
5545 unsigned MaskEltSize = VT.getScalarSizeInBits();
5546 SmallVector<uint64_t, 32> RawMask;
5547 APInt RawUndefs;
5548 uint64_t ImmN;
5549
5550 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5551 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5552
5553 IsUnary = false;
5554 bool IsFakeUnary = false;
5555 switch (N.getOpcode()) {
5556 case X86ISD::BLENDI:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeBLENDMask(NumElems, ImmN, Mask);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::SHUFP:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5566 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5567 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5568 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5569 break;
5570 case X86ISD::INSERTPS:
5571 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5572 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5573 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5574 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5575 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5576 break;
5577 case X86ISD::EXTRQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5580 isa<ConstantSDNode>(N.getOperand(2))) {
5581 int BitLen = N.getConstantOperandVal(1);
5582 int BitIdx = N.getConstantOperandVal(2);
5583 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5584 IsUnary = true;
5585 }
5586 break;
5587 case X86ISD::INSERTQI:
5588 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5590 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5591 isa<ConstantSDNode>(N.getOperand(3))) {
5592 int BitLen = N.getConstantOperandVal(2);
5593 int BitIdx = N.getConstantOperandVal(3);
5594 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5595 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5596 }
5597 break;
5598 case X86ISD::UNPCKH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5602 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5603 break;
5604 case X86ISD::UNPCKL:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::MOVHLPS:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVHLPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVLHPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVLHPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::VALIGN:
5623 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5624 "Only 32-bit and 64-bit elements are supported!");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodeVALIGNMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::PALIGNR:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5637 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5638 DecodePALIGNRMask(NumElems, ImmN, Mask);
5639 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5640 Ops.push_back(N.getOperand(1));
5641 Ops.push_back(N.getOperand(0));
5642 break;
5643 case X86ISD::VSHLDQ:
5644 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSLLDQMask(NumElems, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::VSRLDQ:
5651 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5652 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5653 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5654 DecodePSRLDQMask(NumElems, ImmN, Mask);
5655 IsUnary = true;
5656 break;
5657 case X86ISD::PSHUFD:
5658 case X86ISD::VPERMILPI:
5659 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5660 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5661 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5662 IsUnary = true;
5663 break;
5664 case X86ISD::PSHUFHW:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFLW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::VZEXT_MOVL:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 DecodeZeroMoveLowMask(NumElems, Mask);
5679 IsUnary = true;
5680 break;
5681 case X86ISD::VBROADCAST:
5682 // We only decode broadcasts of same-sized vectors, peeking through to
5683 // extracted subvectors is likely to cause hasOneUse issues with
5684 // SimplifyDemandedBits etc.
5685 if (N.getOperand(0).getValueType() == VT) {
5686 DecodeVectorBroadcast(NumElems, Mask);
5687 IsUnary = true;
5688 break;
5689 }
5690 return false;
5691 case X86ISD::VPERMILPV: {
5692 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5693 IsUnary = true;
5694 SDValue MaskNode = N.getOperand(1);
5695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5696 RawUndefs)) {
5697 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5698 break;
5699 }
5700 return false;
5701 }
5702 case X86ISD::PSHUFB: {
5703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5704 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5705 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = true;
5707 SDValue MaskNode = N.getOperand(1);
5708 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5709 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5710 break;
5711 }
5712 return false;
5713 }
5714 case X86ISD::VPERMI:
5715 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERMMask(NumElems, ImmN, Mask);
5718 IsUnary = true;
5719 break;
5720 case X86ISD::MOVSS:
5721 case X86ISD::MOVSD:
5722 case X86ISD::MOVSH:
5723 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5724 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5725 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5726 break;
5727 case X86ISD::VPERM2X128:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5730 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5732 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5733 break;
5734 case X86ISD::SHUF128:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5737 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5738 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5739 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5740 break;
5741 case X86ISD::MOVSLDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVSLDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::MOVSHDUP:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 DecodeMOVSHDUPMask(NumElems, Mask);
5749 IsUnary = true;
5750 break;
5751 case X86ISD::MOVDDUP:
5752 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5753 DecodeMOVDDUPMask(NumElems, Mask);
5754 IsUnary = true;
5755 break;
5756 case X86ISD::VPERMIL2: {
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5759 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5760 SDValue MaskNode = N.getOperand(2);
5761 SDValue CtrlNode = N.getOperand(3);
5762 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5763 unsigned CtrlImm = CtrlOp->getZExtValue();
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5765 RawUndefs)) {
5766 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5767 Mask);
5768 break;
5769 }
5770 }
5771 return false;
5772 }
5773 case X86ISD::VPPERM: {
5774 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5777 SDValue MaskNode = N.getOperand(2);
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5779 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5780 break;
5781 }
5782 return false;
5783 }
5784 case X86ISD::VPERMV: {
5785 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5786 IsUnary = true;
5787 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5788 Ops.push_back(N.getOperand(1));
5789 SDValue MaskNode = N.getOperand(0);
5790 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5791 RawUndefs)) {
5792 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5793 break;
5794 }
5795 return false;
5796 }
5797 case X86ISD::VPERMV3: {
5798 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5800 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5801 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5802 Ops.push_back(N.getOperand(0));
5803 Ops.push_back(N.getOperand(2));
5804 SDValue MaskNode = N.getOperand(1);
5805 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5806 RawUndefs)) {
5807 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5808 break;
5809 }
5810 return false;
5811 }
5812 default:
5813 llvm_unreachable("unknown target shuffle node");
5814 }
5815
5816 // Empty mask indicates the decode failed.
5817 if (Mask.empty())
5818 return false;
5819
5820 // Check if we're getting a shuffle mask with zero'd elements.
5821 if (!AllowSentinelZero && isAnyZero(Mask))
5822 return false;
5823
5824 // If we have a fake unary shuffle, the shuffle mask is spread across two
5825 // inputs that are actually the same node. Re-map the mask to always point
5826 // into the first input.
5827 if (IsFakeUnary)
5828 for (int &M : Mask)
5829 if (M >= (int)Mask.size())
5830 M -= Mask.size();
5831
5832 // If we didn't already add operands in the opcode-specific code, default to
5833 // adding 1 or 2 operands starting at 0.
5834 if (Ops.empty()) {
5835 Ops.push_back(N.getOperand(0));
5836 if (!IsUnary || IsFakeUnary)
5837 Ops.push_back(N.getOperand(1));
5838 }
5839
5840 return true;
5841}
5842
5843 // Wrapper for getTargetShuffleMask without the IsUnary output.
5844static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5845 SmallVectorImpl<SDValue> &Ops,
5846 SmallVectorImpl<int> &Mask) {
5847 bool IsUnary;
5848 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5849}
5850
5851/// Compute whether each element of a shuffle is zeroable.
5852///
5853/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5854/// Either it is an undef element in the shuffle mask, the element of the input
5855/// referenced is undef, or the element of the input referenced is known to be
5856/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5857/// as many lanes with this technique as possible to simplify the remaining
5858/// shuffle.
5859 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5860 SDValue V1, SDValue V2,
5861 APInt &KnownUndef, APInt &KnownZero) {
5862 int Size = Mask.size();
5863 KnownUndef = KnownZero = APInt::getZero(Size);
5864
5865 V1 = peekThroughBitcasts(V1);
5866 V2 = peekThroughBitcasts(V2);
5867
5868 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5869 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5870
5871 int VectorSizeInBits = V1.getValueSizeInBits();
5872 int ScalarSizeInBits = VectorSizeInBits / Size;
5873 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5874
5875 for (int i = 0; i < Size; ++i) {
5876 int M = Mask[i];
5877 // Handle the easy cases.
5878 if (M < 0) {
5879 KnownUndef.setBit(i);
5880 continue;
5881 }
5882 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5883 KnownZero.setBit(i);
5884 continue;
5885 }
5886
5887 // Determine shuffle input and normalize the mask.
5888 SDValue V = M < Size ? V1 : V2;
5889 M %= Size;
5890
5891 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5892 if (V.getOpcode() != ISD::BUILD_VECTOR)
5893 continue;
5894
5895 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5896 // the (larger) source element must be UNDEF/ZERO.
5897 if ((Size % V.getNumOperands()) == 0) {
5898 int Scale = Size / V->getNumOperands();
5899 SDValue Op = V.getOperand(M / Scale);
5900 if (Op.isUndef())
5901 KnownUndef.setBit(i);
5902 if (X86::isZeroNode(Op))
5903 KnownZero.setBit(i);
5904 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5905 APInt Val = Cst->getAPIntValue();
5906 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5907 if (Val == 0)
5908 KnownZero.setBit(i);
5909 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5910 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5911 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5912 if (Val == 0)
5913 KnownZero.setBit(i);
5914 }
5915 continue;
5916 }
5917
5918 // If the BUILD_VECTOR has more elements then all the (smaller) source
5919 // elements must be UNDEF or ZERO.
5920 if ((V.getNumOperands() % Size) == 0) {
5921 int Scale = V->getNumOperands() / Size;
5922 bool AllUndef = true;
5923 bool AllZero = true;
5924 for (int j = 0; j < Scale; ++j) {
5925 SDValue Op = V.getOperand((M * Scale) + j);
5926 AllUndef &= Op.isUndef();
5927 AllZero &= X86::isZeroNode(Op);
5928 }
5929 if (AllUndef)
5930 KnownUndef.setBit(i);
5931 if (AllZero)
5932 KnownZero.setBit(i);
5933 continue;
5934 }
5935 }
5936}
5937
5938/// Decode a target shuffle mask and inputs and see if any values are
5939/// known to be undef or zero from their inputs.
5940/// Returns true if the target shuffle mask was decoded.
5941/// FIXME: Merge this with computeZeroableShuffleElements?
5942 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5943 SmallVectorImpl<SDValue> &Ops,
5944 APInt &KnownUndef, APInt &KnownZero) {
5945 bool IsUnary;
5946 if (!isTargetShuffle(N.getOpcode()))
5947 return false;
5948
5949 MVT VT = N.getSimpleValueType();
5950 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5951 return false;
5952
5953 int Size = Mask.size();
5954 SDValue V1 = Ops[0];
5955 SDValue V2 = IsUnary ? V1 : Ops[1];
5956 KnownUndef = KnownZero = APInt::getZero(Size);
5957
5958 V1 = peekThroughBitcasts(V1);
5959 V2 = peekThroughBitcasts(V2);
5960
5961 assert((VT.getSizeInBits() % Size) == 0 &&
5962 "Illegal split of shuffle value type");
5963 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5964
5965 // Extract known constant input data.
5966 APInt UndefSrcElts[2];
5967 SmallVector<APInt, 32> SrcEltBits[2];
5968 bool IsSrcConstant[2] = {
5969 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5970 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false),
5972 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5973 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5974 /*AllowPartialUndefs*/ false)};
5975
5976 for (int i = 0; i < Size; ++i) {
5977 int M = Mask[i];
5978
5979 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5980 if (M < 0) {
5981 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5982 if (SM_SentinelUndef == M)
5983 KnownUndef.setBit(i);
5984 if (SM_SentinelZero == M)
5985 KnownZero.setBit(i);
5986 continue;
5987 }
5988
5989 // Determine shuffle input and normalize the mask.
5990 unsigned SrcIdx = M / Size;
5991 SDValue V = M < Size ? V1 : V2;
5992 M %= Size;
5993
5994 // We are referencing an UNDEF input.
5995 if (V.isUndef()) {
5996 KnownUndef.setBit(i);
5997 continue;
5998 }
5999
6000 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6001 // TODO: We currently only set UNDEF for integer types - floats use the same
6002 // registers as vectors and many of the scalar folded loads rely on the
6003 // SCALAR_TO_VECTOR pattern.
6004 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6005 (Size % V.getValueType().getVectorNumElements()) == 0) {
6006 int Scale = Size / V.getValueType().getVectorNumElements();
6007 int Idx = M / Scale;
6008 if (Idx != 0 && !VT.isFloatingPoint())
6009 KnownUndef.setBit(i);
6010 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6011 KnownZero.setBit(i);
6012 continue;
6013 }
6014
6015 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6016 // base vectors.
6017 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6018 SDValue Vec = V.getOperand(0);
6019 int NumVecElts = Vec.getValueType().getVectorNumElements();
6020 if (Vec.isUndef() && Size == NumVecElts) {
6021 int Idx = V.getConstantOperandVal(2);
6022 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6023 if (M < Idx || (Idx + NumSubElts) <= M)
6024 KnownUndef.setBit(i);
6025 }
6026 continue;
6027 }
6028
6029 // Attempt to extract from the source's constant bits.
6030 if (IsSrcConstant[SrcIdx]) {
6031 if (UndefSrcElts[SrcIdx][M])
6032 KnownUndef.setBit(i);
6033 else if (SrcEltBits[SrcIdx][M] == 0)
6034 KnownZero.setBit(i);
6035 }
6036 }
6037
6038 assert(VT.getVectorNumElements() == (unsigned)Size &&
6039 "Different mask size from vector size!");
6040 return true;
6041}
6042
6043// Replace target shuffle mask elements with known undef/zero sentinels.
6044 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6045 const APInt &KnownUndef,
6046 const APInt &KnownZero,
6047 bool ResolveKnownZeros = true) {
6048 unsigned NumElts = Mask.size();
6049 assert(KnownUndef.getBitWidth() == NumElts &&
6050 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6051
6052 for (unsigned i = 0; i != NumElts; ++i) {
6053 if (KnownUndef[i])
6054 Mask[i] = SM_SentinelUndef;
6055 else if (ResolveKnownZeros && KnownZero[i])
6056 Mask[i] = SM_SentinelZero;
6057 }
6058}
6059
6060// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6061 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6062 APInt &KnownUndef,
6063 APInt &KnownZero) {
6064 unsigned NumElts = Mask.size();
6065 KnownUndef = KnownZero = APInt::getZero(NumElts);
6066
6067 for (unsigned i = 0; i != NumElts; ++i) {
6068 int M = Mask[i];
6069 if (SM_SentinelUndef == M)
6070 KnownUndef.setBit(i);
6071 if (SM_SentinelZero == M)
6072 KnownZero.setBit(i);
6073 }
6074}
6075
6076// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6077 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6078 SDValue Cond, bool IsBLENDV = false) {
6079 EVT CondVT = Cond.getValueType();
6080 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6081 unsigned NumElts = CondVT.getVectorNumElements();
6082
6083 APInt UndefElts;
6084 SmallVector<APInt, 32> EltBits;
6085 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6086 /*AllowWholeUndefs*/ true,
6087 /*AllowPartialUndefs*/ false))
6088 return false;
6089
6090 Mask.resize(NumElts, SM_SentinelUndef);
6091
6092 for (int i = 0; i != (int)NumElts; ++i) {
6093 Mask[i] = i;
6094 // Arbitrarily choose from the 2nd operand if the select condition element
6095 // is undef.
6096 // TODO: Can we do better by matching patterns such as even/odd?
6097 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6098 (IsBLENDV && EltBits[i].isNonNegative()))
6099 Mask[i] += NumElts;
6100 }
6101
6102 return true;
6103}
6104
6105// Forward declaration (for getFauxShuffleMask recursive check).
6106static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6107 SmallVectorImpl<SDValue> &Inputs,
6108 SmallVectorImpl<int> &Mask,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts);
6111
6112// Attempt to decode ops that could be represented as a shuffle mask.
6113// The decoded shuffle mask may contain a different number of elements to the
6114// destination value type.
6115// TODO: Merge into getTargetShuffleInputs()
6116static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6117 SmallVectorImpl<int> &Mask,
6118 SmallVectorImpl<SDValue> &Ops,
6119 const SelectionDAG &DAG, unsigned Depth,
6120 bool ResolveKnownElts) {
6121 Mask.clear();
6122 Ops.clear();
6123
6124 MVT VT = N.getSimpleValueType();
6125 unsigned NumElts = VT.getVectorNumElements();
6126 unsigned NumSizeInBits = VT.getSizeInBits();
6127 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6128 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6129 return false;
6130 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6131 unsigned NumSizeInBytes = NumSizeInBits / 8;
6132 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6133
6134 unsigned Opcode = N.getOpcode();
6135 switch (Opcode) {
6136 case ISD::VECTOR_SHUFFLE: {
6137 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6138 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6139 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6140 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6141 Ops.push_back(N.getOperand(0));
6142 Ops.push_back(N.getOperand(1));
6143 return true;
6144 }
6145 return false;
6146 }
6147 case ISD::AND:
6148 case X86ISD::ANDNP: {
6149 // Attempt to decode as a per-byte mask.
6150 APInt UndefElts;
6151 SmallVector<APInt, 32> EltBits;
6152 SDValue N0 = N.getOperand(0);
6153 SDValue N1 = N.getOperand(1);
6154 bool IsAndN = (X86ISD::ANDNP == Opcode);
6155 uint64_t ZeroMask = IsAndN ? 255 : 0;
6156 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6157 /*AllowWholeUndefs*/ false,
6158 /*AllowPartialUndefs*/ false))
6159 return false;
6160 // We can't assume an undef src element gives an undef dst - the other src
6161 // might be zero.
6162 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6163 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6164 const APInt &ByteBits = EltBits[i];
6165 if (ByteBits != 0 && ByteBits != 255)
6166 return false;
6167 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6168 }
6169 Ops.push_back(IsAndN ? N1 : N0);
6170 return true;
6171 }
6172 case ISD::OR: {
6173 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6174 // is a valid shuffle index.
6175 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6176 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6177 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6178 return false;
6179
6180 SmallVector<int, 64> SrcMask0, SrcMask1;
6181 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6182 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6183 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6184 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6185 Depth + 1, true) ||
6186 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6187 Depth + 1, true))
6188 return false;
6189
6190 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6191 SmallVector<int, 64> Mask0, Mask1;
6192 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6193 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6194 for (int i = 0; i != (int)MaskSize; ++i) {
6195 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6196 // loops converting between OR and BLEND shuffles due to
6197 // canWidenShuffleElements merging away undef elements, meaning we
6198 // fail to recognise the OR as the undef element isn't known zero.
6199 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6200 Mask.push_back(SM_SentinelZero);
6201 else if (Mask1[i] == SM_SentinelZero)
6202 Mask.push_back(i);
6203 else if (Mask0[i] == SM_SentinelZero)
6204 Mask.push_back(i + MaskSize);
6205 else
6206 return false;
6207 }
6208 Ops.push_back(N.getOperand(0));
6209 Ops.push_back(N.getOperand(1));
6210 return true;
6211 }
6212 case ISD::CONCAT_VECTORS: {
6213 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6214 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6215 if (NumBitsPerElt == 64) {
6216 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6217 for (unsigned M = 0; M != NumSubElts; ++M)
6218 Mask.push_back((I * NumElts) + M);
6219 Ops.push_back(N.getOperand(I));
6220 }
6221 return true;
6222 }
6223 return false;
6224 }
6225 case ISD::INSERT_SUBVECTOR: {
6226 SDValue Src = N.getOperand(0);
6227 SDValue Sub = N.getOperand(1);
6228 EVT SubVT = Sub.getValueType();
6229 unsigned NumSubElts = SubVT.getVectorNumElements();
6230 uint64_t InsertIdx = N.getConstantOperandVal(2);
6231 // Subvector isn't demanded - just return the base vector.
6232 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.end(), 0);
6235 Ops.push_back(Src);
6236 return true;
6237 }
6238 // Handle CONCAT(SUB0, SUB1).
6239 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6240 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6241 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6242 Src.getOperand(0).isUndef() &&
6243 Src.getOperand(1).getValueType() == SubVT &&
6244 Src.getConstantOperandVal(2) == 0 &&
6245 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6246 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6247 Mask.resize(NumElts);
6248 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6249 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6250 Ops.push_back(Src.getOperand(1));
6251 Ops.push_back(Sub);
6252 return true;
6253 }
6254 if (!N->isOnlyUserOf(Sub.getNode()))
6255 return false;
6256
6257 SmallVector<int, 64> SubMask;
6258 SmallVector<SDValue, 2> SubInputs;
6259 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6260 EVT SubSrcVT = SubSrc.getValueType();
6261 if (!SubSrcVT.isVector())
6262 return false;
6263
6264 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6265 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6266 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6267 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6268 SDValue SubSrcSrc = SubSrc.getOperand(0);
6269 unsigned NumSubSrcSrcElts =
6270 SubSrcSrc.getValueType().getVectorNumElements();
6271 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6272 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6273 "Subvector valuetype mismatch");
6274 InsertIdx *= (MaxElts / NumElts);
6275 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6276 NumSubElts *= (MaxElts / NumElts);
6277 bool SrcIsUndef = Src.isUndef();
6278 for (int i = 0; i != (int)MaxElts; ++i)
6279 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6280 for (int i = 0; i != (int)NumSubElts; ++i)
6281 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6282 if (!SrcIsUndef)
6283 Ops.push_back(Src);
6284 Ops.push_back(SubSrcSrc);
6285 return true;
6286 }
6287
6288 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6289 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6290 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6291 Depth + 1, ResolveKnownElts))
6292 return false;
6293
6294 // Subvector shuffle inputs must not be larger than the subvector.
6295 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6296 return SubVT.getFixedSizeInBits() <
6297 SubInput.getValueSizeInBits().getFixedValue();
6298 }))
6299 return false;
6300
6301 if (SubMask.size() != NumSubElts) {
6302 assert(((SubMask.size() % NumSubElts) == 0 ||
6303 (NumSubElts % SubMask.size()) == 0) &&
6304 "Illegal submask scale");
6305 if ((NumSubElts % SubMask.size()) == 0) {
6306 int Scale = NumSubElts / SubMask.size();
6307 SmallVector<int, 64> ScaledSubMask;
6308 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6309 SubMask = ScaledSubMask;
6310 } else {
6311 int Scale = SubMask.size() / NumSubElts;
6312 NumSubElts = SubMask.size();
6313 NumElts *= Scale;
6314 InsertIdx *= Scale;
6315 }
6316 }
6317 Ops.push_back(Src);
6318 Ops.append(SubInputs.begin(), SubInputs.end());
6319 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6320 Mask.append(NumElts, SM_SentinelZero);
6321 else
6322 for (int i = 0; i != (int)NumElts; ++i)
6323 Mask.push_back(i);
6324 for (int i = 0; i != (int)NumSubElts; ++i) {
6325 int M = SubMask[i];
6326 if (0 <= M) {
6327 int InputIdx = M / NumSubElts;
6328 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6329 }
6330 Mask[i + InsertIdx] = M;
6331 }
6332 return true;
6333 }
6334 case X86ISD::PINSRB:
6335 case X86ISD::PINSRW:
6336 case ISD::SCALAR_TO_VECTOR:
6337 case ISD::INSERT_VECTOR_ELT: {
6338 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6339 // vector, for matching src/dst vector types.
6340 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6341
6342 unsigned DstIdx = 0;
6343 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6344 // Check we have an in-range constant insertion index.
6345 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6346 N.getConstantOperandAPInt(2).uge(NumElts))
6347 return false;
6348 DstIdx = N.getConstantOperandVal(2);
6349
6350 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6351 if (X86::isZeroNode(Scl)) {
6352 Ops.push_back(N.getOperand(0));
6353 for (unsigned i = 0; i != NumElts; ++i)
6354 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6355 return true;
6356 }
6357 }
6358
6359 // Peek through trunc/aext/zext/bitcast.
6360 // TODO: aext shouldn't require SM_SentinelZero padding.
6361 // TODO: handle shift of scalars.
6362 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6363 while (Scl.getOpcode() == ISD::TRUNCATE ||
6364 Scl.getOpcode() == ISD::ANY_EXTEND ||
6365 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6366 (Scl.getOpcode() == ISD::BITCAST &&
6367 Scl.getScalarValueSizeInBits() ==
6368 Scl.getOperand(0).getScalarValueSizeInBits())) {
6369 Scl = Scl.getOperand(0);
6370 MinBitsPerElt =
6371 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6372 }
6373 if ((MinBitsPerElt % 8) != 0)
6374 return false;
6375
6376 // Attempt to find the source vector the scalar was extracted from.
6377 SDValue SrcExtract;
6378 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6379 Scl.getOpcode() == X86ISD::PEXTRW ||
6380 Scl.getOpcode() == X86ISD::PEXTRB) &&
6381 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6382 SrcExtract = Scl;
6383 }
6384 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6385 return false;
6386
6387 SDValue SrcVec = SrcExtract.getOperand(0);
6388 EVT SrcVT = SrcVec.getValueType();
6389 if (!SrcVT.getScalarType().isByteSized())
6390 return false;
6391 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6392 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6393 unsigned DstByte = DstIdx * NumBytesPerElt;
6394 MinBitsPerElt =
6395 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6396
6397 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6398 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6399 Ops.push_back(SrcVec);
6400 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6401 } else {
6402 Ops.push_back(SrcVec);
6403 Ops.push_back(N.getOperand(0));
6404 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6405 Mask.push_back(NumSizeInBytes + i);
6406 }
6407
6408 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6409 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6410 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6411 Mask[DstByte + i] = SrcByte + i;
6412 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6413 Mask[DstByte + i] = SM_SentinelZero;
6414 return true;
6415 }
6416 case X86ISD::PACKSS:
6417 case X86ISD::PACKUS: {
6418 SDValue N0 = N.getOperand(0);
6419 SDValue N1 = N.getOperand(1);
6420 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6422 "Unexpected input value type");
6423
6424 APInt EltsLHS, EltsRHS;
6425 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6426
6427 // If we know input saturation won't happen (or we don't care for particular
6428 // lanes), we can treat this as a truncation shuffle.
6429 bool Offset0 = false, Offset1 = false;
6430 if (Opcode == X86ISD::PACKSS) {
6431 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6432 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6433 (!(N1.isUndef() || EltsRHS.isZero()) &&
6434 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6435 return false;
6436 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6437 // PACKSS then it was likely being used for sign-extension for a
6438 // truncation, so just peek through and adjust the mask accordingly.
6439 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6440 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6441 Offset0 = true;
6442 N0 = N0.getOperand(0);
6443 }
6444 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6445 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6446 Offset1 = true;
6447 N1 = N1.getOperand(0);
6448 }
6449 } else {
6450 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6451 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6452 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6453 (!(N1.isUndef() || EltsRHS.isZero()) &&
6454 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6455 return false;
6456 }
6457
6458 bool IsUnary = (N0 == N1);
6459
6460 Ops.push_back(N0);
6461 if (!IsUnary)
6462 Ops.push_back(N1);
6463
6464 createPackShuffleMask(VT, Mask, IsUnary);
6465
6466 if (Offset0 || Offset1) {
6467 for (int &M : Mask)
6468 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6469 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6470 ++M;
6471 }
6472 return true;
6473 }
6474 case ISD::VSELECT:
6475 case X86ISD::BLENDV: {
6476 SDValue Cond = N.getOperand(0);
6477 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6478 Ops.push_back(N.getOperand(1));
6479 Ops.push_back(N.getOperand(2));
6480 return true;
6481 }
6482 return false;
6483 }
6484 case X86ISD::VTRUNC: {
6485 SDValue Src = N.getOperand(0);
6486 EVT SrcVT = Src.getValueType();
6487 if (SrcVT.getSizeInBits() != NumSizeInBits)
6488 return false;
6489 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6490 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6491 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6492 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6493 for (unsigned i = 0; i != NumSrcElts; ++i)
6494 Mask.push_back(i * Scale);
6495 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6496 Ops.push_back(Src);
6497 return true;
6498 }
6499 case ISD::SHL:
6500 case ISD::SRL: {
6501 APInt UndefElts;
6502 SmallVector<APInt, 32> EltBits;
6503 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6504 UndefElts, EltBits,
6505 /*AllowWholeUndefs*/ true,
6506 /*AllowPartialUndefs*/ false))
6507 return false;
6508
6509 // We can only decode 'whole byte' bit shifts as shuffles.
6510 for (unsigned I = 0; I != NumElts; ++I)
6511 if (DemandedElts[I] && !UndefElts[I] &&
6512 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6513 return false;
6514
6515 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6516 Ops.push_back(N.getOperand(0));
6517
6518 for (unsigned I = 0; I != NumElts; ++I) {
6519 if (!DemandedElts[I] || UndefElts[I])
6520 continue;
6521 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6522 unsigned Lo = I * NumBytesPerElt;
6523 unsigned Hi = Lo + NumBytesPerElt;
6524 // Clear mask to all zeros and insert the shifted byte indices.
6525 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6526 if (ISD::SHL == Opcode)
6527 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6528 else
6529 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6530 Lo + ByteShift);
6531 }
6532 return true;
6533 }
6534 case X86ISD::VSHLI:
6535 case X86ISD::VSRLI: {
6536 uint64_t ShiftVal = N.getConstantOperandVal(1);
6537 // Out of range bit shifts are guaranteed to be zero.
6538 if (NumBitsPerElt <= ShiftVal) {
6539 Mask.append(NumElts, SM_SentinelZero);
6540 return true;
6541 }
6542
6543 // We can only decode 'whole byte' bit shifts as shuffles.
6544 if ((ShiftVal % 8) != 0)
6545 break;
6546
6547 uint64_t ByteShift = ShiftVal / 8;
6548 Ops.push_back(N.getOperand(0));
6549
6550 // Clear mask to all zeros and insert the shifted byte indices.
6551 Mask.append(NumSizeInBytes, SM_SentinelZero);
6552
6553 if (X86ISD::VSHLI == Opcode) {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j] = i + j - ByteShift;
6557 } else {
6558 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6559 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6560 Mask[i + j - ByteShift] = i + j;
6561 }
6562 return true;
6563 }
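// For illustration: a v4i32 VSHLI by 16 bits is decoded above as a byte
// shuffle in which each 4-byte element becomes {zero, zero, byte0, byte1},
// i.e. the mask entries for element 0 are {SM_SentinelZero, SM_SentinelZero,
// 0, 1}.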
6564 case X86ISD::VROTLI:
6565 case X86ISD::VROTRI: {
6566 // We can only decode 'whole byte' bit rotates as shuffles.
6567 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6568 if ((RotateVal % 8) != 0)
6569 return false;
6570 Ops.push_back(N.getOperand(0));
6571 int Offset = RotateVal / 8;
6572 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6573 for (int i = 0; i != (int)NumElts; ++i) {
6574 int BaseIdx = i * NumBytesPerElt;
6575 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6576 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6577 }
6578 }
6579 return true;
6580 }
6581 case X86ISD::VBROADCAST: {
6582 SDValue Src = N.getOperand(0);
6583 if (!Src.getSimpleValueType().isVector()) {
6584 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6585 !isNullConstant(Src.getOperand(1)) ||
6586 Src.getOperand(0).getValueType().getScalarType() !=
6587 VT.getScalarType())
6588 return false;
6589 Src = Src.getOperand(0);
6590 }
6591 Ops.push_back(Src);
6592 Mask.append(NumElts, 0);
6593 return true;
6594 }
6595 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6596 SDValue Src = N.getOperand(0);
6597 EVT SrcVT = Src.getValueType();
6598 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6599
6600 // Extended source must be a simple vector.
6601 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6602 (NumBitsPerSrcElt % 8) != 0)
6603 return false;
6604
6605 // We can only handle all-signbits extensions.
6606 APInt DemandedSrcElts =
6607 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6608 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6609 return false;
6610
6611 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6612 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6613 for (unsigned I = 0; I != NumElts; ++I)
6614 Mask.append(Scale, I);
6615 Ops.push_back(Src);
6616 return true;
6617 }
6618 case ISD::ZERO_EXTEND:
6619 case ISD::ANY_EXTEND:
6620 case ISD::ZERO_EXTEND_VECTOR_INREG:
6621 case ISD::ANY_EXTEND_VECTOR_INREG: {
6622 SDValue Src = N.getOperand(0);
6623 EVT SrcVT = Src.getValueType();
6624
6625 // Extended source must be a simple vector.
6626 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6627 (SrcVT.getScalarSizeInBits() % 8) != 0)
6628 return false;
6629
6630 bool IsAnyExtend =
6631 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6632 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6633 IsAnyExtend, Mask);
6634 Ops.push_back(Src);
6635 return true;
6636 }
6637 }
6638
6639 return false;
6640}
6641
6642/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6644 SmallVectorImpl<int> &Mask) {
6645 int MaskWidth = Mask.size();
6646 SmallVector<SDValue, 16> UsedInputs;
6647 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6648 int lo = UsedInputs.size() * MaskWidth;
6649 int hi = lo + MaskWidth;
6650
6651 // Strip UNDEF input usage.
6652 if (Inputs[i].isUndef())
6653 for (int &M : Mask)
6654 if ((lo <= M) && (M < hi))
6655 M = SM_SentinelUndef;
6656
6657 // Check for unused inputs.
6658 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6659 for (int &M : Mask)
6660 if (lo <= M)
6661 M -= MaskWidth;
6662 continue;
6663 }
6664
6665 // Check for repeated inputs.
6666 bool IsRepeat = false;
6667 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6668 if (UsedInputs[j] != Inputs[i])
6669 continue;
6670 for (int &M : Mask)
6671 if (lo <= M)
6672 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6673 IsRepeat = true;
6674 break;
6675 }
6676 if (IsRepeat)
6677 continue;
6678
6679 UsedInputs.push_back(Inputs[i]);
6680 }
6681 Inputs = UsedInputs;
6682}
6683
6684/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6685/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6686/// Returns true if the target shuffle mask was decoded.
6687static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6688 SmallVectorImpl<SDValue> &Inputs,
6689 SmallVectorImpl<int> &Mask,
6690 APInt &KnownUndef, APInt &KnownZero,
6691 const SelectionDAG &DAG, unsigned Depth,
6692 bool ResolveKnownElts) {
6693 if (Depth >= SelectionDAG::MaxRecursionDepth)
6694 return false; // Limit search depth.
6695
6696 EVT VT = Op.getValueType();
6697 if (!VT.isSimple() || !VT.isVector())
6698 return false;
6699
6700 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6701 if (ResolveKnownElts)
6702 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6703 return true;
6704 }
6705 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6706 ResolveKnownElts)) {
6707 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6708 return true;
6709 }
6710 return false;
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6714 SmallVectorImpl<SDValue> &Inputs,
6715 SmallVectorImpl<int> &Mask,
6716 const SelectionDAG &DAG, unsigned Depth,
6717 bool ResolveKnownElts) {
6718 APInt KnownUndef, KnownZero;
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6720 KnownZero, DAG, Depth, ResolveKnownElts);
6721}
6722
6723 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6724 SmallVectorImpl<int> &Mask,
6725 const SelectionDAG &DAG, unsigned Depth = 0,
6726 bool ResolveKnownElts = true) {
6727 EVT VT = Op.getValueType();
6728 if (!VT.isSimple() || !VT.isVector())
6729 return false;
6730
6731 unsigned NumElts = Op.getValueType().getVectorNumElements();
6732 APInt DemandedElts = APInt::getAllOnes(NumElts);
6733 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6734 ResolveKnownElts);
6735}
6736
6737// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6738static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6739 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6740 SelectionDAG &DAG) {
6741 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6742 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6743 "Unknown broadcast load type");
6744
6745 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6746 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6747 return SDValue();
6748
6749 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6750 TypeSize::getFixed(Offset), DL);
6751 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6752 SDValue Ops[] = {Mem->getChain(), Ptr};
6753 SDValue BcstLd = DAG.getMemIntrinsicNode(
6754 Opcode, DL, Tys, Ops, MemVT,
6755 DAG.getMachineFunction().getMachineMemOperand(
6756 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6757 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6758 return BcstLd;
6759}
6760
6761/// Returns the scalar element that will make up the i'th
6762/// element of the result of the vector shuffle.
6763static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6764 SelectionDAG &DAG, unsigned Depth) {
6765 if (Depth >= SelectionDAG::MaxRecursionDepth)
6766 return SDValue(); // Limit search depth.
6767
6768 EVT VT = Op.getValueType();
6769 unsigned Opcode = Op.getOpcode();
6770 unsigned NumElems = VT.getVectorNumElements();
6771
6772 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6773 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6774 int Elt = SV->getMaskElt(Index);
6775
6776 if (Elt < 0)
6777 return DAG.getUNDEF(VT.getVectorElementType());
6778
6779 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6780 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6781 }
6782
6783 // Recurse into target specific vector shuffles to find scalars.
6784 if (isTargetShuffle(Opcode)) {
6785 MVT ShufVT = VT.getSimpleVT();
6786 MVT ShufSVT = ShufVT.getVectorElementType();
6787 int NumElems = (int)ShufVT.getVectorNumElements();
6788 SmallVector<int, 16> ShuffleMask;
6789 SmallVector<SDValue, 16> ShuffleOps;
6790 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6791 return SDValue();
6792
6793 int Elt = ShuffleMask[Index];
6794 if (Elt == SM_SentinelZero)
6795 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6796 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6797 if (Elt == SM_SentinelUndef)
6798 return DAG.getUNDEF(ShufSVT);
6799
6800 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6801 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6802 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6803 }
6804
6805 // Recurse into insert_subvector base/sub vector to find scalars.
6806 if (Opcode == ISD::INSERT_SUBVECTOR) {
6807 SDValue Vec = Op.getOperand(0);
6808 SDValue Sub = Op.getOperand(1);
6809 uint64_t SubIdx = Op.getConstantOperandVal(2);
6810 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6811
6812 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6813 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6814 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6815 }
6816
6817 // Recurse into concat_vectors sub vector to find scalars.
6818 if (Opcode == ISD::CONCAT_VECTORS) {
6819 EVT SubVT = Op.getOperand(0).getValueType();
6820 unsigned NumSubElts = SubVT.getVectorNumElements();
6821 uint64_t SubIdx = Index / NumSubElts;
6822 uint64_t SubElt = Index % NumSubElts;
6823 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6824 }
6825
6826 // Recurse into extract_subvector src vector to find scalars.
6827 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6828 SDValue Src = Op.getOperand(0);
6829 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6830 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6831 }
6832
6833 // We only peek through bitcasts of the same vector width.
6834 if (Opcode == ISD::BITCAST) {
6835 SDValue Src = Op.getOperand(0);
6836 EVT SrcVT = Src.getValueType();
6837 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6838 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6839 return SDValue();
6840 }
6841
6842 // Actual nodes that may contain scalar elements
6843
6844 // For insert_vector_elt - either return the index matching scalar or recurse
6845 // into the base vector.
6846 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6847 isa<ConstantSDNode>(Op.getOperand(2))) {
6848 if (Op.getConstantOperandAPInt(2) == Index)
6849 return Op.getOperand(1);
6850 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6851 }
6852
6853 if (Opcode == ISD::SCALAR_TO_VECTOR)
6854 return (Index == 0) ? Op.getOperand(0)
6855 : DAG.getUNDEF(VT.getVectorElementType());
6856
6857 if (Opcode == ISD::BUILD_VECTOR)
6858 return Op.getOperand(Index);
6859
6860 return SDValue();
6861}
6862
6863// Use PINSRB/PINSRW/PINSRD to create a build vector.
6864 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6865 const APInt &NonZeroMask,
6866 unsigned NumNonZero, unsigned NumZero,
6867 SelectionDAG &DAG,
6868 const X86Subtarget &Subtarget) {
6869 MVT VT = Op.getSimpleValueType();
6870 unsigned NumElts = VT.getVectorNumElements();
6871 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6872 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6873 "Illegal vector insertion");
6874
6875 SDValue V;
6876 bool First = true;
6877
6878 for (unsigned i = 0; i < NumElts; ++i) {
6879 bool IsNonZero = NonZeroMask[i];
6880 if (!IsNonZero)
6881 continue;
6882
6883 // If the build vector contains zeros or our first insertion is not the
6884 // first index, then insert into a zero vector to break any register
6885 // dependency; else use SCALAR_TO_VECTOR.
6886 if (First) {
6887 First = false;
6888 if (NumZero || 0 != i)
6889 V = getZeroVector(VT, Subtarget, DAG, DL);
6890 else {
6891 assert(0 == i && "Expected insertion into zero-index");
6892 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6893 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6894 V = DAG.getBitcast(VT, V);
6895 continue;
6896 }
6897 }
6898 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6899 DAG.getVectorIdxConstant(i, DL));
6900 }
6901
6902 return V;
6903}
6904
6905/// Custom lower build_vector of v16i8.
6906 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6907 const APInt &NonZeroMask,
6908 unsigned NumNonZero, unsigned NumZero,
6909 SelectionDAG &DAG,
6910 const X86Subtarget &Subtarget) {
6911 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 return SDValue();
6913
6914 // SSE4.1 - use PINSRB to insert each byte directly.
6915 if (Subtarget.hasSSE41())
6916 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6917 DAG, Subtarget);
6918
6919 SDValue V;
6920
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6923 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6924 !NonZeroMask.extractBits(2, 2).isZero()) {
6925 for (unsigned I = 0; I != 4; ++I) {
6926 if (!NonZeroMask[I])
6927 continue;
6928 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6929 if (I != 0)
6930 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6931 DAG.getConstant(I * 8, DL, MVT::i8));
6932 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6933 }
6934 assert(V && "Failed to fold v16i8 vector to zero");
6935 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6936 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6937 V = DAG.getBitcast(MVT::v8i16, V);
6938 }
6939 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6940 bool ThisIsNonZero = NonZeroMask[i];
6941 bool NextIsNonZero = NonZeroMask[i + 1];
6942 if (!ThisIsNonZero && !NextIsNonZero)
6943 continue;
6944
6945 SDValue Elt;
6946 if (ThisIsNonZero) {
6947 if (NumZero || NextIsNonZero)
6948 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6949 else
6950 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6951 }
6952
6953 if (NextIsNonZero) {
6954 SDValue NextElt = Op.getOperand(i + 1);
6955 if (i == 0 && NumZero)
6956 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6957 else
6958 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6959 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6960 DAG.getConstant(8, DL, MVT::i8));
6961 if (ThisIsNonZero)
6962 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6963 else
6964 Elt = NextElt;
6965 }
6966
6967 // If our first insertion is not the first index or zeros are needed, then
6968 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6969 // elements undefined).
6970 if (!V) {
6971 if (i != 0 || NumZero)
6972 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6973 else {
6974 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6975 V = DAG.getBitcast(MVT::v8i16, V);
6976 continue;
6977 }
6978 }
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6981 DAG.getVectorIdxConstant(i / 2, DL));
6982 }
6983
6984 return DAG.getBitcast(MVT::v16i8, V);
6985}
6986
6987/// Custom lower build_vector of v8i16.
6988 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6989 const APInt &NonZeroMask,
6990 unsigned NumNonZero, unsigned NumZero,
6991 SelectionDAG &DAG,
6992 const X86Subtarget &Subtarget) {
6993 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6994 return SDValue();
6995
6996 // Use PINSRW to insert each element directly.
6997 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6998 Subtarget);
6999}
7000
7001/// Custom lower build_vector of v4i32 or v4f32.
7002 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7003 SelectionDAG &DAG,
7004 const X86Subtarget &Subtarget) {
7005 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7006 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7007 // Because we're creating a less complicated build vector here, we may enable
7008 // further folding of the MOVDDUP via shuffle transforms.
7009 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7010 Op.getOperand(0) == Op.getOperand(2) &&
7011 Op.getOperand(1) == Op.getOperand(3) &&
7012 Op.getOperand(0) != Op.getOperand(1)) {
7013 MVT VT = Op.getSimpleValueType();
7014 MVT EltVT = VT.getVectorElementType();
7015 // Create a new build vector with the first 2 elements followed by undef
7016 // padding, bitcast to v2f64, duplicate, and bitcast back.
7017 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7018 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7019 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7020 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7021 return DAG.getBitcast(VT, Dup);
7022 }
7023
7024 // Find all zeroable elements.
7025 std::bitset<4> Zeroable, Undefs;
7026 for (int i = 0; i < 4; ++i) {
7027 SDValue Elt = Op.getOperand(i);
7028 Undefs[i] = Elt.isUndef();
7029 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7030 }
7031 assert(Zeroable.size() - Zeroable.count() > 1 &&
7032 "We expect at least two non-zero elements!");
7033
7034 // We only know how to deal with build_vector nodes where elements are either
7035 // zeroable or extract_vector_elt with constant index.
7036 SDValue FirstNonZero;
7037 unsigned FirstNonZeroIdx;
7038 for (unsigned i = 0; i < 4; ++i) {
7039 if (Zeroable[i])
7040 continue;
7041 SDValue Elt = Op.getOperand(i);
7042 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7043 !isa<ConstantSDNode>(Elt.getOperand(1)))
7044 return SDValue();
7045 // Make sure that this node is extracting from a 128-bit vector.
7046 MVT VT = Elt.getOperand(0).getSimpleValueType();
7047 if (!VT.is128BitVector())
7048 return SDValue();
7049 if (!FirstNonZero.getNode()) {
7050 FirstNonZero = Elt;
7051 FirstNonZeroIdx = i;
7052 }
7053 }
7054
7055 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7056 SDValue V1 = FirstNonZero.getOperand(0);
7057 MVT VT = V1.getSimpleValueType();
7058
7059 // See if this build_vector can be lowered as a blend with zero.
7060 SDValue Elt;
7061 unsigned EltMaskIdx, EltIdx;
7062 int Mask[4];
7063 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7064 if (Zeroable[EltIdx]) {
7065 // The zero vector will be on the right hand side.
7066 Mask[EltIdx] = EltIdx+4;
7067 continue;
7068 }
7069
7070 Elt = Op->getOperand(EltIdx);
7071 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7072 EltMaskIdx = Elt.getConstantOperandVal(1);
7073 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7074 break;
7075 Mask[EltIdx] = EltIdx;
7076 }
7077
7078 if (EltIdx == 4) {
7079 // Let the shuffle legalizer deal with blend operations.
7080 SDValue VZeroOrUndef = (Zeroable == Undefs)
7081 ? DAG.getUNDEF(VT)
7082 : getZeroVector(VT, Subtarget, DAG, DL);
7083 if (V1.getSimpleValueType() != VT)
7084 V1 = DAG.getBitcast(VT, V1);
7085 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7086 }
7087
7088 // See if we can lower this build_vector to a INSERTPS.
7089 if (!Subtarget.hasSSE41())
7090 return SDValue();
7091
7092 SDValue V2 = Elt.getOperand(0);
7093 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7094 V1 = SDValue();
7095
7096 bool CanFold = true;
7097 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7098 if (Zeroable[i])
7099 continue;
7100
7101 SDValue Current = Op->getOperand(i);
7102 SDValue SrcVector = Current->getOperand(0);
7103 if (!V1.getNode())
7104 V1 = SrcVector;
7105 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7106 }
7107
7108 if (!CanFold)
7109 return SDValue();
7110
7111 assert(V1.getNode() && "Expected at least two non-zero elements!");
7112 if (V1.getSimpleValueType() != MVT::v4f32)
7113 V1 = DAG.getBitcast(MVT::v4f32, V1);
7114 if (V2.getSimpleValueType() != MVT::v4f32)
7115 V2 = DAG.getBitcast(MVT::v4f32, V2);
7116
7117 // Ok, we can emit an INSERTPS instruction.
7118 unsigned ZMask = Zeroable.to_ulong();
7119
7120 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7121 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7122 SDValue Result =
7123 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7124 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7125 return DAG.getBitcast(VT, Result);
7126}
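// Worked example of the immediate encoding above (values chosen for
// illustration): with EltMaskIdx = 2 (source lane of V2), EltIdx = 1
// (destination lane) and Zeroable = {3}, ZMask = 0b1000 and
//   InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98,
// i.e. bits [7:6] select the source element, bits [5:4] the destination slot,
// and bits [3:0] the lanes to zero, matching the INSERTPS imm8 layout.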
7127
7128/// Return a vector logical shift node.
7129static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7130 SelectionDAG &DAG, const TargetLowering &TLI,
7131 const SDLoc &dl) {
7132 assert(VT.is128BitVector() && "Unknown type for VShift");
7133 MVT ShVT = MVT::v16i8;
7134 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7135 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7136 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7137 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7138 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7139}
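// For example (illustrative only): getVShift(/*isLeft=*/true, MVT::v2i64, X,
// /*NumBits=*/64, ...) returns
//   bitcast v2i64 (X86ISD::VSHLDQ (bitcast v16i8 X), TargetConstant:i8<8>)
// i.e. a whole-register byte shift by NumBits / 8 = 8 bytes (PSLLDQ $8).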
7140
7141 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7142 SelectionDAG &DAG) {
7143
7144 // Check if the scalar load can be widened into a vector load. And if
7145 // the address is "base + cst" see if the cst can be "absorbed" into
7146 // the shuffle mask.
7147 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7148 SDValue Ptr = LD->getBasePtr();
7149 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7150 return SDValue();
7151 EVT PVT = LD->getValueType(0);
7152 if (PVT != MVT::i32 && PVT != MVT::f32)
7153 return SDValue();
7154
7155 int FI = -1;
7156 int64_t Offset = 0;
7157 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7158 FI = FINode->getIndex();
7159 Offset = 0;
7160 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7161 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7162 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7163 Offset = Ptr.getConstantOperandVal(1);
7164 Ptr = Ptr.getOperand(0);
7165 } else {
7166 return SDValue();
7167 }
7168
7169 // FIXME: 256-bit vector instructions don't require a strict alignment,
7170 // improve this code to support it better.
7171 Align RequiredAlign(VT.getSizeInBits() / 8);
7172 SDValue Chain = LD->getChain();
7173 // Make sure the stack object alignment is at least 16 or 32.
7174 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7175 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7176 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7177 if (MFI.isFixedObjectIndex(FI)) {
7178 // Can't change the alignment. FIXME: It's possible to compute
7179 // the exact stack offset and reference FI + adjust offset instead.
7180 // If someone *really* cares about this. That's the way to implement it.
7181 return SDValue();
7182 } else {
7183 MFI.setObjectAlignment(FI, RequiredAlign);
7184 }
7185 }
7186
7187 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7188 // Ptr + (Offset & ~15).
7189 if (Offset < 0)
7190 return SDValue();
7191 if ((Offset % RequiredAlign.value()) & 3)
7192 return SDValue();
7193 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7194 if (StartOffset) {
7195 SDLoc DL(Ptr);
7196 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7197 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7198 }
7199
7200 int EltNo = (Offset - StartOffset) >> 2;
7201 unsigned NumElems = VT.getVectorNumElements();
7202
7203 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7204 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7205 LD->getPointerInfo().getWithOffset(StartOffset));
7206
7207 SmallVector<int, 8> Mask(NumElems, EltNo);
7208
7209 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7210 }
7211
7212 return SDValue();
7213}
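// Worked example (hypothetical frame index): for a simple i32 load from
// FI#0 + 8 widened to a 16-byte-aligned v4i32, StartOffset = 8 & ~15 = 0 and
// EltNo = (8 - 0) >> 2 = 2, so the scalar load becomes a v4i32 load of FI#0
// followed by a splat shuffle of lane 2.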
7214
7215 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7216static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7217 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7218 auto *BaseLd = cast<LoadSDNode>(Elt);
7219 if (!BaseLd->isSimple())
7220 return false;
7221 Ld = BaseLd;
7222 ByteOffset = 0;
7223 return true;
7224 }
7225
7226 switch (Elt.getOpcode()) {
7227 case ISD::BITCAST:
7228 case ISD::TRUNCATE:
7229 case ISD::SCALAR_TO_VECTOR:
7230 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7231 case ISD::SRL:
7232 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7233 uint64_t Amt = AmtC->getZExtValue();
7234 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7235 ByteOffset += Amt / 8;
7236 return true;
7237 }
7238 }
7239 break;
7240 case ISD::EXTRACT_VECTOR_ELT:
7241 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7242 SDValue Src = Elt.getOperand(0);
7243 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7244 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7245 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7246 findEltLoadSrc(Src, Ld, ByteOffset)) {
7247 uint64_t Idx = IdxC->getZExtValue();
7248 ByteOffset += Idx * (SrcSizeInBits / 8);
7249 return true;
7250 }
7251 }
7252 break;
7253 }
7254
7255 return false;
7256}
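// For example (illustrative only): for Elt = (i32 (trunc (srl (i64 (load p)),
// 32))) the recursion peeks through TRUNCATE, accumulates 32 / 8 = 4 bytes at
// the SRL, and reports Ld = (load p) with ByteOffset = 4, i.e. the upper half
// of the 64-bit load.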
7257
7258/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7259/// elements can be replaced by a single large load which has the same value as
7260/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7261///
7262/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7263 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7264 const SDLoc &DL, SelectionDAG &DAG,
7265 const X86Subtarget &Subtarget,
7266 bool IsAfterLegalize) {
7267 if ((VT.getScalarSizeInBits() % 8) != 0)
7268 return SDValue();
7269
7270 unsigned NumElems = Elts.size();
7271
7272 int LastLoadedElt = -1;
7273 APInt LoadMask = APInt::getZero(NumElems);
7274 APInt ZeroMask = APInt::getZero(NumElems);
7275 APInt UndefMask = APInt::getZero(NumElems);
7276
7277 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7278 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7279
7280 // For each element in the initializer, see if we've found a load, zero or an
7281 // undef.
7282 for (unsigned i = 0; i < NumElems; ++i) {
7283 SDValue Elt = peekThroughBitcasts(Elts[i]);
7284 if (!Elt.getNode())
7285 return SDValue();
7286 if (Elt.isUndef()) {
7287 UndefMask.setBit(i);
7288 continue;
7289 }
7290 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7291 ZeroMask.setBit(i);
7292 continue;
7293 }
7294
7295 // Each loaded element must be the correct fractional portion of the
7296 // requested vector load.
7297 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7298 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7299 return SDValue();
7300
7301 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7302 return SDValue();
7303 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7304 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7305 return SDValue();
7306
7307 LoadMask.setBit(i);
7308 LastLoadedElt = i;
7309 }
7310 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7311 NumElems &&
7312 "Incomplete element masks");
7313
7314 // Handle Special Cases - all undef or undef/zero.
7315 if (UndefMask.popcount() == NumElems)
7316 return DAG.getUNDEF(VT);
7317 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7318 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7319 : DAG.getConstantFP(0.0, DL, VT);
7320
7321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7322 int FirstLoadedElt = LoadMask.countr_zero();
7323 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7324 EVT EltBaseVT = EltBase.getValueType();
7325 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7326 "Register/Memory size mismatch");
7327 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7328 assert(LDBase && "Did not find base load for merging consecutive loads");
7329 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7330 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7331 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7332 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7333 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7334
7335 // TODO: Support offsetting the base load.
7336 if (ByteOffsets[FirstLoadedElt] != 0)
7337 return SDValue();
7338
7339 // Check to see if the element's load is consecutive to the base load
7340 // or offset from a previous (already checked) load.
7341 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7342 LoadSDNode *Ld = Loads[EltIdx];
7343 int64_t ByteOffset = ByteOffsets[EltIdx];
7344 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7345 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7346 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7347 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7348 }
7349 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7350 EltIdx - FirstLoadedElt);
7351 };
7352
7353 // Consecutive loads can contain UNDEFs but not ZERO elements.
7354 // Consecutive loads with UNDEF and ZERO elements require an additional
7355 // shuffle stage to clear the ZERO elements.
7356 bool IsConsecutiveLoad = true;
7357 bool IsConsecutiveLoadWithZeros = true;
7358 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7359 if (LoadMask[i]) {
7360 if (!CheckConsecutiveLoad(LDBase, i)) {
7361 IsConsecutiveLoad = false;
7362 IsConsecutiveLoadWithZeros = false;
7363 break;
7364 }
7365 } else if (ZeroMask[i]) {
7366 IsConsecutiveLoad = false;
7367 }
7368 }
7369
7370 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7371 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7372 assert(LDBase->isSimple() &&
7373 "Cannot merge volatile or atomic loads.");
7374 SDValue NewLd =
7375 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7376 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7377 for (auto *LD : Loads)
7378 if (LD)
7379 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7380 return NewLd;
7381 };
7382
7383 // Check if the base load is entirely dereferenceable.
7384 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7385 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7386
7387 // LOAD - all consecutive load/undefs (must start/end with a load or be
7388 // entirely dereferenceable). If we have found an entire vector of loads and
7389 // undefs, then return a large load of the entire vector width starting at the
7390 // base pointer. If the vector contains zeros, then attempt to shuffle those
7391 // elements.
7392 if (FirstLoadedElt == 0 &&
7393 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7394 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7395 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7396 return SDValue();
7397
7398 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7399 // will lower to regular temporal loads and use the cache.
7400 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7401 VT.is256BitVector() && !Subtarget.hasInt256())
7402 return SDValue();
7403
7404 if (NumElems == 1)
7405 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7406
7407 if (!ZeroMask)
7408 return CreateLoad(VT, LDBase);
7409
7410 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7411 // vector and a zero vector to clear out the zero elements.
7412 if (!IsAfterLegalize && VT.isVector()) {
7413 unsigned NumMaskElts = VT.getVectorNumElements();
7414 if ((NumMaskElts % NumElems) == 0) {
7415 unsigned Scale = NumMaskElts / NumElems;
7416 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7417 for (unsigned i = 0; i < NumElems; ++i) {
7418 if (UndefMask[i])
7419 continue;
7420 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7421 for (unsigned j = 0; j != Scale; ++j)
7422 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7423 }
7424 SDValue V = CreateLoad(VT, LDBase);
7425 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7426 : DAG.getConstantFP(0.0, DL, VT);
7427 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7428 }
7429 }
7430 }
7431
7432 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7433 if (VT.is256BitVector() || VT.is512BitVector()) {
7434 unsigned HalfNumElems = NumElems / 2;
7435 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7436 EVT HalfVT =
7437 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7438 SDValue HalfLD =
7439 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7440 DAG, Subtarget, IsAfterLegalize);
7441 if (HalfLD)
7442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7443 HalfLD, DAG.getVectorIdxConstant(0, DL));
7444 }
7445 }
7446
7447 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7448 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7449 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7450 LoadSizeInBits == 64) &&
7451 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7452 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7453 : MVT::getIntegerVT(LoadSizeInBits);
7454 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7455 // Allow v4f32 on SSE1 only targets.
7456 // FIXME: Add more isel patterns so we can just use VT directly.
7457 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7458 VecVT = MVT::v4f32;
7459 if (TLI.isTypeLegal(VecVT)) {
7460 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7461 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7462 SDValue ResNode = DAG.getMemIntrinsicNode(
7463 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7464 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7465 for (auto *LD : Loads)
7466 if (LD)
7467 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7468 return DAG.getBitcast(VT, ResNode);
7469 }
7470 }
7471
7472 // BROADCAST - match the smallest possible repetition pattern, load that
7473 // scalar/subvector element and then broadcast to the entire vector.
7474 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7475 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7476 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7477 unsigned RepeatSize = SubElems * BaseSizeInBits;
7478 unsigned ScalarSize = std::min(RepeatSize, 64u);
7479 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7480 continue;
7481
7482 // Don't attempt a 1:N subvector broadcast - it should be caught by
7483 // combineConcatVectorOps, else will cause infinite loops.
7484 if (RepeatSize > ScalarSize && SubElems == 1)
7485 continue;
7486
7487 bool Match = true;
7488 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7489 for (unsigned i = 0; i != NumElems && Match; ++i) {
7490 if (!LoadMask[i])
7491 continue;
7492 SDValue Elt = peekThroughBitcasts(Elts[i]);
7493 if (RepeatedLoads[i % SubElems].isUndef())
7494 RepeatedLoads[i % SubElems] = Elt;
7495 else
7496 Match &= (RepeatedLoads[i % SubElems] == Elt);
7497 }
7498
7499 // We must have loads at both ends of the repetition.
7500 Match &= !RepeatedLoads.front().isUndef();
7501 Match &= !RepeatedLoads.back().isUndef();
7502 if (!Match)
7503 continue;
7504
7505 EVT RepeatVT =
7506 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7507 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7508 : EVT::getFloatingPointVT(ScalarSize);
7509 if (RepeatSize > ScalarSize)
7510 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7511 RepeatSize / ScalarSize);
7512 EVT BroadcastVT =
7513 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7514 VT.getSizeInBits() / ScalarSize);
7515 if (TLI.isTypeLegal(BroadcastVT)) {
7516 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7517 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7518 SDValue Broadcast = RepeatLoad;
7519 if (RepeatSize > ScalarSize) {
7520 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7521 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7522 } else {
7523 if (!Subtarget.hasAVX2() &&
7524 !X86::mayFoldLoadIntoBroadcastFromMem(
7525 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7526 Subtarget,
7527 /*AssumeSingleUse=*/true))
7528 return SDValue();
7529 Broadcast =
7530 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7531 }
7532 return DAG.getBitcast(VT, Broadcast);
7533 }
7534 }
7535 }
7536 }
7537
7538 return SDValue();
7539}
7540
7541// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7542// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7543// are consecutive, non-overlapping, and in the right order.
7544 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7545 SelectionDAG &DAG,
7546 const X86Subtarget &Subtarget,
7547 bool IsAfterLegalize) {
7548 SmallVector<SDValue, 64> Elts;
7549 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7550 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7551 Elts.push_back(Elt);
7552 continue;
7553 }
7554 return SDValue();
7555 }
7556 assert(Elts.size() == VT.getVectorNumElements());
7557 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7558 IsAfterLegalize);
7559}
7560
7561 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7562 const APInt &Undefs, LLVMContext &C) {
7563 unsigned ScalarSize = VT.getScalarSizeInBits();
7564 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7565
7566 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7567 if (VT.isFloatingPoint()) {
7568 if (ScalarSize == 16)
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7570 if (ScalarSize == 32)
7571 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7572 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7573 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7574 }
7575 return Constant::getIntegerValue(Ty, Val);
7576 };
7577
7578 SmallVector<Constant *, 32> ConstantVec;
7579 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7580 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7581 : getConstantScalar(Bits[I]));
7582
7583 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7584}
7585
7586static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7587 unsigned SplatBitSize, LLVMContext &C) {
7588 unsigned ScalarSize = VT.getScalarSizeInBits();
7589
7590 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7591 if (VT.isFloatingPoint()) {
7592 if (ScalarSize == 16)
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7594 if (ScalarSize == 32)
7595 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7596 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7597 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7598 }
7599 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7600 };
7601
7602 if (ScalarSize == SplatBitSize)
7603 return getConstantScalar(SplatValue);
7604
7605 unsigned NumElm = SplatBitSize / ScalarSize;
7606 SmallVector<Constant *, 32> ConstantVec;
7607 for (unsigned I = 0; I != NumElm; ++I) {
7608 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7609 ConstantVec.push_back(getConstantScalar(Val));
7610 }
7611 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7612}
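// Worked example (values for illustration): VT = v8i32 with a 64-bit splat
// value 0x00000002'00000001 gives NumElm = 2 and, since APInt::extractBits
// starts at the least significant bit, the constant <i32 1, i32 2>; the
// caller then broadcasts that 64-bit pattern across the full vector width.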
7613
7614 static bool isFoldableUseOfShuffle(SDNode *N) {
7615 for (auto *U : N->users()) {
7616 unsigned Opc = U->getOpcode();
7617 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7618 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7619 return false;
7620 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7621 return false;
7622 if (isTargetShuffle(Opc))
7623 return true;
7624 if (Opc == ISD::BITCAST) // Ignore bitcasts
7625 return isFoldableUseOfShuffle(U);
7626 if (N->hasOneUse()) {
7627 // TODO, there may be some general way to know if a SDNode can
7628 // be folded. We now only know whether an MI is foldable.
7629 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7630 return false;
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637// If the node has a single use by a VSELECT then AVX512 targets may be able to
7638// fold as a predicated instruction.
7639static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7640 unsigned SizeInBits = V.getValueSizeInBits();
7641 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7642 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7643 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7644 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7645 return true;
7646 }
7647 }
7648 return false;
7649}
7650
7651/// Attempt to use the vbroadcast instruction to generate a splat value
7652/// from a splat BUILD_VECTOR which uses:
7653/// a. A single scalar load, or a constant.
7654/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7655///
7656/// The VBROADCAST node is returned when a pattern is found,
7657/// or SDValue() otherwise.
7658 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7659 const SDLoc &dl,
7660 const X86Subtarget &Subtarget,
7661 SelectionDAG &DAG) {
7662 // VBROADCAST requires AVX.
7663 // TODO: Splats could be generated for non-AVX CPUs using SSE
7664 // instructions, but there's less potential gain for only 128-bit vectors.
7665 if (!Subtarget.hasAVX())
7666 return SDValue();
7667
7668 MVT VT = BVOp->getSimpleValueType(0);
7669 unsigned NumElts = VT.getVectorNumElements();
7670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7672 "Unsupported vector type for broadcast.");
7673
7674 // See if the build vector is a repeating sequence of scalars (inc. splat).
7675 SDValue Ld;
7676 BitVector UndefElements;
7677 SmallVector<SDValue, 16> Sequence;
7678 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7679 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7680 if (Sequence.size() == 1)
7681 Ld = Sequence[0];
7682 }
7683
7684 // Attempt to use VBROADCASTM
7685 // From this pattern:
7686 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7687 // b. t1 = (build_vector t0 t0)
7688 //
7689 // Create (VBROADCASTM v2i1 X)
7690 if (!Sequence.empty() && Subtarget.hasCDI()) {
7691 // If not a splat, are the upper sequence values zeroable?
7692 unsigned SeqLen = Sequence.size();
7693 bool UpperZeroOrUndef =
7694 SeqLen == 1 ||
7695 llvm::all_of(ArrayRef(Sequence).drop_front(),
7696 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7697 SDValue Op0 = Sequence[0];
7698 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7699 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7700 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7701 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7702 ? Op0.getOperand(0)
7703 : Op0.getOperand(0).getOperand(0);
7704 MVT MaskVT = BOperand.getSimpleValueType();
7705 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7706 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7707 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7708 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7709 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7710 unsigned Scale = 512 / VT.getSizeInBits();
7711 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7712 }
7713 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7714 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7715 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7716 return DAG.getBitcast(VT, Bcst);
7717 }
7718 }
7719 }
7720
7721 unsigned NumUndefElts = UndefElements.count();
7722 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7723 APInt SplatValue, Undef;
7724 unsigned SplatBitSize;
7725 bool HasUndef;
7726 // Check if this is a repeated constant pattern suitable for broadcasting.
7727 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7728 SplatBitSize > VT.getScalarSizeInBits() &&
7729 SplatBitSize < VT.getSizeInBits()) {
7730 // Avoid replacing with broadcast when it's a use of a shuffle
7731 // instruction to preserve the present custom lowering of shuffles.
7732 if (isFoldableUseOfShuffle(BVOp))
7733 return SDValue();
7734 // replace BUILD_VECTOR with broadcast of the repeated constants.
7735 LLVMContext *Ctx = DAG.getContext();
7736 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7737 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7738 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7739 // Load the constant scalar/subvector and broadcast it.
7740 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7741 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7742 SDValue CP = DAG.getConstantPool(C, PVT);
7743 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7744
7745 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7746 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7747 SDValue Ops[] = {DAG.getEntryNode(), CP};
7748 MachinePointerInfo MPI =
7749 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7750 SDValue Brdcst =
7751 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7752 MPI, Alignment, MachineMemOperand::MOLoad);
7753 return DAG.getBitcast(VT, Brdcst);
7754 }
7755 if (SplatBitSize > 64) {
7756 // Load the vector of constants and broadcast it.
7757 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7758 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7759 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7760 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7761 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7762 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7763 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7764 MachinePointerInfo MPI =
7765 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7766 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7767 Ops, VVT, MPI, Alignment,
7768 MachineMemOperand::MOLoad);
7769 }
7770 }
7771
7772 // If we are moving a scalar into a vector (Ld must be set and all elements
7773 // but 1 are undef) and that operation is not obviously supported by
7774 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7775 // That's better than general shuffling and may eliminate a load to GPR and
7776 // move from scalar to vector register.
7777 if (!Ld || NumElts - NumUndefElts != 1)
7778 return SDValue();
7779 unsigned ScalarSize = Ld.getValueSizeInBits();
7780 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7781 return SDValue();
7782 }
7783
7784 bool ConstSplatVal =
7785 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7786 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7787
7788 // TODO: Handle broadcasts of non-constant sequences.
7789
7790 // Make sure that all of the users of a non-constant load are from the
7791 // BUILD_VECTOR node.
7792 // FIXME: Is the use count needed for non-constant, non-load case?
7793 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7794 return SDValue();
7795
7796 unsigned ScalarSize = Ld.getValueSizeInBits();
7797 bool IsGE256 = (VT.getSizeInBits() >= 256);
7798
7799 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7800 // instruction to save 8 or more bytes of constant pool data.
7801 // TODO: If multiple splats are generated to load the same constant,
7802 // it may be detrimental to overall size. There needs to be a way to detect
7803 // that condition to know if this is truly a size win.
7804 bool OptForSize = DAG.shouldOptForSize();
7805
7806 // Handle broadcasting a single constant scalar from the constant pool
7807 // into a vector.
7808 // On Sandybridge (no AVX2), it is still better to load a constant vector
7809 // from the constant pool and not to broadcast it from a scalar.
7810 // But override that restriction when optimizing for size.
7811 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7812 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7813 EVT CVT = Ld.getValueType();
7814 assert(!CVT.isVector() && "Must not broadcast a vector type");
7815
7816 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7817 // For size optimization, also splat v2f64 and v2i64, and for size opt
7818 // with AVX2, also splat i8 and i16.
7819 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7820 if (ScalarSize == 32 ||
7821 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7822 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7823 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7824 const Constant *C = nullptr;
7825 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7826 C = CI->getConstantIntValue();
7827 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7828 C = CF->getConstantFPValue();
7829
7830 assert(C && "Invalid constant type");
7831
7832 SDValue CP =
7833 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7834 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7835
7836 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7837 SDValue Ops[] = {DAG.getEntryNode(), CP};
7838 MachinePointerInfo MPI =
7839 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7840 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7841 MPI, Alignment, MachineMemOperand::MOLoad);
7842 }
7843 }
7844
7845 // Handle AVX2 in-register broadcasts.
7846 if (!IsLoad && Subtarget.hasInt256() &&
7847 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7848 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7849
7850 // The scalar source must be a normal load.
7851 if (!IsLoad)
7852 return SDValue();
7853
7854 // Make sure the non-chain result is only used by this build vector.
7855 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7856 return SDValue();
7857
7858 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7859 (Subtarget.hasVLX() && ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 // The integer check is needed for the 64-bit into 128-bit case so it
7871 // doesn't match double, since there is no vbroadcastsd xmm.
7872 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7873 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7874 auto *LN = cast<LoadSDNode>(Ld);
7875 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7876 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7877 SDValue BCast =
7878 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7879 LN->getMemoryVT(), LN->getMemOperand());
7880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7881 return BCast;
7882 }
7883
7884 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7886
7887 // Unsupported broadcast.
7888 return SDValue();
7889}
7890
7891/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7892/// underlying vector and index.
7893///
7894/// Modifies \p ExtractedFromVec to the real vector and returns the real
7895/// index.
7896static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7897 SDValue ExtIdx) {
7898 int Idx = ExtIdx->getAsZExtVal();
7899 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7900 return Idx;
7901
7902 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7903 // lowered this:
7904 // (extract_vector_elt (v8f32 %1), Constant<6>)
7905 // to:
7906 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7907 // (extract_subvector (v8f32 %0), Constant<4>),
7908 // undef)
7909 // Constant<0>)
7910 // In this case the vector is the extract_subvector expression and the index
7911 // is 2, as specified by the shuffle.
7912 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7913 SDValue ShuffleVec = SVOp->getOperand(0);
7914 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7915 assert(ShuffleVecVT.getVectorElementType() ==
7916 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7917
7918 int ShuffleIdx = SVOp->getMaskElt(Idx);
7919 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7920 ExtractedFromVec = ShuffleVec;
7921 return ShuffleIdx;
7922 }
7923 return Idx;
7924}
7925
7926 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7927 SelectionDAG &DAG) {
7928 MVT VT = Op.getSimpleValueType();
7929
7930 // Skip if insert_vec_elt is not supported.
7931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7933 return SDValue();
7934
7935 unsigned NumElems = Op.getNumOperands();
7936 SDValue VecIn1;
7937 SDValue VecIn2;
7938 SmallVector<unsigned, 4> InsertIndices;
7939 SmallVector<int, 8> Mask(NumElems, -1);
7940
7941 for (unsigned i = 0; i != NumElems; ++i) {
7942 unsigned Opc = Op.getOperand(i).getOpcode();
7943
7944 if (Opc == ISD::UNDEF)
7945 continue;
7946
7947 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7948 // Quit if more than 1 element needs inserting.
7949 if (InsertIndices.size() > 1)
7950 return SDValue();
7951
7952 InsertIndices.push_back(i);
7953 continue;
7954 }
7955
7956 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7957 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7958
7959 // Quit if non-constant index.
7960 if (!isa<ConstantSDNode>(ExtIdx))
7961 return SDValue();
7962 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7963
7964 // Quit if extracted from vector of different type.
7965 if (ExtractedFromVec.getValueType() != VT)
7966 return SDValue();
7967
7968 if (!VecIn1.getNode())
7969 VecIn1 = ExtractedFromVec;
7970 else if (VecIn1 != ExtractedFromVec) {
7971 if (!VecIn2.getNode())
7972 VecIn2 = ExtractedFromVec;
7973 else if (VecIn2 != ExtractedFromVec)
7974 // Quit if more than 2 vectors to shuffle
7975 return SDValue();
7976 }
7977
7978 if (ExtractedFromVec == VecIn1)
7979 Mask[i] = Idx;
7980 else if (ExtractedFromVec == VecIn2)
7981 Mask[i] = Idx + NumElems;
7982 }
7983
7984 if (!VecIn1.getNode())
7985 return SDValue();
7986
7987 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7988 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7989
7990 for (unsigned Idx : InsertIndices)
7991 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7992 DAG.getVectorIdxConstant(Idx, DL));
7993
7994 return NV;
7995}
7996
7997// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7998 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7999 const X86Subtarget &Subtarget) {
8000 MVT VT = Op.getSimpleValueType();
8001 MVT IVT =
8002 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8003 SmallVector<SDValue, 32> NewOps;
8004 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8005 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8006 Op.getOperand(I)));
8007 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8008 return DAG.getBitcast(VT, Res);
8009}
8010
8011// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8012 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8013 SelectionDAG &DAG,
8014 const X86Subtarget &Subtarget) {
8015
8016 MVT VT = Op.getSimpleValueType();
8017 assert((VT.getVectorElementType() == MVT::i1) &&
8018 "Unexpected type in LowerBUILD_VECTORvXi1!");
8019 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8020 ISD::isBuildVectorAllOnes(Op.getNode()))
8021 return Op;
8022
8023 uint64_t Immediate = 0;
8024 SmallVector<unsigned, 16> NonConstIdx;
8025 bool IsSplat = true;
8026 bool HasConstElts = false;
8027 int SplatIdx = -1;
8028 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8029 SDValue In = Op.getOperand(idx);
8030 if (In.isUndef())
8031 continue;
8032 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8033 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8034 HasConstElts = true;
8035 } else {
8036 NonConstIdx.push_back(idx);
8037 }
8038 if (SplatIdx < 0)
8039 SplatIdx = idx;
8040 else if (In != Op.getOperand(SplatIdx))
8041 IsSplat = false;
8042 }
8043
8044 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8045 if (IsSplat) {
8046 // The build_vector allows the scalar element to be larger than the vector
8047 // element type. We need to mask it to use as a condition unless we know
8048 // the upper bits are zero.
8049 // FIXME: Use computeKnownBits instead of checking specific opcode?
8050 SDValue Cond = Op.getOperand(SplatIdx);
8051 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8052 if (Cond.getOpcode() != ISD::SETCC)
8053 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8054 DAG.getConstant(1, dl, MVT::i8));
8055
8056 // Perform the select in the scalar domain so we can use cmov.
8057 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8058 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8059 DAG.getAllOnesConstant(dl, MVT::i32),
8060 DAG.getConstant(0, dl, MVT::i32));
8061 Select = DAG.getBitcast(MVT::v32i1, Select);
8062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8063 } else {
8064 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8065 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8066 DAG.getAllOnesConstant(dl, ImmVT),
8067 DAG.getConstant(0, dl, ImmVT));
8068 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8069 Select = DAG.getBitcast(VecVT, Select);
8070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8071 DAG.getVectorIdxConstant(0, dl));
8072 }
8073 }
8074
8075 // insert elements one by one
8076 SDValue DstVec;
8077 if (HasConstElts) {
8078 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8079 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8080 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8081 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8082 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8083 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8084 } else {
8085 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8086 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8087 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8088 DstVec = DAG.getBitcast(VecVT, Imm);
8089 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8090 DAG.getVectorIdxConstant(0, dl));
8091 }
8092 } else
8093 DstVec = DAG.getUNDEF(VT);
8094
8095 for (unsigned InsertIdx : NonConstIdx) {
8096 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8097 Op.getOperand(InsertIdx),
8098 DAG.getVectorIdxConstant(InsertIdx, dl));
8099 }
8100 return DstVec;
8101}
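// For example (constant case, illustrative only): a v8i1 build_vector of
// <1,0,1,1,0,0,0,0> collects Immediate = 0b00001101 = 0x0D, so DstVec is
// simply a bitcast of the i8 constant 0x0D to v8i1 before any non-constant
// elements are inserted.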
8102
8103LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8104 switch (Opcode) {
8105 case X86ISD::PACKSS:
8106 case X86ISD::PACKUS:
8107 case X86ISD::FHADD:
8108 case X86ISD::FHSUB:
8109 case X86ISD::HADD:
8110 case X86ISD::HSUB:
8111 return true;
8112 }
8113 return false;
8114}
8115
8116/// This is a helper function of LowerToHorizontalOp().
8117/// This function checks that the build_vector \p N in input implements a
8118/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8119/// may not match the layout of an x86 256-bit horizontal instruction.
8120/// In other words, if this returns true, then some extraction/insertion will
8121/// be required to produce a valid horizontal instruction.
8122///
8123/// Parameter \p Opcode defines the kind of horizontal operation to match.
8124/// For example, if \p Opcode is equal to ISD::ADD, then this function
8125/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8126/// is equal to ISD::SUB, then this function checks if this is a horizontal
8127/// arithmetic sub.
8128///
8129/// This function only analyzes elements of \p N whose indices are
8130/// in range [BaseIdx, LastIdx).
8131///
8132/// TODO: This function was originally used to match both real and fake partial
8133/// horizontal operations, but the index-matching logic is incorrect for that.
8134/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8135/// code because it is only used for partial h-op matching now?
8136static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8137 const SDLoc &DL, SelectionDAG &DAG,
8138 unsigned BaseIdx, unsigned LastIdx,
8139 SDValue &V0, SDValue &V1) {
8140 EVT VT = N->getValueType(0);
8141 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8142 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8143 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8144 "Invalid Vector in input!");
8145
8146 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8147 bool CanFold = true;
8148 unsigned ExpectedVExtractIdx = BaseIdx;
8149 unsigned NumElts = LastIdx - BaseIdx;
8150 V0 = DAG.getUNDEF(VT);
8151 V1 = DAG.getUNDEF(VT);
8152
8153 // Check if N implements a horizontal binop.
8154 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8155 SDValue Op = N->getOperand(i + BaseIdx);
8156
8157 // Skip UNDEFs.
8158 if (Op->isUndef()) {
8159 // Update the expected vector extract index.
8160 if (i * 2 == NumElts)
8161 ExpectedVExtractIdx = BaseIdx;
8162 ExpectedVExtractIdx += 2;
8163 continue;
8164 }
8165
8166 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8167
8168 if (!CanFold)
8169 break;
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173
8174 // Try to match the following pattern:
8175 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8176 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8178 Op0.getOperand(0) == Op1.getOperand(0) &&
8179 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8180 isa<ConstantSDNode>(Op1.getOperand(1)));
8181 if (!CanFold)
8182 break;
8183
8184 unsigned I0 = Op0.getConstantOperandVal(1);
8185 unsigned I1 = Op1.getConstantOperandVal(1);
8186
8187 if (i * 2 < NumElts) {
8188 if (V0.isUndef()) {
8189 V0 = Op0.getOperand(0);
8190 if (V0.getValueType() != VT)
8191 return false;
8192 }
8193 } else {
8194 if (V1.isUndef()) {
8195 V1 = Op0.getOperand(0);
8196 if (V1.getValueType() != VT)
8197 return false;
8198 }
8199 if (i * 2 == NumElts)
8200 ExpectedVExtractIdx = BaseIdx;
8201 }
8202
8203 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8204 if (I0 == ExpectedVExtractIdx)
8205 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8206 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8207 // Try to match the following dag sequence:
8208 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8209 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8210 } else
8211 CanFold = false;
8212
8213 ExpectedVExtractIdx += 2;
8214 }
8215
8216 return CanFold;
8217}
8218
8219/// Emit a sequence of two 128-bit horizontal add/sub followed by
8220/// a concat_vector.
8221///
8222/// This is a helper function of LowerToHorizontalOp().
8223/// This function expects two 256-bit vectors called V0 and V1.
8224/// At first, each vector is split into two separate 128-bit vectors.
8225/// Then, the resulting 128-bit vectors are used to implement two
8226/// horizontal binary operations.
8227///
8228/// The kind of horizontal binary operation is defined by \p X86Opcode.
8229///
8230/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8231/// the two new horizontal binop.
8232/// When Mode is set, the first horizontal binop dag node would take as input
8233/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8234/// horizontal binop dag node would take as input the lower 128-bit of V1
8235/// and the upper 128-bit of V1.
8236/// Example:
8237/// HADD V0_LO, V0_HI
8238/// HADD V1_LO, V1_HI
8239///
8240/// Otherwise, the first horizontal binop dag node takes as input the lower
8241/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8242/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8243/// Example:
8244/// HADD V0_LO, V1_LO
8245/// HADD V0_HI, V1_HI
8246///
8247/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8248/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8249/// the upper 128-bits of the result.
8250static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8251 const SDLoc &DL, SelectionDAG &DAG,
8252 unsigned X86Opcode, bool Mode,
8253 bool isUndefLO, bool isUndefHI) {
8254 MVT VT = V0.getSimpleValueType();
8255 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8256 "Invalid nodes in input!");
8257
8258 unsigned NumElts = VT.getVectorNumElements();
8259 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8260 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8261 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8262 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8263 MVT NewVT = V0_LO.getSimpleValueType();
8264
8265 SDValue LO = DAG.getUNDEF(NewVT);
8266 SDValue HI = DAG.getUNDEF(NewVT);
8267
8268 if (Mode) {
8269 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8270 if (!isUndefLO && !V0->isUndef())
8271 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8272 if (!isUndefHI && !V1->isUndef())
8273 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8274 } else {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8278
8279 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8280 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8281 }
8282
8283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8284}
8285
8286/// Returns true iff \p BV builds a vector with the result equivalent to
8287/// the result of ADDSUB/SUBADD operation.
8288/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8289/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8290/// \p Opnd0 and \p Opnd1.
8291 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8292 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8293 SDValue &Opnd0, SDValue &Opnd1,
8294 unsigned &NumExtracts, bool &IsSubAdd,
8295 bool &HasAllowContract) {
8296 using namespace SDPatternMatch;
8297
8298 MVT VT = BV->getSimpleValueType(0);
8299 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8300 return false;
8301
8302 unsigned NumElts = VT.getVectorNumElements();
8303 SDValue InVec0 = DAG.getUNDEF(VT);
8304 SDValue InVec1 = DAG.getUNDEF(VT);
8305
8306 NumExtracts = 0;
8307 HasAllowContract = NumElts != 0;
8308
8309 // Odd-numbered elements in the input build vector are obtained from
8310 // adding/subtracting two integer/float elements.
8311 // Even-numbered elements in the input build vector are obtained from
8312 // subtracting/adding two integer/float elements.
8313 unsigned Opc[2] = {0, 0};
8314 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8315 SDValue Op = BV->getOperand(i);
8316
8317 // Skip 'undef' values.
8318 unsigned Opcode = Op.getOpcode();
8319 if (Opcode == ISD::UNDEF)
8320 continue;
8321
8322 // Early exit if we found an unexpected opcode.
8323 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8324 return false;
8325
8326 SDValue Op0 = Op.getOperand(0);
8327 SDValue Op1 = Op.getOperand(1);
8328
8329 // Try to match the following pattern:
8330 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8331 // Early exit if we cannot match that sequence.
8332 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8333 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8334 return false;
8335
8336 // We found a valid add/sub node, make sure its the same opcode as previous
8337 // elements for this parity.
8338 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8339 return false;
8340 Opc[i % 2] = Opcode;
8341
8342 // Update InVec0 and InVec1.
8343 if (InVec0.isUndef())
8344 InVec0 = Op0.getOperand(0);
8345 if (InVec1.isUndef())
8346 InVec1 = Op1.getOperand(0);
8347
8348 // Make sure that the operands of each add/sub node always
8349 // come from the same pair of vectors.
8350 if (InVec0 != Op0.getOperand(0)) {
8351 if (Opcode == ISD::FSUB)
8352 return false;
8353
8354 // FADD is commutable. Try to commute the operands
8355 // and then test again.
8356 std::swap(Op0, Op1);
8357 if (InVec0 != Op0.getOperand(0))
8358 return false;
8359 }
8360
8361 if (InVec1 != Op1.getOperand(0))
8362 return false;
8363
8364 // Increment the number of extractions done.
8365 ++NumExtracts;
8366 HasAllowContract &= Op->getFlags().hasAllowContract();
8367 }
8368
8369 // Ensure we have found an opcode for both parities and that they are
8370 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8371 // inputs are undef.
8372 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8373 InVec0.isUndef() || InVec1.isUndef())
8374 return false;
8375
8376 IsSubAdd = Opc[0] == ISD::FADD;
8377
8378 Opnd0 = InVec0;
8379 Opnd1 = InVec1;
8380 return true;
8381}
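// For example (illustrative only): the v4f32 build_vector
//   <(fsub (extract A,0),(extract B,0)), (fadd (extract A,1),(extract B,1)),
//    (fsub (extract A,2),(extract B,2)), (fadd (extract A,3),(extract B,3))>
// matches with Opnd0 = A, Opnd1 = B, NumExtracts = 4 and IsSubAdd = false,
// i.e. the classic ADDSUB pattern (subtract in even lanes, add in odd lanes).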
8382
8383/// Returns true if is possible to fold MUL and an idiom that has already been
8384/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8385/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8386/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8387///
8388/// Prior to calling this function it should be known that there is some
8389/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8390/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8391/// before replacement of such SDNode with ADDSUB operation. Thus the number
8392/// of \p Opnd0 uses is expected to be equal to 2.
8393/// For example, this function may be called for the following IR:
8394/// %AB = fmul fast <2 x double> %A, %B
8395/// %Sub = fsub fast <2 x double> %AB, %C
8396/// %Add = fadd fast <2 x double> %AB, %C
8397/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8398/// <2 x i32> <i32 0, i32 3>
8399/// There is a def for %Addsub here, which potentially can be replaced by
8400/// X86ISD::ADDSUB operation:
8401/// %Addsub = X86ISD::ADDSUB %AB, %C
8402/// and such ADDSUB can further be replaced with FMADDSUB:
8403/// %Addsub = FMADDSUB %A, %B, %C.
8404///
8405/// The main reason why this method is called before the replacement of the
8406/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8407/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8408/// FMADDSUB is.
8409static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8410 SelectionDAG &DAG, SDValue &Opnd0,
8411 SDValue &Opnd1, SDValue &Opnd2,
8412 unsigned ExpectedUses,
8413 bool AllowSubAddOrAddSubContract) {
8414 if (Opnd0.getOpcode() != ISD::FMUL ||
8415 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8416 return false;
8417
8418 // FIXME: These checks must match the similar ones in
8419 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8420 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8421 // or MUL + ADDSUB to FMADDSUB.
8422 const TargetOptions &Options = DAG.getTarget().Options;
8423 bool AllowFusion =
8424 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8425 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8426 if (!AllowFusion)
8427 return false;
8428
8429 Opnd2 = Opnd1;
8430 Opnd1 = Opnd0.getOperand(1);
8431 Opnd0 = Opnd0.getOperand(0);
8432
8433 return true;
8434}
8435
8436 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8437 /// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB or
8438/// X86ISD::FMSUBADD node.
8439 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8440 const SDLoc &DL,
8441 const X86Subtarget &Subtarget,
8442 SelectionDAG &DAG) {
8443 SDValue Opnd0, Opnd1;
8444 unsigned NumExtracts;
8445 bool IsSubAdd;
8446 bool HasAllowContract;
8447 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8448 HasAllowContract))
8449 return SDValue();
8450
8451 MVT VT = BV->getSimpleValueType(0);
8452
8453 // Try to generate X86ISD::FMADDSUB node here.
8454 SDValue Opnd2;
8455 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8456 HasAllowContract)) {
8457 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8458 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8459 }
8460
8461 // We only support ADDSUB.
8462 if (IsSubAdd)
8463 return SDValue();
8464
8465 // There are no known X86 targets with 512-bit ADDSUB instructions!
8466 // Convert to blend(fsub,fadd).
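// e.g. for v8f64 the loop below builds the mask <0, 9, 2, 11, 4, 13, 6, 15>,
// taking the even result elements from the FSUB node and the odd result
// elements from the FADD node.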
8467 if (VT.is512BitVector()) {
8468 SmallVector<int> Mask;
8469 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8470 Mask.push_back(I);
8471 Mask.push_back(I + E + 1);
8472 }
8473 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8474 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8475 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8476 }
8477
8478 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8479}
8480
8481 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8482 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8483 // Initialize outputs to known values.
8484 MVT VT = BV->getSimpleValueType(0);
8485 HOpcode = ISD::DELETED_NODE;
8486 V0 = DAG.getUNDEF(VT);
8487 V1 = DAG.getUNDEF(VT);
8488
8489 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8490 // half of the result is calculated independently from the 128-bit halves of
8491 // the inputs, so that makes the index-checking logic below more complicated.
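// e.g. for v8i32 HADD(A, B) the result is:
//   <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
//    A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7]>
// so the expected extract indices depend on both the 128-bit chunk (i) and
// the 64-bit half (j) of the result being built.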
8492 unsigned NumElts = VT.getVectorNumElements();
8493 unsigned GenericOpcode = ISD::DELETED_NODE;
8494 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8495 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8496 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8497 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8498 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8499 // Ignore undef elements.
8500 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8501 if (Op.isUndef())
8502 continue;
8503
8504 // If there's an opcode mismatch, we're done.
8505 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8506 return false;
8507
8508 // Initialize horizontal opcode.
8509 if (HOpcode == ISD::DELETED_NODE) {
8510 GenericOpcode = Op.getOpcode();
8511 switch (GenericOpcode) {
8512 // clang-format off
8513 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8514 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8515 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8516 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8517 default: return false;
8518 // clang-format on
8519 }
8520 }
8521
8522 SDValue Op0 = Op.getOperand(0);
8523 SDValue Op1 = Op.getOperand(1);
8524 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8526 Op0.getOperand(0) != Op1.getOperand(0) ||
8527 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8528 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8529 return false;
8530
8531 // The source vector is chosen based on which 64-bit half of the
8532 // destination vector is being calculated.
8533 if (j < NumEltsIn64Bits) {
8534 if (V0.isUndef())
8535 V0 = Op0.getOperand(0);
8536 } else {
8537 if (V1.isUndef())
8538 V1 = Op0.getOperand(0);
8539 }
8540
8541 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8542 if (SourceVec != Op0.getOperand(0))
8543 return false;
8544
8545 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8546 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8547 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8548 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8549 (j % NumEltsIn64Bits) * 2;
8550 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8551 continue;
8552
8553 // If this is not a commutative op, this does not match.
8554 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8555 return false;
8556
8557 // Addition is commutative, so try swapping the extract indexes.
8558 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8559 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8560 continue;
8561
8562 // Extract indexes do not match horizontal requirement.
8563 return false;
8564 }
8565 }
8566 // We matched. Opcode and operands are returned by reference as arguments.
8567 return true;
8568}
8569
8570 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8571 const SDLoc &DL, SelectionDAG &DAG,
8572 unsigned HOpcode, SDValue V0, SDValue V1) {
8573 // If either input vector is not the same size as the build vector,
8574 // extract/insert the low bits to the correct size.
8575 // This is free (examples: zmm --> xmm, xmm --> ymm).
8576 MVT VT = BV->getSimpleValueType(0);
8577 unsigned Width = VT.getSizeInBits();
8578 if (V0.getValueSizeInBits() > Width)
8579 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8580 else if (V0.getValueSizeInBits() < Width)
8581 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8582
8583 if (V1.getValueSizeInBits() > Width)
8584 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8585 else if (V1.getValueSizeInBits() < Width)
8586 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8587
8588 unsigned NumElts = VT.getVectorNumElements();
8589 APInt DemandedElts = APInt::getAllOnes(NumElts);
8590 for (unsigned i = 0; i != NumElts; ++i)
8591 if (BV->getOperand(i).isUndef())
8592 DemandedElts.clearBit(i);
8593
8594 // If we don't need the upper xmm, then perform as an xmm hop.
8595 unsigned HalfNumElts = NumElts / 2;
8596 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8597 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8598 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8599 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8600 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8601 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8602 }
8603
8604 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8605}
8606
8607/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8608 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8609 const X86Subtarget &Subtarget,
8610 SelectionDAG &DAG) {
8611 // We need at least 2 non-undef elements to make this worthwhile by default.
8612 unsigned NumNonUndefs =
8613 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8614 if (NumNonUndefs < 2)
8615 return SDValue();
8616
8617 // There are 4 sets of horizontal math operations distinguished by type:
8618 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8619 // subtarget feature. Try to match those "native" patterns first.
8620 MVT VT = BV->getSimpleValueType(0);
8621 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8622 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8623 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8624 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8625 unsigned HOpcode;
8626 SDValue V0, V1;
8627 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8628 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8629 }
8630
8631 // Try harder to match 256-bit ops by using extract/concat.
8632 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8633 return SDValue();
8634
8635 // Count the number of UNDEF operands in the input build_vector.
8636 unsigned NumElts = VT.getVectorNumElements();
8637 unsigned Half = NumElts / 2;
8638 unsigned NumUndefsLO = 0;
8639 unsigned NumUndefsHI = 0;
8640 for (unsigned i = 0, e = Half; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsLO++;
8643
8644 for (unsigned i = Half, e = NumElts; i != e; ++i)
8645 if (BV->getOperand(i)->isUndef())
8646 NumUndefsHI++;
8647
8648 SDValue InVec0, InVec1;
8649 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8650 SDValue InVec2, InVec3;
8651 unsigned X86Opcode;
8652 bool CanFold = true;
8653
8654 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8655 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8656 InVec3) &&
8657 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8658 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8659 X86Opcode = X86ISD::HADD;
8660 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8661 InVec1) &&
8662 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8663 InVec3) &&
8664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8666 X86Opcode = X86ISD::HSUB;
8667 else
8668 CanFold = false;
8669
8670 if (CanFold) {
8671 // Do not try to expand this build_vector into a pair of horizontal
8672 // add/sub if we can emit a pair of scalar add/sub.
8673 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8674 return SDValue();
8675
8676 // Convert this build_vector into a pair of horizontal binops followed by
8677 // a concat vector. We must adjust the outputs from the partial horizontal
8678 // matching calls above to account for undefined vector halves.
8679 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8680 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8681 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8682 bool isUndefLO = NumUndefsLO == Half;
8683 bool isUndefHI = NumUndefsHI == Half;
8684 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8685 isUndefHI);
8686 }
8687 }
8688
8689 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8690 VT == MVT::v16i16) {
8691 unsigned X86Opcode;
8692 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HADD;
8695 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::HSUB;
8698 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::FHSUB;
8704 else
8705 return SDValue();
8706
8707 // Don't try to expand this build_vector into a pair of horizontal add/sub
8708 // if we can simply emit a pair of scalar add/sub.
8709 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8710 return SDValue();
8711
8712 // Convert this build_vector into two horizontal add/sub followed by
8713 // a concat vector.
8714 bool isUndefLO = NumUndefsLO == Half;
8715 bool isUndefHI = NumUndefsHI == Half;
8716 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8717 isUndefLO, isUndefHI);
8718 }
8719
8720 return SDValue();
8721}
8722
8723static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG);
8725
8726/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8727/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8728/// just apply the bit to the vectors.
8729 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8730 /// from this, but enough scalar bit operations are created by the later
8731/// legalization + scalarization stages to need basic support.
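/// For example:
///   (build_vector (shl a, 2), (shl b, 2), (shl c, 2), (shl d, 2))
/// becomes
///   (shl (build_vector a, b, c, d), (build_vector 2, 2, 2, 2))
/// which is then lowered directly as a vector shift.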
8732 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8733 const X86Subtarget &Subtarget,
8734 SelectionDAG &DAG) {
8735 MVT VT = Op->getSimpleValueType(0);
8736 unsigned NumElems = VT.getVectorNumElements();
8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8738
8739 // Check that all elements have the same opcode.
8740 // TODO: Should we allow UNDEFS and if so how many?
8741 unsigned Opcode = Op->getOperand(0).getOpcode();
8742 for (unsigned i = 1; i < NumElems; ++i)
8743 if (Opcode != Op->getOperand(i).getOpcode())
8744 return SDValue();
8745
8746 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8747 bool IsShift = false;
8748 switch (Opcode) {
8749 default:
8750 return SDValue();
8751 case ISD::SHL:
8752 case ISD::SRL:
8753 case ISD::SRA:
8754 IsShift = true;
8755 break;
8756 case ISD::AND:
8757 case ISD::XOR:
8758 case ISD::OR:
8759 // Don't do this if the buildvector is a splat - we'd replace one
8760 // constant with an entire vector.
8761 if (Op->getSplatValue())
8762 return SDValue();
8763 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8764 return SDValue();
8765 break;
8766 }
8767
8768 SmallVector<SDValue, 4> LHSElts, RHSElts;
8769 for (SDValue Elt : Op->ops()) {
8770 SDValue LHS = Elt.getOperand(0);
8771 SDValue RHS = Elt.getOperand(1);
8772
8773 // We expect the canonicalized RHS operand to be the constant.
8774 if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8775 return SDValue();
8776
8777 // Extend shift amounts.
8778 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8779 if (!IsShift)
8780 return SDValue();
8781 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8782 }
8783
8784 LHSElts.push_back(LHS);
8785 RHSElts.push_back(RHS);
8786 }
8787
8788 // Limit to shifts by uniform immediates.
8789 // TODO: Only accept vXi8/vXi64 special cases?
8790 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8791 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8792 return SDValue();
8793
8794 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8795 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8796 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8797
8798 if (!IsShift)
8799 return Res;
8800
8801 // Immediately lower the shift to ensure the constant build vector doesn't
8802 // get converted to a constant pool before the shift is lowered.
8803 return LowerShift(Res, Subtarget, DAG);
8804}
8805
8806static bool isShuffleFoldableLoad(SDValue);
8807
8808/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8809/// representing a blend.
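/// e.g. a v4f64 (build_vector a, b, a, b) becomes
///   (vector_shuffle <0,5,2,7> (splat_build_vector a), (splat_build_vector b))
/// which can then be lowered as a blend of two broadcasts.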
8810 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8811 X86Subtarget const &Subtarget,
8812 SelectionDAG &DAG) {
8813 MVT VT = BVOp->getSimpleValueType(0u);
8814
8815 if (VT != MVT::v4f64)
8816 return SDValue();
8817
8818 // Collect unique operands.
8819 auto UniqueOps = SmallSet<SDValue, 16u>();
8820 for (SDValue Op : BVOp->ops()) {
8821 if (isIntOrFPConstant(Op) || Op.isUndef())
8822 return SDValue();
8823 UniqueOps.insert(Op);
8824 }
8825
8826 // Candidate BUILD_VECTOR must have 2 unique operands.
8827 if (UniqueOps.size() != 2u)
8828 return SDValue();
8829
8830 SDValue Op0 = BVOp->getOperand(0u);
8831 UniqueOps.erase(Op0);
8832 SDValue Op1 = *UniqueOps.begin();
8833
8834 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8835 isShuffleFoldableLoad(Op1)) {
8836 // Create shuffle mask.
8837 auto const NumElems = VT.getVectorNumElements();
8838 SmallVector<int, 16u> Mask(NumElems);
8839 for (auto I = 0u; I < NumElems; ++I) {
8840 SDValue Op = BVOp->getOperand(I);
8841 Mask[I] = Op == Op0 ? I : I + NumElems;
8842 }
8843 // Create shuffle of splats.
8844 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8845 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8846 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8847 }
8848
8849 return SDValue();
8850}
8851
8852/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8853/// functionality to do this, so it's all zeros, all ones, or some derivation
8854/// that is cheap to calculate.
8855 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8856 SelectionDAG &DAG,
8857 const X86Subtarget &Subtarget) {
8858 MVT VT = Op.getSimpleValueType();
8859
8860 // Vectors containing all zeros can be matched by pxor and xorps.
8861 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8862 return Op;
8863
8864 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8865 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8866 // vpcmpeqd on 256-bit vectors.
8867 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8868 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8869 return Op;
8870
8871 return getOnesVector(VT, DAG, DL);
8872 }
8873
8874 return SDValue();
8875}
8876
8877/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8878/// from a vector of source values and a vector of extraction indices.
8879/// The vectors might be manipulated to match the type of the permute op.
8880static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8881 const SDLoc &DL, SelectionDAG &DAG,
8882 const X86Subtarget &Subtarget) {
8883 MVT ShuffleVT = VT;
8884 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8885 unsigned NumElts = VT.getVectorNumElements();
8886 unsigned SizeInBits = VT.getSizeInBits();
8887
8888 // Adjust IndicesVec to match VT size.
8889 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8890 "Illegal variable permute mask size");
8891 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8892 // Narrow/widen the indices vector to the correct size.
8893 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8894 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8895 NumElts * VT.getScalarSizeInBits());
8896 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8897 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8898 SDLoc(IndicesVec), SizeInBits);
8899 // Zero-extend the index elements within the vector.
8900 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8901 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8902 IndicesVT, IndicesVec);
8903 }
8904 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8905
8906 // Handle a SrcVec that doesn't match the VT size.
8907 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8908 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8909 // Handle larger SrcVec by treating it as a larger permute.
8910 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8911 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8912 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8913 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8914 Subtarget, DAG, SDLoc(IndicesVec));
8915 SDValue NewSrcVec =
8916 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8917 if (NewSrcVec)
8918 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8919 return SDValue();
8920 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8921 // Widen smaller SrcVec to match VT.
8922 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8923 } else
8924 return SDValue();
8925 }
8926
8927 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8928 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8929 EVT SrcVT = Idx.getValueType();
8930 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8931 uint64_t IndexScale = 0;
8932 uint64_t IndexOffset = 0;
8933
8934 // If we're scaling a smaller permute op, then we need to repeat the
8935 // indices, scaling and offsetting them as well.
8936 // e.g. v4i32 -> v16i8 (Scale = 4)
8937 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8938 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
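// e.g. with Scale = 4, an i32 index value of 2 becomes
// 2 * 0x04040404 + 0x03020100 = 0x0B0A0908, i.e. the four byte indices
// <8, 9, 10, 11> that select the bytes of source element 2.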
8939 for (uint64_t i = 0; i != Scale; ++i) {
8940 IndexScale |= Scale << (i * NumDstBits);
8941 IndexOffset |= i << (i * NumDstBits);
8942 }
8943
8944 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8945 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8946 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8947 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8948 return Idx;
8949 };
8950
8951 unsigned Opcode = 0;
8952 switch (VT.SimpleTy) {
8953 default:
8954 break;
8955 case MVT::v16i8:
8956 if (Subtarget.hasSSSE3())
8957 Opcode = X86ISD::PSHUFB;
8958 break;
8959 case MVT::v8i16:
8960 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8961 Opcode = X86ISD::VPERMV;
8962 else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v4f32:
8968 case MVT::v4i32:
8969 if (Subtarget.hasAVX()) {
8970 Opcode = X86ISD::VPERMILPV;
8971 ShuffleVT = MVT::v4f32;
8972 } else if (Subtarget.hasSSSE3()) {
8973 Opcode = X86ISD::PSHUFB;
8974 ShuffleVT = MVT::v16i8;
8975 }
8976 break;
8977 case MVT::v2f64:
8978 case MVT::v2i64:
8979 if (Subtarget.hasAVX()) {
8980 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
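// (Adding IndicesVec to itself doubles each index, so an index of 1 becomes
// 2 and sets the bit that VPERMILPD actually reads.)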
8981 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8982 Opcode = X86ISD::VPERMILPV;
8983 ShuffleVT = MVT::v2f64;
8984 } else if (Subtarget.hasSSE41()) {
8985 // SSE41 can compare v2i64 - select between indices 0 and 1.
8986 return DAG.getSelectCC(
8987 DL, IndicesVec,
8988 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8990 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8991 ISD::CondCode::SETEQ);
8992 }
8993 break;
8994 case MVT::v32i8:
8995 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8996 Opcode = X86ISD::VPERMV;
8997 else if (Subtarget.hasXOP()) {
8998 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8999 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9000 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9001 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9002 return DAG.getNode(
9003 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9004 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9005 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9006 } else if (Subtarget.hasAVX()) {
9007 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9008 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9009 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9010 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9011 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9012 ArrayRef<SDValue> Ops) {
9013 // Permute Lo and Hi and then select based on index range.
9014 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9015 // care about bit[7] as it's just an index vector.
9016 SDValue Idx = Ops[2];
9017 EVT VT = Idx.getValueType();
9018 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9020 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9021 ISD::CondCode::SETGT);
9022 };
9023 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9024 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9025 PSHUFBBuilder);
9026 }
9027 break;
9028 case MVT::v16i16:
9029 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9030 Opcode = X86ISD::VPERMV;
9031 else if (Subtarget.hasAVX()) {
9032 // Scale to v32i8 and perform as v32i8.
9033 IndicesVec = ScaleIndices(IndicesVec, 2);
9034 return DAG.getBitcast(
9035 VT, createVariablePermute(
9036 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9037 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9038 }
9039 break;
9040 case MVT::v8f32:
9041 case MVT::v8i32:
9042 if (Subtarget.hasAVX2())
9043 Opcode = X86ISD::VPERMV;
9044 else if (Subtarget.hasAVX()) {
9045 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9046 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9047 {0, 1, 2, 3, 0, 1, 2, 3});
9048 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9049 {4, 5, 6, 7, 4, 5, 6, 7});
9050 if (Subtarget.hasXOP())
9051 return DAG.getBitcast(
9052 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9053 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9054 // Permute Lo and Hi and then select based on index range.
9055 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
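// e.g. for an index of 6, both permutes use bits[0:1] = 2, and the compare
// (6 > 3) selects the HiHi result, i.e. source element 6.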
9056 SDValue Res = DAG.getSelectCC(
9057 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9059 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9060 ISD::CondCode::SETGT);
9061 return DAG.getBitcast(VT, Res);
9062 }
9063 break;
9064 case MVT::v4i64:
9065 case MVT::v4f64:
9066 if (Subtarget.hasAVX512()) {
9067 if (!Subtarget.hasVLX()) {
9068 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9069 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9070 SDLoc(SrcVec));
9071 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9072 DAG, SDLoc(IndicesVec));
9073 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9074 DAG, Subtarget);
9075 return extract256BitVector(Res, 0, DAG, DL);
9076 }
9077 Opcode = X86ISD::VPERMV;
9078 } else if (Subtarget.hasAVX()) {
9079 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9080 SDValue LoLo =
9081 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9082 SDValue HiHi =
9083 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9084 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9085 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9086 if (Subtarget.hasXOP())
9087 return DAG.getBitcast(
9088 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9089 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9090 // Permute Lo and Hi and then select based on index range.
9091 // This works as VPERMILPD only uses index bit[1] to permute elements.
9092 SDValue Res = DAG.getSelectCC(
9093 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9095 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9096 ISD::CondCode::SETGT);
9097 return DAG.getBitcast(VT, Res);
9098 }
9099 break;
9100 case MVT::v64i8:
9101 if (Subtarget.hasVBMI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v32i16:
9105 if (Subtarget.hasBWI())
9106 Opcode = X86ISD::VPERMV;
9107 break;
9108 case MVT::v16f32:
9109 case MVT::v16i32:
9110 case MVT::v8f64:
9111 case MVT::v8i64:
9112 if (Subtarget.hasAVX512())
9113 Opcode = X86ISD::VPERMV;
9114 break;
9115 }
9116 if (!Opcode)
9117 return SDValue();
9118
9119 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9120 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9121 "Illegal variable permute shuffle type");
9122
9123 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9124 if (Scale > 1)
9125 IndicesVec = ScaleIndices(IndicesVec, Scale);
9126
9127 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9128 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9129
9130 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9131 SDValue Res = Opcode == X86ISD::VPERMV
9132 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9133 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9134 return DAG.getBitcast(VT, Res);
9135}
9136
9137// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9138// reasoned to be a permutation of a vector by indices in a non-constant vector.
9139// (build_vector (extract_elt V, (extract_elt I, 0)),
9140// (extract_elt V, (extract_elt I, 1)),
9141// ...
9142// ->
9143// (vpermv I, V)
9144//
9145// TODO: Handle undefs
9146// TODO: Utilize pshufb and zero mask blending to support more efficient
9147// construction of vectors with constant-0 elements.
9148static SDValue
9149 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9150 SelectionDAG &DAG,
9151 const X86Subtarget &Subtarget) {
9152 SDValue SrcVec, IndicesVec;
9153
9154 auto PeekThroughFreeze = [](SDValue N) {
9155 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9156 return N->getOperand(0);
9157 return N;
9158 };
9159 // Check for a match of the permute source vector and permute index elements.
9160 // This is done by checking that the i-th build_vector operand is of the form:
9161 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9162 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9163 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9164 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9165 return SDValue();
9166
9167 // If this is the first extract encountered in V, set the source vector,
9168 // otherwise verify the extract is from the previously defined source
9169 // vector.
9170 if (!SrcVec)
9171 SrcVec = Op.getOperand(0);
9172 else if (SrcVec != Op.getOperand(0))
9173 return SDValue();
9174 SDValue ExtractedIndex = Op->getOperand(1);
9175 // Peek through extends.
9176 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9177 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9178 ExtractedIndex = ExtractedIndex.getOperand(0);
9179 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9180 return SDValue();
9181
9182 // If this is the first extract from the index vector candidate, set the
9183 // indices vector, otherwise verify the extract is from the previously
9184 // defined indices vector.
9185 if (!IndicesVec)
9186 IndicesVec = ExtractedIndex.getOperand(0);
9187 else if (IndicesVec != ExtractedIndex.getOperand(0))
9188 return SDValue();
9189
9190 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9191 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9192 return SDValue();
9193 }
9194
9195 MVT VT = V.getSimpleValueType();
9196 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9197}
9198
9199SDValue
9200X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9201 SDLoc dl(Op);
9202
9203 MVT VT = Op.getSimpleValueType();
9204 MVT EltVT = VT.getVectorElementType();
9205 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9206 unsigned NumElems = Op.getNumOperands();
9207
9208 // Generate vectors for predicate vectors.
9209 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9210 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9211
9212 if (VT.getVectorElementType() == MVT::bf16 &&
9213 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9214 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9215
9216 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9217 return VectorCst;
9218
9219 unsigned EVTBits = EltVT.getSizeInBits();
9220 APInt UndefMask = APInt::getZero(NumElems);
9221 APInt FrozenUndefMask = APInt::getZero(NumElems);
9222 APInt ZeroMask = APInt::getZero(NumElems);
9223 APInt NonZeroMask = APInt::getZero(NumElems);
9224 bool IsAllConstants = true;
9225 bool OneUseFrozenUndefs = true;
9226 SmallSet<SDValue, 8> Values;
9227 unsigned NumConstants = NumElems;
9228 for (unsigned i = 0; i < NumElems; ++i) {
9229 SDValue Elt = Op.getOperand(i);
9230 if (Elt.isUndef()) {
9231 UndefMask.setBit(i);
9232 continue;
9233 }
9234 if (ISD::isFreezeUndef(Elt.getNode())) {
9235 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9236 FrozenUndefMask.setBit(i);
9237 continue;
9238 }
9239 Values.insert(Elt);
9240 if (!isIntOrFPConstant(Elt)) {
9241 IsAllConstants = false;
9242 NumConstants--;
9243 }
9244 if (X86::isZeroNode(Elt)) {
9245 ZeroMask.setBit(i);
9246 } else {
9247 NonZeroMask.setBit(i);
9248 }
9249 }
9250
9251 // All undef vector. Return an UNDEF.
9252 if (UndefMask.isAllOnes())
9253 return DAG.getUNDEF(VT);
9254
9255 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9256 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9257 return DAG.getFreeze(DAG.getUNDEF(VT));
9258
9259 // All undef/freeze(undef)/zero vector. Return a zero vector.
9260 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9261 return getZeroVector(VT, Subtarget, DAG, dl);
9262
9263 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9264 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9265 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9266 // and blend the FREEZE-UNDEF operands back in.
9267 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9268 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9269 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9270 SmallVector<int, 16> BlendMask(NumElems, -1);
9271 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9272 for (unsigned i = 0; i < NumElems; ++i) {
9273 if (UndefMask[i]) {
9274 BlendMask[i] = -1;
9275 continue;
9276 }
9277 BlendMask[i] = i;
9278 if (!FrozenUndefMask[i])
9279 Elts[i] = Op.getOperand(i);
9280 else
9281 BlendMask[i] += NumElems;
9282 }
9283 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9284 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9285 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9286 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9287 }
9288
9289 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9290
9291 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9292 // be better off lowering to a smaller build vector and padding with
9293 // undef/zero.
9294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9295 !isFoldableUseOfShuffle(BV)) {
9296 unsigned UpperElems = NumElems / 2;
9297 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9299 if (NumUpperUndefsOrZeros >= UpperElems) {
9300 if (VT.is512BitVector() &&
9301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9302 UpperElems = NumElems - (NumElems / 4);
9303 // If freeze(undef) is in any upper elements, force to zero.
9304 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9305 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9306 SDValue NewBV =
9307 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9308 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9309 }
9310 }
9311
9312 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9313 return AddSub;
9314 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9315 return HorizontalOp;
9316 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9317 return Broadcast;
9318 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9319 return BitOp;
9320 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9321 return Blend;
9322
9323 unsigned NumZero = ZeroMask.popcount();
9324 unsigned NumNonZero = NonZeroMask.popcount();
9325
9326 // If we are inserting one variable into a vector of non-zero constants, try
9327 // to avoid loading each constant element as a scalar. Load the constants as a
9328 // vector and then insert the variable scalar element. If insertion is not
9329 // supported, fall back to a shuffle to get the scalar blended with the
9330 // constants. Insertion into a zero vector is handled as a special-case
9331 // somewhere below here.
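// e.g. for (build_vector 1.0, 2.0, x, 4.0) we load <1.0, 2.0, undef, 4.0>
// from the constant pool and then insert the variable element x at index 2.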
9332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9333 FrozenUndefMask.isZero() &&
9334 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9335 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9336 // Create an all-constant vector. The variable element in the old
9337 // build vector is replaced by undef in the constant vector. Save the
9338 // variable scalar element and its index for use in the insertelement.
9339 LLVMContext &Context = *DAG.getContext();
9340 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9341 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9342 SDValue VarElt;
9343 SDValue InsIndex;
9344 for (unsigned i = 0; i != NumElems; ++i) {
9345 SDValue Elt = Op.getOperand(i);
9346 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9347 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9348 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9349 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9350 else if (!Elt.isUndef()) {
9351 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9352 "Expected one variable element in this vector");
9353 VarElt = Elt;
9354 InsIndex = DAG.getVectorIdxConstant(i, dl);
9355 }
9356 }
9357 Constant *CV = ConstantVector::get(ConstVecOps);
9358 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9359
9360 // The constants we just created may not be legal (e.g., floating point). We
9361 // must lower the vector right here because we cannot guarantee that we'll
9362 // legalize it before loading it. This is also why we could not just create
9363 // a new build vector here. If the build vector contains illegal constants,
9364 // it could get split back up into a series of insert elements.
9365 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9366 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9367 MachineFunction &MF = DAG.getMachineFunction();
9368 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9369 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9370 unsigned InsertC = InsIndex->getAsZExtVal();
9371 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9372 if (InsertC < NumEltsInLow128Bits)
9373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9374
9375 // There's no good way to insert into the high elements of a >128-bit
9376 // vector, so use shuffles to avoid an extract/insert sequence.
9377 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9378 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9379 SmallVector<int, 8> ShuffleMask;
9380 unsigned NumElts = VT.getVectorNumElements();
9381 for (unsigned i = 0; i != NumElts; ++i)
9382 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9383 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9384 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9385 }
9386
9387 // Special case for single non-zero, non-undef, element.
9388 if (NumNonZero == 1) {
9389 unsigned Idx = NonZeroMask.countr_zero();
9390 SDValue Item = Op.getOperand(Idx);
9391
9392 // If we have a constant or non-constant insertion into the low element of
9393 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9394 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9395 // depending on what the source datatype is.
9396 if (Idx == 0) {
9397 if (NumZero == 0)
9398 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9399
9400 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9401 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9402 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9403 assert((VT.is128BitVector() || VT.is256BitVector() ||
9404 VT.is512BitVector()) &&
9405 "Expected an SSE value type!");
9406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9407 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9408 // zero vector.
9409 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9410 }
9411
9412 // We can't directly insert an i8 or i16 into a vector, so zero extend
9413 // it to i32 first.
9414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9415 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9419 return DAG.getBitcast(VT, Item);
9420 }
9421 }
9422
9423 // Is it a vector logical left shift?
9424 if (NumElems == 2 && Idx == 1 &&
9425 X86::isZeroNode(Op.getOperand(0)) &&
9426 !X86::isZeroNode(Op.getOperand(1))) {
9427 unsigned NumBits = VT.getSizeInBits();
9428 return getVShift(true, VT,
9429 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9430 VT, Op.getOperand(1)),
9431 NumBits/2, DAG, *this, dl);
9432 }
9433
9434 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9435 return SDValue();
9436
9437 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9438 // is a non-constant being inserted into an element other than the low one,
9439 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9440 // movd/movss) to move this into the low element, then shuffle it into
9441 // place.
9442 if (EVTBits == 32) {
9443 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9444 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9445 }
9446 }
9447
9448 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9449 if (Values.size() == 1) {
9450 if (EVTBits == 32) {
9451 // Instead of a shuffle like this:
9452 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9453 // Check if it's possible to issue this instead.
9454 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9455 unsigned Idx = NonZeroMask.countr_zero();
9456 SDValue Item = Op.getOperand(Idx);
9457 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9458 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9459 }
9460 return SDValue();
9461 }
9462
9463 // A vector full of immediates; various special cases are already
9464 // handled, so this is best done with a single constant-pool load.
9465 if (IsAllConstants)
9466 return SDValue();
9467
9468 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9469 return V;
9470
9471 // See if we can use a vector load to get all of the elements.
9472 {
9473 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9474 if (SDValue LD =
9475 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9476 return LD;
9477 }
9478
9479 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9480 // build_vector and broadcast it.
9481 // TODO: We could probably generalize this more.
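// e.g. (build_vector a, b, a, b, a, b, a, b) is rebuilt as a 4-element
// (build_vector a, b, undef, undef), bitcast to v2i64/v2f64, broadcast, and
// then bitcast back to the original type.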
9482 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9483 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9484 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9485 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9486 // Make sure all the even/odd operands match.
9487 for (unsigned i = 2; i != NumElems; ++i)
9488 if (Ops[i % 2] != Op.getOperand(i))
9489 return false;
9490 return true;
9491 };
9492 if (CanSplat(Op, NumElems, Ops)) {
9493 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9494 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9495 // Create a new build vector and cast to v2i64/v2f64.
9496 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9497 DAG.getBuildVector(NarrowVT, dl, Ops));
9498 // Broadcast from v2i64/v2f64 and cast to final VT.
9499 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9500 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9501 NewBV));
9502 }
9503 }
9504
9505 // For AVX-length vectors, build the individual 128-bit pieces and use
9506 // shuffles to put them in place.
9507 if (VT.getSizeInBits() > 128) {
9508 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9509
9510 // Build both the lower and upper subvector.
9511 SDValue Lower =
9512 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9513 SDValue Upper = DAG.getBuildVector(
9514 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9515
9516 // Recreate the wider vector with the lower and upper part.
9517 return concatSubVectors(Lower, Upper, DAG, dl);
9518 }
9519
9520 // Let legalizer expand 2-wide build_vectors.
9521 if (EVTBits == 64) {
9522 if (NumNonZero == 1) {
9523 // One half is zero or undef.
9524 unsigned Idx = NonZeroMask.countr_zero();
9525 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9526 Op.getOperand(Idx));
9527 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9528 }
9529 return SDValue();
9530 }
9531
9532 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9533 if (EVTBits == 8 && NumElems == 16)
9534 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9535 NumZero, DAG, Subtarget))
9536 return V;
9537
9538 if (EltVT == MVT::i16 && NumElems == 8)
9539 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9540 NumZero, DAG, Subtarget))
9541 return V;
9542
9543 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9544 if (EVTBits == 32 && NumElems == 4)
9545 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9546 return V;
9547
9548 // If element VT is == 32 bits, turn it into a number of shuffles.
9549 if (NumElems == 4 && NumZero > 0) {
9550 SmallVector<SDValue, 8> Ops(NumElems);
9551 for (unsigned i = 0; i < 4; ++i) {
9552 bool isZero = !NonZeroMask[i];
9553 if (isZero)
9554 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9555 else
9556 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9557 }
9558
9559 for (unsigned i = 0; i < 2; ++i) {
9560 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9561 default: llvm_unreachable("Unexpected NonZero count");
9562 case 0:
9563 Ops[i] = Ops[i*2]; // Must be a zero vector.
9564 break;
9565 case 1:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9567 break;
9568 case 2:
9569 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 case 3:
9572 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9573 break;
9574 }
9575 }
9576
9577 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9578 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9579 int MaskVec[] = {
9580 Reverse1 ? 1 : 0,
9581 Reverse1 ? 0 : 1,
9582 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9583 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9584 };
9585 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9586 }
9587
9588 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9589
9590 // Check for a build vector from mostly shuffle plus few inserting.
9591 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9592 return Sh;
9593
9594 // For SSE 4.1, use insertps to put the high elements into the low element.
9595 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9596 SDValue Result;
9597 if (!Op.getOperand(0).isUndef())
9598 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9599 else
9600 Result = DAG.getUNDEF(VT);
9601
9602 for (unsigned i = 1; i < NumElems; ++i) {
9603 if (Op.getOperand(i).isUndef()) continue;
9604 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9605 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9606 }
9607 return Result;
9608 }
9609
9610 // Otherwise, expand into a number of unpckl*, start by extending each of
9611 // our (non-undef) elements to the full vector width with the element in the
9612 // bottom slot of the vector (which generates no code for SSE).
9613 SmallVector<SDValue, 8> Ops(NumElems);
9614 for (unsigned i = 0; i < NumElems; ++i) {
9615 if (!Op.getOperand(i).isUndef())
9616 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9617 else
9618 Ops[i] = DAG.getUNDEF(VT);
9619 }
9620
9621 // Next, we iteratively mix elements, e.g. for v4f32:
9622 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9623 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9624 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9625 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9626 // Generate scaled UNPCKL shuffle mask.
9627 SmallVector<int, 16> Mask;
9628 for(unsigned i = 0; i != Scale; ++i)
9629 Mask.push_back(i);
9630 for (unsigned i = 0; i != Scale; ++i)
9631 Mask.push_back(NumElems+i);
9632 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9633
9634 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9635 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9636 }
9637 return Ops[0];
9638}
9639
9640// 256-bit AVX can use the vinsertf128 instruction
9641// to create 256-bit vectors from two other 128-bit ones.
9642// TODO: Detect subvector broadcast here instead of DAG combine?
9643 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9644 SelectionDAG &DAG,
9645 const X86Subtarget &Subtarget) {
9646 MVT ResVT = Op.getSimpleValueType();
9647 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9648 "Value type must be 256-/512-bit wide");
9649
9650 unsigned NumOperands = Op.getNumOperands();
9651 unsigned NumFreezeUndef = 0;
9652 unsigned NumZero = 0;
9653 unsigned NumNonZero = 0;
9654 unsigned NonZeros = 0;
9655 SmallSet<SDValue, 4> Undefs;
9656 for (unsigned i = 0; i != NumOperands; ++i) {
9657 SDValue SubVec = Op.getOperand(i);
9658 if (SubVec.isUndef())
9659 continue;
9660 if (ISD::isFreezeUndef(SubVec.getNode())) {
9661 // If the freeze(undef) has multiple uses then we must fold to zero.
9662 if (SubVec.hasOneUse()) {
9663 ++NumFreezeUndef;
9664 } else {
9665 ++NumZero;
9666 Undefs.insert(SubVec);
9667 }
9668 }
9669 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9670 ++NumZero;
9671 else {
9672 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9673 NonZeros |= 1 << i;
9674 ++NumNonZero;
9675 }
9676 }
9677
9678 // If we have more than 2 non-zeros, build each half separately.
9679 if (NumNonZero > 2) {
9680 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9681 ArrayRef<SDUse> Ops = Op->ops();
9682 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9683 Ops.slice(0, NumOperands/2));
9684 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9685 Ops.slice(NumOperands/2));
9686 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9687 }
9688
9689 // Otherwise, build it up through insert_subvectors.
9690 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9691 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9692 : DAG.getUNDEF(ResVT));
9693
9694 // Replace Undef operands with ZeroVector.
9695 for (SDValue U : Undefs)
9696 DAG.ReplaceAllUsesWith(
9697 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9698
9699 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9700 unsigned NumSubElems = SubVT.getVectorNumElements();
9701 for (unsigned i = 0; i != NumOperands; ++i) {
9702 if ((NonZeros & (1 << i)) == 0)
9703 continue;
9704
9705 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9706 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9707 }
9708
9709 return Vec;
9710}
9711
9712// Returns true if the given node is a type promotion (by concatenating i1
9713// zeros) of the result of a node that already zeros all upper bits of
9714// k-register.
9715// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9716 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9717 const X86Subtarget &Subtarget,
9718 SelectionDAG & DAG) {
9719 MVT ResVT = Op.getSimpleValueType();
9720 unsigned NumOperands = Op.getNumOperands();
9721 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9722 "Unexpected number of operands in CONCAT_VECTORS");
9723
9724 uint64_t Zeros = 0;
9725 uint64_t NonZeros = 0;
9726 for (unsigned i = 0; i != NumOperands; ++i) {
9727 SDValue SubVec = Op.getOperand(i);
9728 if (SubVec.isUndef())
9729 continue;
9730 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9731 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9732 Zeros |= (uint64_t)1 << i;
9733 else
9734 NonZeros |= (uint64_t)1 << i;
9735 }
9736
9737 unsigned NumElems = ResVT.getVectorNumElements();
9738
9739 // If we are inserting a non-zero vector and there are zeros in the LSBs and undef
9740 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9741 // insert_subvector will give us two kshifts.
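// e.g. (concat_vectors v4i1 zero, v4i1 X, v4i1 undef, v4i1 undef) becomes a
// single KSHIFTL of the widened X by 4 bit positions.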
9742 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9743 Log2_64(NonZeros) != NumOperands - 1) {
9744 unsigned Idx = Log2_64(NonZeros);
9745 SDValue SubVec = Op.getOperand(Idx);
9746 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9747 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9748 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9749 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9750 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9751 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9752 DAG.getVectorIdxConstant(0, dl));
9753 }
9754
9755 // If there are zero or one non-zeros we can handle this very simply.
9756 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9757 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9758 if (!NonZeros)
9759 return Vec;
9760 unsigned Idx = Log2_64(NonZeros);
9761 SDValue SubVec = Op.getOperand(Idx);
9762 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9763 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9764 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9765 }
9766
9767 if (NumOperands > 2) {
9768 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9769 ArrayRef<SDUse> Ops = Op->ops();
9770 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9771 Ops.slice(0, NumOperands / 2));
9772 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9773 Ops.slice(NumOperands / 2));
9774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9775 }
9776
9777 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9778
9779 if (ResVT.getVectorNumElements() >= 16)
9780 return Op; // The operation is legal with KUNPCK
9781
9782 SDValue Vec =
9783 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9784 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9785 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9786 DAG.getVectorIdxConstant(NumElems / 2, dl));
9787}
9788
9789 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9790 const X86Subtarget &Subtarget,
9791 SelectionDAG &DAG) {
9792 SDLoc DL(Op);
9793 MVT VT = Op.getSimpleValueType();
9794 if (VT.getVectorElementType() == MVT::i1)
9795 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9796
9797 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9798 // from two other 128-bit ones.
9799 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9800 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9801 (VT.is512BitVector() &&
9802 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9803 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9804}
9805
9806//===----------------------------------------------------------------------===//
9807// Vector shuffle lowering
9808//
9809// This is an experimental code path for lowering vector shuffles on x86. It is
9810// designed to handle arbitrary vector shuffles and blends, gracefully
9811// degrading performance as necessary. It works hard to recognize idiomatic
9812// shuffles and lower them to optimal instruction patterns without leaving
9813// a framework that allows reasonably efficient handling of all vector shuffle
9814// patterns.
9815//===----------------------------------------------------------------------===//
9816
9817/// Checks whether the vector elements referenced by two shuffle masks are
9818/// equivalent.
9819static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9820 int Idx, int ExpectedIdx) {
9821 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9822 ExpectedIdx < MaskSize && "Out of range element index");
9823 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9824 return false;
9825
9826 EVT VT = Op.getValueType();
9827 EVT ExpectedVT = ExpectedOp.getValueType();
9828
9829 // Sources must be vectors and match the mask's element count.
9830 if (!VT.isVector() || !ExpectedVT.isVector() ||
9831 (int)VT.getVectorNumElements() != MaskSize ||
9832 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9833 return false;
9834
9835 // Exact match.
9836 if (Idx == ExpectedIdx && Op == ExpectedOp)
9837 return true;
9838
9839 switch (Op.getOpcode()) {
9840 case ISD::BUILD_VECTOR:
9841 // If the values are build vectors, we can look through them to find
9842 // equivalent inputs that make the shuffles equivalent.
9843 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9844 case ISD::BITCAST: {
9845 SDValue Src = Op.getOperand(0);
9846 EVT SrcVT = Src.getValueType();
9847 if (Op == ExpectedOp && SrcVT.isVector()) {
9848 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9849 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9850 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9851 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9852 Idx / Scale, ExpectedIdx / Scale);
9853 }
9854 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9856 for (unsigned I = 0; I != Scale; ++I)
9857 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 (Idx * Scale) + I,
9859 (ExpectedIdx * Scale) + I))
9860 return false;
9861 return true;
9862 }
9863 }
9864 break;
9865 }
9866 case ISD::VECTOR_SHUFFLE: {
9867 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9868 return Op == ExpectedOp &&
9869 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9870 }
9871 case X86ISD::VBROADCAST_LOAD:
9872 case X86ISD::VBROADCAST:
9873 return Op == ExpectedOp;
9874 case X86ISD::SUBV_BROADCAST_LOAD:
9875 if (Op == ExpectedOp) {
9876 auto *MemOp = cast<MemSDNode>(Op);
9877 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9878 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9879 }
9880 break;
9881 case X86ISD::VPERMI: {
9882 if (Op == ExpectedOp) {
9883 SmallVector<int, 8> Mask;
9884 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9885 SDValue Src = Op.getOperand(0);
9886 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9887 Mask[ExpectedIdx]);
9888 }
9889 break;
9890 }
9891 case X86ISD::HADD:
9892 case X86ISD::HSUB:
9893 case X86ISD::FHADD:
9894 case X86ISD::FHSUB:
9895 case X86ISD::PACKSS:
9896 case X86ISD::PACKUS:
9897 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9898 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9899 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9900 int NumElts = VT.getVectorNumElements();
9901 int NumLanes = VT.getSizeInBits() / 128;
9902 int NumEltsPerLane = NumElts / NumLanes;
9903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9904 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9905 bool SameElt =
9906 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9907 return SameLane && SameElt;
9908 }
9909 break;
9910 }
9911
9912 return false;
9913}
9914
9915/// Tiny helper function to identify a no-op mask.
9916///
9917/// This is a somewhat boring predicate function. It checks whether the mask
9918/// array input, which is assumed to be a single-input shuffle mask of the kind
9919/// used by the X86 shuffle instructions (not a fully general
9920/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9921/// in-place shuffle are 'no-op's.
9922static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9923 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9924 assert(Mask[i] >= -1 && "Out of bound mask element!");
9925 if (Mask[i] >= 0 && Mask[i] != i)
9926 return false;
9927 }
9928 return true;
9929}
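// Illustrative example (not part of the upstream source): for a 4-element
// mask, {0, -1, 2, 3} is a no-op since every defined element stays in place,
// while {0, 1, 3, 2} is not because elements 2 and 3 swap positions.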
9930
9931/// Test whether there are elements crossing LaneSizeInBits lanes in this
9932/// shuffle mask.
9933///
9934/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9935/// and we routinely test for these.
9936static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9937 unsigned ScalarSizeInBits,
9938 ArrayRef<int> Mask) {
9939 assert(LaneSizeInBits && ScalarSizeInBits &&
9940 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9941 "Illegal shuffle lane size");
9942 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9943 int Size = Mask.size();
9944 for (int i = 0; i < Size; ++i)
9945 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9946 return true;
9947 return false;
9948}
9949
9950/// Test whether there are elements crossing 128-bit lanes in this
9951/// shuffle mask.
9952static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9953 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9954}
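// Illustrative example (not part of the upstream source): with v8f32
// (four 32-bit elements per 128-bit lane), the mask {4,5,6,7,0,1,2,3}
// crosses lanes because element 0 reads from lane 1, whereas
// {1,0,3,2,5,4,7,6} stays within each lane.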
9955
9956/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9957/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9958/// better support 'repeated mask + lane permute' style shuffles.
9959static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9960 unsigned ScalarSizeInBits,
9961 ArrayRef<int> Mask) {
9962 assert(LaneSizeInBits && ScalarSizeInBits &&
9963 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9964 "Illegal shuffle lane size");
9965 int NumElts = Mask.size();
9966 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9967 int NumLanes = NumElts / NumEltsPerLane;
9968 if (NumLanes > 1) {
9969 for (int i = 0; i != NumLanes; ++i) {
9970 int SrcLane = -1;
9971 for (int j = 0; j != NumEltsPerLane; ++j) {
9972 int M = Mask[(i * NumEltsPerLane) + j];
9973 if (M < 0)
9974 continue;
9975 int Lane = (M % NumElts) / NumEltsPerLane;
9976 if (SrcLane >= 0 && SrcLane != Lane)
9977 return true;
9978 SrcLane = Lane;
9979 }
9980 }
9981 }
9982 return false;
9983}
9984
9985/// Test whether a shuffle mask is equivalent within each sub-lane.
9986///
9987/// This checks a shuffle mask to see if it is performing the same
9988/// lane-relative shuffle in each sub-lane. This trivially implies
9989/// that it is also not lane-crossing. It may however involve a blend from the
9990/// same lane of a second vector.
9991///
9992/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9993/// non-trivial to compute in the face of undef lanes. The representation is
9994/// suitable for use with existing 128-bit shuffles as entries from the second
9995/// vector have been remapped to [LaneSize, 2*LaneSize).
9996static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9997 ArrayRef<int> Mask,
9998 SmallVectorImpl<int> &RepeatedMask) {
9999 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10000 RepeatedMask.assign(LaneSize, -1);
10001 int Size = Mask.size();
10002 for (int i = 0; i < Size; ++i) {
10003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10004 if (Mask[i] < 0)
10005 continue;
10006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10007 // This entry crosses lanes, so there is no way to model this shuffle.
10008 return false;
10009
10010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10011 // Adjust second vector indices to start at LaneSize instead of Size.
10012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10013 : Mask[i] % LaneSize + LaneSize;
10014 if (RepeatedMask[i % LaneSize] < 0)
10015 // This is the first non-undef entry in this slot of a 128-bit lane.
10016 RepeatedMask[i % LaneSize] = LocalM;
10017 else if (RepeatedMask[i % LaneSize] != LocalM)
10018 // Found a mismatch with the repeated mask.
10019 return false;
10020 }
10021 return true;
10022}
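// Illustrative example (not part of the upstream source): for v8f32 with
// 128-bit lanes, the mask {0,9,2,11,4,13,6,15} performs the same lane-relative
// shuffle in both lanes, so this returns true with RepeatedMask = {0,5,2,7}
// (second-vector entries remapped to [LaneSize, 2*LaneSize)).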
10023
10024/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10025static bool
10026is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10027 SmallVectorImpl<int> &RepeatedMask) {
10028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10029}
10030
10031static bool
10032is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10033 SmallVector<int, 32> RepeatedMask;
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10038static bool
10039is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10040 SmallVectorImpl<int> &RepeatedMask) {
10041 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10042}
10043
10044/// Test whether a target shuffle mask is equivalent within each sub-lane.
10045/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10046static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10047 unsigned EltSizeInBits,
10048 ArrayRef<int> Mask,
10049 SmallVectorImpl<int> &RepeatedMask) {
10050 int LaneSize = LaneSizeInBits / EltSizeInBits;
10051 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10052 int Size = Mask.size();
10053 for (int i = 0; i < Size; ++i) {
10054 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10055 if (Mask[i] == SM_SentinelUndef)
10056 continue;
10057 if (Mask[i] == SM_SentinelZero) {
10058 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10059 return false;
10060 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10061 continue;
10062 }
10063 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10064 // This entry crosses lanes, so there is no way to model this shuffle.
10065 return false;
10066
10067 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10068 // later vector indices to start at multiples of LaneSize instead of Size.
10069 int LaneM = Mask[i] / Size;
10070 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10071 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10072 // This is the first non-undef entry in this slot of a 128-bit lane.
10073 RepeatedMask[i % LaneSize] = LocalM;
10074 else if (RepeatedMask[i % LaneSize] != LocalM)
10075 // Found a mismatch with the repeated mask.
10076 return false;
10077 }
10078 return true;
10079}
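// Illustrative example (not part of the upstream source): with 32-bit elements
// in 128-bit lanes, the target mask {0, SM_SentinelZero, 2, 3,
// 4, SM_SentinelZero, 6, 7} repeats per lane and yields
// RepeatedMask = {0, SM_SentinelZero, 2, 3}.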
10080
10081/// Test whether a target shuffle mask is equivalent within each sub-lane.
10082/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10083static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10084 ArrayRef<int> Mask,
10085 SmallVectorImpl<int> &RepeatedMask) {
10086 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10087 Mask, RepeatedMask);
10088}
10089
10090/// Checks whether a shuffle mask is equivalent to an explicit list of
10091/// arguments.
10092///
10093/// This is a fast way to test a shuffle mask against a fixed pattern:
10094///
10095/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10096///
10097/// It returns true if the mask is exactly as wide as the argument list, and
10098/// each element of the mask is either -1 (signifying undef) or the value given
10099/// in the argument.
10100static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10101 SDValue V1 = SDValue(),
10102 SDValue V2 = SDValue()) {
10103 int Size = Mask.size();
10104 if (Size != (int)ExpectedMask.size())
10105 return false;
10106
10107 for (int i = 0; i < Size; ++i) {
10108 assert(Mask[i] >= -1 && "Out of bound mask element!");
10109 int MaskIdx = Mask[i];
10110 int ExpectedIdx = ExpectedMask[i];
10111 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10112 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10113 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10114 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10115 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10116 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10117 return false;
10118 }
10119 }
10120 return true;
10121}
10122
10123/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10124///
10125/// The masks must be exactly the same width.
10126///
10127/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10128/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10129///
10130/// SM_SentinelZero is accepted as a valid negative index but must match in
10131/// both, or via a known bits test.
10132static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10133 ArrayRef<int> ExpectedMask,
10134 const SelectionDAG &DAG,
10135 SDValue V1 = SDValue(),
10136 SDValue V2 = SDValue()) {
10137 int Size = Mask.size();
10138 if (Size != (int)ExpectedMask.size())
10139 return false;
10140 assert(llvm::all_of(ExpectedMask,
10141 [Size](int M) {
10142 return M == SM_SentinelZero ||
10143 isInRange(M, 0, 2 * Size);
10144 }) &&
10145 "Illegal target shuffle mask");
10146
10147 // Check for out-of-range target shuffle mask indices.
10148 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10149 return false;
10150
10151 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10152 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V1.getValueType().isVector()))
10154 V1 = SDValue();
10155 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10156 !V2.getValueType().isVector()))
10157 V2 = SDValue();
10158
10159 APInt ZeroV1 = APInt::getZero(Size);
10160 APInt ZeroV2 = APInt::getZero(Size);
10161
10162 for (int i = 0; i < Size; ++i) {
10163 int MaskIdx = Mask[i];
10164 int ExpectedIdx = ExpectedMask[i];
10165 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10166 continue;
10167 // If we failed to match an expected SM_SentinelZero then early out.
10168 if (ExpectedIdx < 0)
10169 return false;
10170 if (MaskIdx == SM_SentinelZero) {
10171 // If we need this expected index to be a zero element, then update the
10172 // relevant zero mask and perform the known bits at the end to minimize
10173 // repeated computes.
10174 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10175 if (ExpectedV &&
10176 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10177 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10179 ZeroMask.setBit(BitIdx);
10180 continue;
10181 }
10182 }
10183 if (MaskIdx >= 0) {
10184 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10185 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10186 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10187 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10188 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10189 continue;
10190 }
10191 return false;
10192 }
10193 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10194 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10195}
10196
10197// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10198// instructions.
10199static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10200 const SelectionDAG &DAG) {
10201 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10202 return false;
10203
10204 SmallVector<int, 8> Unpcklwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10206 /* Unary = */ false);
10207 SmallVector<int, 8> Unpckhwd;
10208 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10209 /* Unary = */ false);
10210 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10211 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10212 return IsUnpackwdMask;
10213}
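// Illustrative example (not part of the upstream source): for v8i16,
// createUnpackShuffleMask produces {0,8,1,9,2,10,3,11} for the low unpack and
// {4,12,5,13,6,14,7,15} for the high unpack, i.e. the interleaving performed
// by vpunpcklwd / vpunpckhwd.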
10214
10215static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10216 const SelectionDAG &DAG) {
10217 // Create 128-bit vector type based on mask size.
10218 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10219 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10220
10221 // We can't assume a canonical shuffle mask, so try the commuted version too.
10222 SmallVector<int, 4> CommutedMask(Mask);
10223 ShuffleVectorSDNode::commuteMask(CommutedMask);
10224
10225 // Match any of unary/binary or low/high.
10226 for (unsigned i = 0; i != 4; ++i) {
10227 SmallVector<int, 16> UnpackMask;
10228 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10229 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10230 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10231 return true;
10232 }
10233 return false;
10234}
10235
10236/// Return true if a shuffle mask chooses elements identically in its top and
10237/// bottom halves. For example, any splat mask has the same top and bottom
10238/// halves. If an element is undefined in only one half of the mask, the halves
10239/// are not considered identical.
10240static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10241 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10242 unsigned HalfSize = Mask.size() / 2;
10243 for (unsigned i = 0; i != HalfSize; ++i) {
10244 if (Mask[i] != Mask[i + HalfSize])
10245 return false;
10246 }
10247 return true;
10248}
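// Illustrative example (not part of the upstream source): {0,5,2,7,0,5,2,7}
// has identical halves, but {0,1,2,3,0,1,2,-1} does not because element 7 is
// undef in only one half.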
10249
10250/// Get a 4-lane 8-bit shuffle immediate for a mask.
10251///
10252/// This helper function produces an 8-bit shuffle immediate corresponding to
10253/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10254/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10255/// example.
10256///
10257/// NB: We rely heavily on "undef" masks preserving the input lane.
10258static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10259 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10260 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10261 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10262 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10263 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10264
10265 // If the mask only uses one non-undef element, then fully 'splat' it to
10266 // improve later broadcast matching.
10267 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10268 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10269
10270 int FirstElt = Mask[FirstIndex];
10271 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10272 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10273
10274 unsigned Imm = 0;
10275 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10276 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10277 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10278 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10279 return Imm;
10280}
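// Illustrative example (not part of the upstream source): the mask {1,0,3,2}
// encodes as 0b10'11'00'01 = 0xB1 (the classic "swap adjacent pairs"
// PSHUFD/SHUFPS immediate), while {-1,2,-1,-1} is splatted to 0xAA so that a
// later broadcast of element 2 is easier to match.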
10281
10282static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10283 SelectionDAG &DAG) {
10284 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10285}
10286
10287// Canonicalize SHUFPD mask to improve chances of further folding.
10288// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10289static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10290 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10291 "Unexpected SHUFPD mask size");
10292 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10293 "Unexpected SHUFPD mask elements");
10294
10295 // If the mask only uses one non-undef element, then fully 'splat' it to
10296 // improve later broadcast matching.
10297 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10298 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10299 "All undef shuffle mask");
10300
10301 int FirstElt = Mask[FirstIndex];
10302 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10303 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10304 unsigned Imm = 0;
10305 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10306 Imm |= FirstElt << I;
10307 return Imm;
10308 }
10309
10310 // Attempt to keep any undef elements in place to improve chances of the
10311 // shuffle becoming a (commutative) blend.
10312 unsigned Imm = 0;
10313 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10314 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10315
10316 return Imm;
10317}
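// Illustrative example (not part of the upstream source): for a 4-element
// SHUFPD mask, {0,1,0,1} gives Imm = 0b1010, {1,-1,0,-1} keeps the undef slots
// at their parity and gives Imm = 0b1011, and the splat-like {-1,1,-1,1} is
// widened to Imm = 0b1111.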
10318
10319static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10320 SelectionDAG &DAG) {
10321 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10322}
10323
10324// The shuffle result is as follows:
10325// 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
10326// ascending order. Each Zeroable element corresponds to a particular Mask
10327// element, as described in the computeZeroableShuffleElements function.
10328//
10329// The function looks for a sub-mask whose non-zero elements are in
10330// increasing order. If such a sub-mask exists, the function returns true.
10331static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10332 ArrayRef<int> Mask, const EVT &VectorType,
10333 bool &IsZeroSideLeft) {
10334 int NextElement = -1;
10335 // Check if the Mask's nonzero elements are in increasing order.
10336 for (int i = 0, e = Mask.size(); i < e; i++) {
10337 // Checks if the mask's zeros elements are built from only zeros.
10338 assert(Mask[i] >= -1 && "Out of bound mask element!");
10339 if (Mask[i] < 0)
10340 return false;
10341 if (Zeroable[i])
10342 continue;
10343 // Find the lowest non zero element
10344 if (NextElement < 0) {
10345 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10346 IsZeroSideLeft = NextElement != 0;
10347 }
10348 // Exit if the mask's non zero elements are not in increasing order.
10349 if (NextElement != Mask[i])
10350 return false;
10351 NextElement++;
10352 }
10353 return true;
10354}
10355
10356static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10357 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10358 const X86Subtarget &Subtarget,
10359 unsigned Depth = 0);
10360
10361/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10362static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10363 ArrayRef<int> Mask, SDValue V1,
10364 SDValue V2, const APInt &Zeroable,
10365 const X86Subtarget &Subtarget,
10366 SelectionDAG &DAG) {
10367 int Size = Mask.size();
10368 int LaneSize = 128 / VT.getScalarSizeInBits();
10369 const int NumBytes = VT.getSizeInBits() / 8;
10370 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10371
10372 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10373 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10374 (Subtarget.hasBWI() && VT.is512BitVector()));
10375
10376 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10377 // Sign bit set in i8 mask means zero element.
10378 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10379
10380 SDValue V;
10381 for (int i = 0; i < NumBytes; ++i) {
10382 int M = Mask[i / NumEltBytes];
10383 if (M < 0) {
10384 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10385 continue;
10386 }
10387 if (Zeroable[i / NumEltBytes]) {
10388 PSHUFBMask[i] = ZeroMask;
10389 continue;
10390 }
10391
10392 // We can only use a single input of V1 or V2.
10393 SDValue SrcV = (M >= Size ? V2 : V1);
10394 if (V && V != SrcV)
10395 return SDValue();
10396 V = SrcV;
10397 M %= Size;
10398
10399 // PSHUFB can't cross lanes, ensure this doesn't happen.
10400 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10401 return SDValue();
10402
10403 M = M % LaneSize;
10404 M = M * NumEltBytes + (i % NumEltBytes);
10405 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10406 }
10407 assert(V && "Failed to find a source input");
10408
10409 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10410 return DAG.getBitcast(
10411 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10412 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10413}
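// Illustrative example (not part of the upstream source): a v4i32 single-input
// shuffle with mask {1,0,3,2} becomes a PSHUFB with byte control vector
// {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}, since each dword index is
// expanded to its four constituent byte indices within the 128-bit lane.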
10414
10415static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10416 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10417 const SDLoc &dl);
10418
10419// X86 has dedicated shuffle that can be lowered to VEXPAND
10420static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10421 SDValue V2, ArrayRef<int> Mask,
10422 const APInt &Zeroable,
10423 const X86Subtarget &Subtarget,
10424 SelectionDAG &DAG) {
10425 bool IsLeftZeroSide = true;
10426 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10427 IsLeftZeroSide))
10428 return SDValue();
10429 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10430 MVT IntegerType =
10431 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10432 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10433 unsigned NumElts = VT.getVectorNumElements();
10434 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10435 "Unexpected number of vector elements");
10436 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10437 Subtarget, DAG, DL);
10438 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10439 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10440 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10441}
10442
10443static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10444 unsigned &UnpackOpcode, bool IsUnary,
10445 ArrayRef<int> TargetMask, const SDLoc &DL,
10446 SelectionDAG &DAG,
10447 const X86Subtarget &Subtarget) {
10448 int NumElts = VT.getVectorNumElements();
10449
10450 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10451 for (int i = 0; i != NumElts; i += 2) {
10452 int M1 = TargetMask[i + 0];
10453 int M2 = TargetMask[i + 1];
10454 Undef1 &= (SM_SentinelUndef == M1);
10455 Undef2 &= (SM_SentinelUndef == M2);
10456 Zero1 &= isUndefOrZero(M1);
10457 Zero2 &= isUndefOrZero(M2);
10458 }
10459 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10460 "Zeroable shuffle detected");
10461
10462 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10463 SmallVector<int, 64> Unpckl, Unpckh;
10464 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10465 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10466 (IsUnary ? V1 : V2))) {
10467 UnpackOpcode = X86ISD::UNPCKL;
10468 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10469 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10470 return true;
10471 }
10472
10473 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10474 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10475 (IsUnary ? V1 : V2))) {
10476 UnpackOpcode = X86ISD::UNPCKH;
10477 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10478 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10479 return true;
10480 }
10481
10482 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10483 if (IsUnary && (Zero1 || Zero2)) {
10484 // Don't bother if we can blend instead.
10485 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10486 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10487 return false;
10488
10489 bool MatchLo = true, MatchHi = true;
10490 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10491 int M = TargetMask[i];
10492
10493 // Ignore if the input is known to be zero or the index is undef.
10494 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10495 (M == SM_SentinelUndef))
10496 continue;
10497
10498 MatchLo &= (M == Unpckl[i]);
10499 MatchHi &= (M == Unpckh[i]);
10500 }
10501
10502 if (MatchLo || MatchHi) {
10503 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10504 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10506 return true;
10507 }
10508 }
10509
10510 // If a binary shuffle, commute and try again.
10511 if (!IsUnary) {
10512 ShuffleVectorSDNode::commuteMask(Unpckl);
10513 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10514 UnpackOpcode = X86ISD::UNPCKL;
10515 std::swap(V1, V2);
10516 return true;
10517 }
10518
10519 ShuffleVectorSDNode::commuteMask(Unpckh);
10520 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10521 UnpackOpcode = X86ISD::UNPCKH;
10522 std::swap(V1, V2);
10523 return true;
10524 }
10525 }
10526
10527 return false;
10528}
10529
10530// X86 has dedicated unpack instructions that can handle specific blend
10531// operations: UNPCKH and UNPCKL.
10532static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 SmallVector<int, 8> Unpckl;
10536 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10539
10540 SmallVector<int, 8> Unpckh;
10541 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10542 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10544
10545 // Commute and try again.
10546 ShuffleVectorSDNode::commuteMask(Unpckl);
10547 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10549
10550 ShuffleVectorSDNode::commuteMask(Unpckh);
10551 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10553
10554 return SDValue();
10555}
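// Illustrative example (not part of the upstream source): for v4i32, the mask
// {0,4,1,5} matches UNPCKL(V1,V2) directly, while {6,2,7,3} only matches the
// commuted form and is lowered as UNPCKH(V2,V1).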
10556
10557/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10558/// followed by unpack 256-bit.
10559static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10560 SDValue V2, ArrayRef<int> Mask,
10561 SelectionDAG &DAG) {
10562 SmallVector<int, 32> Unpckl, Unpckh;
10563 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10564 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10565
10566 unsigned UnpackOpcode;
10567 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10568 UnpackOpcode = X86ISD::UNPCKL;
10569 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10570 UnpackOpcode = X86ISD::UNPCKH;
10571 else
10572 return SDValue();
10573
10574 // This is a "natural" unpack operation (rather than the 128-bit sectored
10575 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10576 // input in order to use the x86 instruction.
10577 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10578 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10579 V1 = DAG.getBitcast(VT, V1);
10580 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10581}
10582
10583// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10584// source into the lower elements and zeroing the upper elements.
10585static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10586 ArrayRef<int> Mask, const APInt &Zeroable,
10587 const X86Subtarget &Subtarget) {
10588 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10589 return false;
10590
10591 unsigned NumElts = Mask.size();
10592 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10593 unsigned MaxScale = 64 / EltSizeInBits;
10594
10595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10596 unsigned SrcEltBits = EltSizeInBits * Scale;
10597 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10598 continue;
10599 unsigned NumSrcElts = NumElts / Scale;
10600 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10601 continue;
10602 unsigned UpperElts = NumElts - NumSrcElts;
10603 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10604 continue;
10605 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10606 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10607 DstVT = MVT::getIntegerVT(EltSizeInBits);
10608 if ((NumSrcElts * EltSizeInBits) >= 128) {
10609 // ISD::TRUNCATE
10610 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10611 } else {
10612 // X86ISD::VTRUNC
10613 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10614 }
10615 return true;
10616 }
10617
10618 return false;
10619}
10620
10621// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10622// element padding to the final DstVT.
10623static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10624 const X86Subtarget &Subtarget,
10625 SelectionDAG &DAG, bool ZeroUppers) {
10626 MVT SrcVT = Src.getSimpleValueType();
10627 MVT DstSVT = DstVT.getScalarType();
10628 unsigned NumDstElts = DstVT.getVectorNumElements();
10629 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10630 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10631
10632 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10633 return SDValue();
10634
10635 // Perform a direct ISD::TRUNCATE if possible.
10636 if (NumSrcElts == NumDstElts)
10637 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10638
10639 if (NumSrcElts > NumDstElts) {
10640 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10641 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10642 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10643 }
10644
10645 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10649 DstVT.getSizeInBits());
10650 }
10651
10652 // Non-VLX targets must truncate from a 512-bit type, so we need to
10653 // widen, truncate and then possibly extract the original subvector.
10654 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10655 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10656 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10657 }
10658
10659 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10660 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10661 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10662 if (DstVT != TruncVT)
10663 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10664 DstVT.getSizeInBits());
10665 return Trunc;
10666}
10667
10668// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10669//
10670// An example is the following:
10671//
10672// t0: ch = EntryToken
10673// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10674// t25: v4i32 = truncate t2
10675// t41: v8i16 = bitcast t25
10676// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10677// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10678// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10679// t18: v2i64 = bitcast t51
10680//
10681// One can just use a single vpmovdw instruction; without avx512vl we need to
10682// use the zmm variant and extract the lower subvector, padding with zeroes.
10683// TODO: Merge with lowerShuffleAsVTRUNC.
10684static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10685 SDValue V2, ArrayRef<int> Mask,
10686 const APInt &Zeroable,
10687 const X86Subtarget &Subtarget,
10688 SelectionDAG &DAG) {
10689 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10690 if (!Subtarget.hasAVX512())
10691 return SDValue();
10692
10693 unsigned NumElts = VT.getVectorNumElements();
10694 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10695 unsigned MaxScale = 64 / EltSizeInBits;
10696 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10697 unsigned SrcEltBits = EltSizeInBits * Scale;
10698 unsigned NumSrcElts = NumElts / Scale;
10699 unsigned UpperElts = NumElts - NumSrcElts;
10700 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10701 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10702 continue;
10703
10704 // Attempt to find a matching source truncation, but as a fall back VLX
10705 // cases can use the VPMOV directly.
10706 SDValue Src = peekThroughBitcasts(V1);
10707 if (Src.getOpcode() == ISD::TRUNCATE &&
10708 Src.getScalarValueSizeInBits() == SrcEltBits) {
10709 Src = Src.getOperand(0);
10710 } else if (Subtarget.hasVLX()) {
10711 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10713 Src = DAG.getBitcast(SrcVT, Src);
10714 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10715 if (Scale == 2 &&
10716 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10717 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10718 return SDValue();
10719 } else
10720 return SDValue();
10721
10722 // VPMOVWB is only available with avx512bw.
10723 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10724 return SDValue();
10725
10726 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10727 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10728 }
10729
10730 return SDValue();
10731}
10732
10733// Attempt to match binary shuffle patterns as a truncate.
10734static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10735 SDValue V2, ArrayRef<int> Mask,
10736 const APInt &Zeroable,
10737 const X86Subtarget &Subtarget,
10738 SelectionDAG &DAG) {
10739 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10740 "Unexpected VTRUNC type");
10741 if (!Subtarget.hasAVX512() ||
10742 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10743 return SDValue();
10744
10745 unsigned NumElts = VT.getVectorNumElements();
10746 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10747 unsigned MaxScale = 64 / EltSizeInBits;
10748 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10749 // TODO: Support non-BWI VPMOVWB truncations?
10750 unsigned SrcEltBits = EltSizeInBits * Scale;
10751 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10752 continue;
10753
10754 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10755 // Bail if the V2 elements are undef.
10756 unsigned NumHalfSrcElts = NumElts / Scale;
10757 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10758 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10759 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10760 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10761 continue;
10762
10763 // The elements beyond the truncation must be undef/zero.
10764 unsigned UpperElts = NumElts - NumSrcElts;
10765 if (UpperElts > 0 &&
10766 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10767 continue;
10768 bool UndefUppers =
10769 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10770
10771 // As we're using both sources then we need to concat them together
10772 // and truncate from the double-sized src.
10773 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10774
10775 // For offset truncations, ensure that the concat is cheap.
10776 SDValue Src =
10777 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10778 if (!Src) {
10779 if (Offset)
10780 continue;
10781 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10782 }
10783
10784 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10785 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10786 Src = DAG.getBitcast(SrcVT, Src);
10787
10788 // Shift the offset'd elements into place for the truncation.
10789 // TODO: Use getTargetVShiftByConstNode.
10790 if (Offset)
10791 Src = DAG.getNode(
10792 X86ISD::VSRLI, DL, SrcVT, Src,
10793 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10794
10795 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10796 }
10797 }
10798
10799 return SDValue();
10800}
10801
10802/// Check whether a compaction lowering can be done by dropping even/odd
10803/// elements and compute how many times even/odd elements must be dropped.
10804///
10805/// This handles shuffles which take every Nth element where N is a power of
10806/// two. Example shuffle masks:
10807///
10808/// (even)
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10810/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10811/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10812/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10813/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10814/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10815///
10816/// (odd)
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10818/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10819///
10820/// Any of these lanes can of course be undef.
10821///
10822/// This routine only supports N <= 3.
10823/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10824/// for larger N.
10825///
10826/// \returns N above, or the number of times even/odd elements must be dropped
10827/// if there is such a number. Otherwise returns zero.
10828static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10829 bool IsSingleInput) {
10830 // The modulus for the shuffle vector entries is based on whether this is
10831 // a single input or not.
10832 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10833 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10834 "We should only be called with masks with a power-of-2 size!");
10835
10836 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10837 int Offset = MatchEven ? 0 : 1;
10838
10839 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10840 // and 2^3 simultaneously. This is because we may have ambiguity with
10841 // partially undef inputs.
10842 bool ViableForN[3] = {true, true, true};
10843
10844 for (int i = 0, e = Mask.size(); i < e; ++i) {
10845 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10846 // want.
10847 if (Mask[i] < 0)
10848 continue;
10849
10850 bool IsAnyViable = false;
10851 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10852 if (ViableForN[j]) {
10853 uint64_t N = j + 1;
10854
10855 // The shuffle mask must be equal to (i * 2^N) % M.
10856 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10857 IsAnyViable = true;
10858 else
10859 ViableForN[j] = false;
10860 }
10861 // Early exit if we exhaust the possible powers of two.
10862 if (!IsAnyViable)
10863 break;
10864 }
10865
10866 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10867 if (ViableForN[j])
10868 return j + 1;
10869
10870 // Return 0 as there is no viable power of two.
10871 return 0;
10872}
10873
10874// X86 has dedicated pack instructions that can handle specific truncation
10875// operations: PACKSS and PACKUS.
10876// Checks for compaction shuffle masks if MaxStages > 1.
10877// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10878static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10879 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10880 const SelectionDAG &DAG,
10881 const X86Subtarget &Subtarget,
10882 unsigned MaxStages = 1) {
10883 unsigned NumElts = VT.getVectorNumElements();
10884 unsigned BitSize = VT.getScalarSizeInBits();
10885 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10886 "Illegal maximum compaction");
10887
10888 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10889 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10890 unsigned NumPackedBits = NumSrcBits - BitSize;
10891 N1 = peekThroughBitcasts(N1);
10892 N2 = peekThroughBitcasts(N2);
10893 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10894 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10895 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10896 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10897 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10898 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10899 return false;
10900 if (Subtarget.hasSSE41() || BitSize == 8) {
10901 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10902 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10903 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10904 V1 = N1;
10905 V2 = N2;
10906 SrcVT = PackVT;
10907 PackOpcode = X86ISD::PACKUS;
10908 return true;
10909 }
10910 }
10911 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10912 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10913 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10914 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10915 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10916 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10917 V1 = N1;
10918 V2 = N2;
10919 SrcVT = PackVT;
10920 PackOpcode = X86ISD::PACKSS;
10921 return true;
10922 }
10923 return false;
10924 };
10925
10926 // Attempt to match against wider and wider compaction patterns.
10927 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10928 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10929 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10930
10931 // Try binary shuffle.
10932 SmallVector<int, 32> BinaryMask;
10933 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10934 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10935 if (MatchPACK(V1, V2, PackVT))
10936 return true;
10937
10938 // Try unary shuffle.
10939 SmallVector<int, 32> UnaryMask;
10940 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10941 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10942 if (MatchPACK(V1, V1, PackVT))
10943 return true;
10944 }
10945
10946 return false;
10947}
10948
10949static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10950 SDValue V2, ArrayRef<int> Mask,
10951 const X86Subtarget &Subtarget,
10952 SelectionDAG &DAG) {
10953 MVT PackVT;
10954 unsigned PackOpcode;
10955 unsigned SizeBits = VT.getSizeInBits();
10956 unsigned EltBits = VT.getScalarSizeInBits();
10957 unsigned MaxStages = Log2_32(64 / EltBits);
10958 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10959 Subtarget, MaxStages))
10960 return SDValue();
10961
10962 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10963 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10964
10965 // Don't lower multi-stage packs on AVX512, truncation is better.
10966 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10967 return SDValue();
10968
10969 // Pack to the largest type possible:
10970 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10971 unsigned MaxPackBits = 16;
10972 if (CurrentEltBits > 16 &&
10973 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10974 MaxPackBits = 32;
10975
10976 // Repeatedly pack down to the target size.
10977 SDValue Res;
10978 for (unsigned i = 0; i != NumStages; ++i) {
10979 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10980 unsigned NumSrcElts = SizeBits / SrcEltBits;
10981 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10982 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10983 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10984 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10985 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10986 DAG.getBitcast(SrcVT, V2));
10987 V1 = V2 = Res;
10988 CurrentEltBits /= 2;
10989 }
10990 assert(Res && Res.getValueType() == VT &&
10991 "Failed to lower compaction shuffle");
10992 return Res;
10993}
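// Illustrative example (not part of the upstream source): a compaction from
// 32-bit source elements down to i8 (NumStages == 2) is emitted as a PACK*SDW
// stage followed by a PACK*SWB stage, halving the element width each time
// until the byte result is reached.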
10994
10995/// Try to emit a bitmask instruction for a shuffle.
10996///
10997/// This handles cases where we can model a blend exactly as a bitmask due to
10998/// one of the inputs being zeroable.
10999static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11000 SDValue V2, ArrayRef<int> Mask,
11001 const APInt &Zeroable,
11002 const X86Subtarget &Subtarget,
11003 SelectionDAG &DAG) {
11004 MVT MaskVT = VT;
11005 MVT EltVT = VT.getVectorElementType();
11006 SDValue Zero, AllOnes;
11007 // Use f64 if i64 isn't legal.
11008 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11009 EltVT = MVT::f64;
11010 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11011 }
11012
11013 MVT LogicVT = VT;
11014 if (EltVT.isFloatingPoint()) {
11015 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11016 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11017 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11018 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11019 } else {
11020 Zero = DAG.getConstant(0, DL, EltVT);
11021 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11022 }
11023
11024 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11025 SDValue V;
11026 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11027 if (Zeroable[i])
11028 continue;
11029 if (Mask[i] % Size != i)
11030 return SDValue(); // Not a blend.
11031 if (!V)
11032 V = Mask[i] < Size ? V1 : V2;
11033 else if (V != (Mask[i] < Size ? V1 : V2))
11034 return SDValue(); // Can only let one input through the mask.
11035
11036 VMaskOps[i] = AllOnes;
11037 }
11038 if (!V)
11039 return SDValue(); // No non-zeroable elements!
11040
11041 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11042 VMask = DAG.getBitcast(LogicVT, VMask);
11043 V = DAG.getBitcast(LogicVT, V);
11044 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11045 return DAG.getBitcast(VT, And);
11046}
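// Illustrative example (not part of the upstream source): a v4i32 shuffle with
// mask {0,5,6,3} where elements 1 and 2 are zeroable reduces to
// AND(V1, {-1,0,0,-1}), keeping V1's elements 0 and 3 and zeroing the rest.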
11047
11048/// Try to emit a blend instruction for a shuffle using bit math.
11049///
11050/// This is used as a fallback approach when first class blend instructions are
11051/// unavailable. Currently it is only suitable for integer vectors, but could
11052/// be generalized for floating point vectors if desirable.
11053static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11054 SDValue V2, ArrayRef<int> Mask,
11055 SelectionDAG &DAG) {
11056 assert(VT.isInteger() && "Only supports integer vector types!");
11057 MVT EltVT = VT.getVectorElementType();
11058 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11059 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11060 SmallVector<SDValue, 16> MaskOps;
11061 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11062 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11063 return SDValue(); // Shuffled input!
11064 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11065 }
11066
11067 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11068 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11069}
11070
11071static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11072 SDValue PreservedSrc,
11073 const X86Subtarget &Subtarget,
11074 SelectionDAG &DAG);
11075
11076static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11077 MutableArrayRef<int> Mask,
11078 const APInt &Zeroable, bool &ForceV1Zero,
11079 bool &ForceV2Zero, uint64_t &BlendMask) {
11080 bool V1IsZeroOrUndef =
11081 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11082 bool V2IsZeroOrUndef =
11083 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11084
11085 BlendMask = 0;
11086 ForceV1Zero = false, ForceV2Zero = false;
11087 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11088
11089 int NumElts = Mask.size();
11090 int NumLanes = VT.getSizeInBits() / 128;
11091 int NumEltsPerLane = NumElts / NumLanes;
11092 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11093
11094 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11095 // then ensure the blend mask part for that lane just references that input.
11096 bool ForceWholeLaneMasks =
11097 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11098
11099 // Attempt to generate the binary blend mask. If an input is zero then
11100 // we can use any lane.
11101 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11102 // Keep track of the inputs used per lane.
11103 bool LaneV1InUse = false;
11104 bool LaneV2InUse = false;
11105 uint64_t LaneBlendMask = 0;
11106 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11107 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11108 int M = Mask[Elt];
11109 if (M == SM_SentinelUndef)
11110 continue;
11111 if (M == Elt || (0 <= M && M < NumElts &&
11112 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11113 Mask[Elt] = Elt;
11114 LaneV1InUse = true;
11115 continue;
11116 }
11117 if (M == (Elt + NumElts) ||
11118 (NumElts <= M &&
11119 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 if (Zeroable[Elt]) {
11126 if (V1IsZeroOrUndef) {
11127 ForceV1Zero = true;
11128 Mask[Elt] = Elt;
11129 LaneV1InUse = true;
11130 continue;
11131 }
11132 if (V2IsZeroOrUndef) {
11133 ForceV2Zero = true;
11134 LaneBlendMask |= 1ull << LaneElt;
11135 Mask[Elt] = Elt + NumElts;
11136 LaneV2InUse = true;
11137 continue;
11138 }
11139 }
11140 return false;
11141 }
11142
11143 // If we only used V2 then splat the lane blend mask to avoid any demanded
11144 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11145 // blend mask bit).
11146 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11147 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11148
11149 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11150 }
11151 return true;
11152}
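// Illustrative example (not part of the upstream source): for v4i32, the mask
// {0,5,2,7} is a pure blend taking elements 1 and 3 from V2, so BlendMask
// becomes 0b1010; a mask such as {1,5,2,7} is rejected unless V1's elements 0
// and 1 can be proven equivalent, because element 0 would otherwise require a
// shuffle of V1 rather than a blend.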
11153
11154/// Try to emit a blend instruction for a shuffle.
11155///
11156/// This doesn't do any checks for the availability of instructions for blending
11157/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11158/// be matched in the backend with the type given. What it does check for is
11159/// that the shuffle mask is a blend, or convertible into a blend with zero.
11160static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11161 SDValue V2, ArrayRef<int> Original,
11162 const APInt &Zeroable,
11163 const X86Subtarget &Subtarget,
11164 SelectionDAG &DAG) {
11165 uint64_t BlendMask = 0;
11166 bool ForceV1Zero = false, ForceV2Zero = false;
11167 SmallVector<int, 64> Mask(Original);
11168 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11169 BlendMask))
11170 return SDValue();
11171
11172 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11173 if (ForceV1Zero)
11174 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11175 if (ForceV2Zero)
11176 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11177
11178 unsigned NumElts = VT.getVectorNumElements();
11179
11180 switch (VT.SimpleTy) {
11181 case MVT::v4i64:
11182 case MVT::v8i32:
11183 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11184 [[fallthrough]];
11185 case MVT::v4f64:
11186 case MVT::v8f32:
11187 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11188 [[fallthrough]];
11189 case MVT::v2f64:
11190 case MVT::v2i64:
11191 case MVT::v4f32:
11192 case MVT::v4i32:
11193 case MVT::v8i16:
11194 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11195 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11196 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11197 case MVT::v16i16: {
11198 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11199 SmallVector<int, 8> RepeatedMask;
11200 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11201 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11202 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11203 BlendMask = 0;
11204 for (int i = 0; i < 8; ++i)
11205 if (RepeatedMask[i] >= 8)
11206 BlendMask |= 1ull << i;
11207 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11208 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11209 }
11210 // Use PBLENDW for lower/upper lanes and then blend lanes.
11211 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11212 // merge to VSELECT where useful.
11213 uint64_t LoMask = BlendMask & 0xFF;
11214 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11215 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11216 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11217 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11218 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11219 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11220 return DAG.getVectorShuffle(
11221 MVT::v16i16, DL, Lo, Hi,
11222 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11223 }
11224 [[fallthrough]];
11225 }
11226 case MVT::v32i8:
11227 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11228 [[fallthrough]];
11229 case MVT::v16i8: {
11230 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11231
11232 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11233 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11234 Subtarget, DAG))
11235 return Masked;
11236
11237 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11238 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11239 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11240 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11241 }
11242
11243 // If we have VPTERNLOG, we can use that as a bit blend.
11244 if (Subtarget.hasVLX())
11245 if (SDValue BitBlend =
11246 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11247 return BitBlend;
11248
11249 // Scale the blend by the number of bytes per element.
11250 int Scale = VT.getScalarSizeInBits() / 8;
11251
11252 // This form of blend is always done on bytes. Compute the byte vector
11253 // type.
11254 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11255
11256 // x86 allows load folding with blendvb from the 2nd source operand. But
11257 // we are still using LLVM select here (see comment below), so that's V1.
11258 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11259 // allow that load-folding possibility.
11260 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11261 ShuffleVectorSDNode::commuteMask(Mask);
11262 std::swap(V1, V2);
11263 }
11264
11265 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11266 // mix of LLVM's code generator and the x86 backend. We tell the code
11267 // generator that boolean values in the elements of an x86 vector register
11268 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11269 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11270 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11271 // of the element (the remaining are ignored) and 0 in that high bit would
11272 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11273 // the LLVM model for boolean values in vector elements gets the relevant
11274 // bit set, it is set backwards and over constrained relative to x86's
11275 // actual model.
11276 SmallVector<SDValue, 32> VSELECTMask;
11277 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11278 for (int j = 0; j < Scale; ++j)
11279 VSELECTMask.push_back(
11280 Mask[i] < 0
11281 ? DAG.getUNDEF(MVT::i8)
11282 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11283
11284 V1 = DAG.getBitcast(BlendVT, V1);
11285 V2 = DAG.getBitcast(BlendVT, V2);
11286 return DAG.getBitcast(
11287 VT,
11288 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11289 V1, V2));
11290 }
11291 case MVT::v16f32:
11292 case MVT::v8f64:
11293 case MVT::v8i64:
11294 case MVT::v16i32:
11295 case MVT::v32i16:
11296 case MVT::v64i8: {
11297 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11298 bool OptForSize = DAG.shouldOptForSize();
11299 if (!OptForSize) {
11300 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11301 Subtarget, DAG))
11302 return Masked;
11303 }
11304
11305 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11306 // masked move.
11307 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11308 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11309 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11310 }
11311 default:
11312 llvm_unreachable("Not a supported integer vector type!");
11313 }
11314}
11315
11316/// Try to lower as a blend of elements from two inputs followed by
11317/// a single-input permutation.
11318///
11319/// This matches the pattern where we can blend elements from two inputs and
11320/// then reduce the shuffle to a single-input permutation.
11321static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11322 SDValue V1, SDValue V2,
11323 ArrayRef<int> Mask,
11324 SelectionDAG &DAG,
11325 bool ImmBlends = false) {
11326 // We build up the blend mask while checking whether a blend is a viable way
11327 // to reduce the shuffle.
11328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11330
11331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11332 if (Mask[i] < 0)
11333 continue;
11334
11335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11336
11337 if (BlendMask[Mask[i] % Size] < 0)
11338 BlendMask[Mask[i] % Size] = Mask[i];
11339 else if (BlendMask[Mask[i] % Size] != Mask[i])
11340 return SDValue(); // Can't blend in the needed input!
11341
11342 PermuteMask[i] = Mask[i] % Size;
11343 }
11344
11345 // If only immediate blends, then bail if the blend mask can't be widened to
11346 // i16.
11347 unsigned EltSize = VT.getScalarSizeInBits();
11348 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11349 return SDValue();
11350
11351 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11352 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11353}
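// Illustrative example (not part of the upstream source): for v4i32, the mask
// {1,4,3,6} is handled by first blending to {4,1,6,3}
// (V2[0],V1[1],V2[2],V1[3]) and then permuting that single result with
// {1,0,3,2}, reproducing the original shuffle.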
11354
11355/// Try to lower as an unpack of elements from two inputs followed by
11356/// a single-input permutation.
11357///
11358/// This matches the pattern where we can unpack elements from two inputs and
11359/// then reduce the shuffle to a single-input (wider) permutation.
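/// For example, the v4i32 shuffle <1,4,0,5> can be lowered as UNPCKL(V1, V2)
/// followed by the single-input permute <2,1,0,3>.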
11360 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11361 SDValue V1, SDValue V2,
11362 ArrayRef<int> Mask,
11363 SelectionDAG &DAG) {
11364 int NumElts = Mask.size();
11365 int NumLanes = VT.getSizeInBits() / 128;
11366 int NumLaneElts = NumElts / NumLanes;
11367 int NumHalfLaneElts = NumLaneElts / 2;
11368
11369 bool MatchLo = true, MatchHi = true;
11370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11371
11372 // Determine UNPCKL/UNPCKH type and operand order.
11373 for (int Elt = 0; Elt != NumElts; ++Elt) {
11374 int M = Mask[Elt];
11375 if (M < 0)
11376 continue;
11377
11378 // Normalize the mask value depending on whether it's V1 or V2.
11379 int NormM = M;
11380 SDValue &Op = Ops[Elt & 1];
11381 if (M < NumElts && (Op.isUndef() || Op == V1))
11382 Op = V1;
11383 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11384 Op = V2;
11385 NormM -= NumElts;
11386 } else
11387 return SDValue();
11388
11389 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11390 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11391 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11392 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11393 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11394 if (MatchLoAnyLane || MatchHiAnyLane) {
11395 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11396 "Failed to match UNPCKLO/UNPCKHI");
11397 break;
11398 }
11399 }
11400 MatchLo &= MatchLoAnyLane;
11401 MatchHi &= MatchHiAnyLane;
11402 if (!MatchLo && !MatchHi)
11403 return SDValue();
11404 }
11405 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11406
11407 // Element indices have changed after unpacking. Calculate permute mask
11408 // so that they will be put back to the position as dictated by the
11409 // original shuffle mask indices.
11410 SmallVector<int, 32> PermuteMask(NumElts, -1);
11411 for (int Elt = 0; Elt != NumElts; ++Elt) {
11412 int M = Mask[Elt];
11413 if (M < 0)
11414 continue;
11415 int NormM = M;
11416 if (NumElts <= M)
11417 NormM -= NumElts;
11418 bool IsFirstOp = M < NumElts;
11419 int BaseMaskElt =
11420 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11421 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11422 PermuteMask[Elt] = BaseMaskElt;
11423 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11424 PermuteMask[Elt] = BaseMaskElt + 1;
11425 assert(PermuteMask[Elt] != -1 &&
11426 "Input mask element is defined but failed to assign permute mask");
11427 }
11428
11429 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11430 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11431 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11432}
11433
11434/// Try to lower a shuffle as a permute of the inputs followed by an
11435/// UNPCK instruction.
11436///
11437/// This specifically targets cases where we end up with alternating between
11438/// the two inputs, and so can permute them into something that feeds a single
11439/// UNPCK instruction. Note that this routine only targets integer vectors
11440/// because for floating point vectors we have a generalized SHUFPS lowering
11441/// strategy that handles everything that doesn't *exactly* match an unpack,
11442/// making this clever lowering unnecessary.
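/// For example, the v4i32 shuffle <3,7,1,5> can be lowered by permuting each
/// input with <3,1,u,u> and then taking UNPCKL of the two permuted vectors.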
11443 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11444 SDValue V1, SDValue V2,
11445 ArrayRef<int> Mask,
11446 const X86Subtarget &Subtarget,
11447 SelectionDAG &DAG) {
11448 int Size = Mask.size();
11449 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11450
11451 // This routine only supports 128-bit integer dual input vectors.
11452 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11453 return SDValue();
11454
11455 int NumLoInputs =
11456 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11457 int NumHiInputs =
11458 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11459
11460 bool UnpackLo = NumLoInputs >= NumHiInputs;
11461
11462 auto TryUnpack = [&](int ScalarSize, int Scale) {
11463 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11464 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11465
11466 for (int i = 0; i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 // Each element of the unpack contains Scale elements from this mask.
11471 int UnpackIdx = i / Scale;
11472
11473 // We only handle the case where V1 feeds the first slots of the unpack.
11474 // We rely on canonicalization to ensure this is the case.
11475 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11476 return SDValue();
11477
11478 // Setup the mask for this input. The indexing is tricky as we have to
11479 // handle the unpack stride.
11480 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11481 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11482 Mask[i] % Size;
11483 }
11484
11485 // If we will have to shuffle both inputs to use the unpack, check whether
11486 // we can just unpack first and shuffle the result. If so, skip this unpack.
11487 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11488 !isNoopShuffleMask(V2Mask))
11489 return SDValue();
11490
11491 // Shuffle the inputs into place.
11492 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11493 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11494
11495 // Cast the inputs to the type we will use to unpack them.
11496 MVT UnpackVT =
11497 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11498 V1 = DAG.getBitcast(UnpackVT, V1);
11499 V2 = DAG.getBitcast(UnpackVT, V2);
11500
11501 // Unpack the inputs and cast the result back to the desired type.
11502 return DAG.getBitcast(
11503 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11504 UnpackVT, V1, V2));
11505 };
11506
11507 // We try each unpack from the largest to the smallest to try and find one
11508 // that fits this mask.
11509 int OrigScalarSize = VT.getScalarSizeInBits();
11510 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11511 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11512 return Unpack;
11513
11514 // If we're shuffling with a zero vector then we're better off not doing
11515 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11516 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11517 ISD::isBuildVectorAllZeros(V2.getNode()))
11518 return SDValue();
11519
11520 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11521 // initial unpack.
11522 if (NumLoInputs == 0 || NumHiInputs == 0) {
11523 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11524 "We have to have *some* inputs!");
11525 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11526
11527 // FIXME: We could consider the total complexity of the permute of each
11528 // possible unpacking. Or at the least we should consider how many
11529 // half-crossings are created.
11530 // FIXME: We could consider commuting the unpacks.
11531
11532 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11533 for (int i = 0; i < Size; ++i) {
11534 if (Mask[i] < 0)
11535 continue;
11536
11537 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11538
11539 PermMask[i] =
11540 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11541 }
11542 return DAG.getVectorShuffle(
11543 VT, DL,
11544 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11545 V1, V2),
11546 DAG.getUNDEF(VT), PermMask);
11547 }
11548
11549 return SDValue();
11550}
11551
11552/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11553/// permuting the elements of the result in place.
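/// For example, the v8i16 shuffle <2,8,3,9,u,u,u,u> needs V1[2..3] and
/// V2[0..1]; PALIGNR(V2, V1, 4) makes both ranges available in one register,
/// after which the single in-lane permute <0,6,1,7,u,u,u,u> produces the
/// result.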
11554 static SDValue lowerShuffleAsByteRotateAndPermute(
11555 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11557 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11558 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11559 (VT.is512BitVector() && !Subtarget.hasBWI()))
11560 return SDValue();
11561
11562 // We don't currently support lane crossing permutes.
11563 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11564 return SDValue();
11565
11566 int Scale = VT.getScalarSizeInBits() / 8;
11567 int NumLanes = VT.getSizeInBits() / 128;
11568 int NumElts = VT.getVectorNumElements();
11569 int NumEltsPerLane = NumElts / NumLanes;
11570
11571 // Determine range of mask elts.
11572 bool Blend1 = true;
11573 bool Blend2 = true;
11574 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11575 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11576 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11577 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11578 int M = Mask[Lane + Elt];
11579 if (M < 0)
11580 continue;
11581 if (M < NumElts) {
11582 Blend1 &= (M == (Lane + Elt));
11583 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11584 M = M % NumEltsPerLane;
11585 Range1.first = std::min(Range1.first, M);
11586 Range1.second = std::max(Range1.second, M);
11587 } else {
11588 M -= NumElts;
11589 Blend2 &= (M == (Lane + Elt));
11590 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11591 M = M % NumEltsPerLane;
11592 Range2.first = std::min(Range2.first, M);
11593 Range2.second = std::max(Range2.second, M);
11594 }
11595 }
11596 }
11597
11598 // Bail if we don't need both elements.
11599 // TODO - it might be worth doing this for unary shuffles if the permute
11600 // can be widened.
11601 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11602 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11603 return SDValue();
11604
11605 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11606 return SDValue();
11607
11608 // Rotate the 2 ops so we can access both ranges, then permute the result.
11609 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11610 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11611 SDValue Rotate = DAG.getBitcast(
11612 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11613 DAG.getBitcast(ByteVT, Lo),
11614 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11615 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11616 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11617 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11618 int M = Mask[Lane + Elt];
11619 if (M < 0)
11620 continue;
11621 if (M < NumElts)
11622 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11623 else
11624 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11625 }
11626 }
11627 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11628 };
11629
11630 // Check if the ranges are small enough to rotate from either direction.
11631 if (Range2.second < Range1.first)
11632 return RotateAndPermute(V1, V2, Range1.first, 0);
11633 if (Range1.second < Range2.first)
11634 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11635 return SDValue();
11636}
11637
11638 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11639 return isUndefOrEqual(Mask, 0);
11640}
11641
11642 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11643 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11644}
11645
11646/// Check if the Mask consists of the same element repeated multiple times.
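/// For example, <5,5,u,5> is a single repeated element, while <5,u,u,u> is
/// rejected because more than half of the mask is undef and <5,5,3,5> is
/// rejected because it contains two distinct elements.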
11647 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11648 size_t NumUndefs = 0;
11649 std::optional<int> UniqueElt;
11650 for (int Elt : Mask) {
11651 if (Elt == SM_SentinelUndef) {
11652 NumUndefs++;
11653 continue;
11654 }
11655 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11656 return false;
11657 UniqueElt = Elt;
11658 }
11659 // Make sure the element is repeated enough times by checking the number of
11660 // undefs is small.
11661 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11662}
11663
11664/// Generic routine to decompose a shuffle and blend into independent
11665/// blends and permutes.
11666///
11667/// This matches the extremely common pattern for handling combined
11668/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11669/// operations. It will try to pick the best arrangement of shuffles and
11670/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
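/// For example, the v4i32 shuffle <2,4,3,7> is decomposed into the
/// single-input permutes <2,u,3,u> of V1 and <u,0,u,3> of V2, blended
/// together with <0,5,2,7>.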
11671 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11672 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11673 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11674 int NumElts = Mask.size();
11675 int NumLanes = VT.getSizeInBits() / 128;
11676 int NumEltsPerLane = NumElts / NumLanes;
11677
11678 // Shuffle the input elements into the desired positions in V1 and V2 and
11679 // unpack/blend them together.
11680 bool IsAlternating = true;
11681 bool V1Zero = true, V2Zero = true;
11682 SmallVector<int, 32> V1Mask(NumElts, -1);
11683 SmallVector<int, 32> V2Mask(NumElts, -1);
11684 SmallVector<int, 32> FinalMask(NumElts, -1);
11685 for (int i = 0; i < NumElts; ++i) {
11686 int M = Mask[i];
11687 if (M >= 0 && M < NumElts) {
11688 V1Mask[i] = M;
11689 FinalMask[i] = i;
11690 V1Zero &= Zeroable[i];
11691 IsAlternating &= (i & 1) == 0;
11692 } else if (M >= NumElts) {
11693 V2Mask[i] = M - NumElts;
11694 FinalMask[i] = i + NumElts;
11695 V2Zero &= Zeroable[i];
11696 IsAlternating &= (i & 1) == 1;
11697 }
11698 }
11699
11700 // If we effectively only demand the 0'th element of \p Input, and not only
11701 // as 0'th element, then broadcast said input,
11702 // and change \p InputMask to be a no-op (identity) mask.
11703 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11704 &DAG](SDValue &Input,
11705 MutableArrayRef<int> InputMask) {
11706 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11707 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11708 !X86::mayFoldLoad(Input, Subtarget)))
11709 return;
11710 if (isNoopShuffleMask(InputMask))
11711 return;
11712 assert(isBroadcastShuffleMask(InputMask) &&
11713 "Expected to demand only the 0'th element.");
11714 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11715 for (auto I : enumerate(InputMask)) {
11716 int &InputMaskElt = I.value();
11717 if (InputMaskElt >= 0)
11718 InputMaskElt = I.index();
11719 }
11720 };
11721
11722 // Currently, we may need to produce one shuffle per input, and blend results.
11723 // It is possible that the shuffle for one of the inputs is already a no-op.
11724 // See if we can simplify non-no-op shuffles into broadcasts,
11725 // which we consider to be strictly better than an arbitrary shuffle.
11726 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11727 isNoopOrBroadcastShuffleMask(V2Mask)) {
11728 canonicalizeBroadcastableInput(V1, V1Mask);
11729 canonicalizeBroadcastableInput(V2, V2Mask);
11730 }
11731
11732 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11733 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11734 // the shuffle may be able to fold with a load or other benefit. However, when
11735 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11736 // pre-shuffle first is a better strategy.
11737 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11738 // If we don't have blends, see if we can create a cheap unpack.
11739 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11740 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11741 is128BitUnpackShuffleMask(V2Mask, DAG)))
11742 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11743 DL, VT, V1, V2, Mask, Subtarget, DAG))
11744 return PermUnpack;
11745
11746 // Only prefer immediate blends to unpack/rotate.
11747 if (SDValue BlendPerm =
11748 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11749 return BlendPerm;
11750
11751 // If either input vector provides only a single element which is repeated
11752 // multiple times, unpacking from both input vectors would generate worse
11753 // code. e.g. for
11754 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11755 // it is better to process t4 first to create a vector of t4[0], then unpack
11756 // that vector with t2.
11757 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11758 !isSingleElementRepeatedMask(V2Mask))
11759 if (SDValue UnpackPerm =
11760 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11761 return UnpackPerm;
11762
11763 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11764 DL, VT, V1, V2, Mask, Subtarget, DAG))
11765 return RotatePerm;
11766
11767 // Unpack/rotate failed - try again with variable blends.
11768 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11769 DAG))
11770 return BlendPerm;
11771
11772 if (VT.getScalarSizeInBits() >= 32)
11773 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11774 DL, VT, V1, V2, Mask, Subtarget, DAG))
11775 return PermUnpack;
11776 }
11777
11778 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11779 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11780 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11781 // than half the elements coming from each source.
11782 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11783 V1Mask.assign(NumElts, -1);
11784 V2Mask.assign(NumElts, -1);
11785 FinalMask.assign(NumElts, -1);
11786 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11787 for (int j = 0; j != NumEltsPerLane; ++j) {
11788 int M = Mask[i + j];
11789 if (M >= 0 && M < NumElts) {
11790 V1Mask[i + (j / 2)] = M;
11791 FinalMask[i + j] = i + (j / 2);
11792 } else if (M >= NumElts) {
11793 V2Mask[i + (j / 2)] = M - NumElts;
11794 FinalMask[i + j] = i + (j / 2) + NumElts;
11795 }
11796 }
11797 }
11798
11799 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11801 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11802}
11803
11804static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11805 const X86Subtarget &Subtarget,
11806 ArrayRef<int> Mask) {
11807 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11808 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11809
11810 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11811 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11812 int MaxSubElts = 64 / EltSizeInBits;
11813 unsigned RotateAmt, NumSubElts;
11814 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11815 MaxSubElts, NumSubElts, RotateAmt))
11816 return -1;
11817 unsigned NumElts = Mask.size();
11818 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11819 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11820 return RotateAmt;
11821}
11822
11823/// Lower shuffle using X86ISD::VROTLI rotations.
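/// For example, the v16i8 mask <1,0,3,2,5,4,...> is a rotate-left by 8 bits
/// of each 16-bit group, and the v16i8 mask <3,0,1,2,7,4,5,6,...> is a
/// rotate-left by 8 bits of each 32-bit group (VPROLD on AVX512).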
11824 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11825 ArrayRef<int> Mask,
11826 const X86Subtarget &Subtarget,
11827 SelectionDAG &DAG) {
11828 // Only XOP + AVX512 targets have bit rotation instructions.
11829 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11830 bool IsLegal =
11831 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11832 if (!IsLegal && Subtarget.hasSSE3())
11833 return SDValue();
11834
11835 MVT RotateVT;
11836 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11837 Subtarget, Mask);
11838 if (RotateAmt < 0)
11839 return SDValue();
11840
11841 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11842 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11843 // widen to vXi16 or more then the existing lowering will be better.
11844 if (!IsLegal) {
11845 if ((RotateAmt % 16) == 0)
11846 return SDValue();
11847 // TODO: Use getTargetVShiftByConstNode.
11848 unsigned ShlAmt = RotateAmt;
11849 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11850 V1 = DAG.getBitcast(RotateVT, V1);
11851 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11852 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11853 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11854 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11855 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11856 return DAG.getBitcast(VT, Rot);
11857 }
11858
11859 SDValue Rot =
11860 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11861 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11862 return DAG.getBitcast(VT, Rot);
11863}
11864
11865/// Try to match a vector shuffle as an element rotation.
11866///
11867 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11868 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11869 ArrayRef<int> Mask) {
11870 int NumElts = Mask.size();
11871
11872 // We need to detect various ways of spelling a rotation:
11873 // [11, 12, 13, 14, 15, 0, 1, 2]
11874 // [-1, 12, 13, 14, -1, -1, 1, -1]
11875 // [-1, -1, -1, -1, -1, -1, 1, 2]
11876 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11877 // [-1, 4, 5, 6, -1, -1, 9, -1]
11878 // [-1, 4, 5, 6, -1, -1, -1, -1]
11879 int Rotation = 0;
11880 SDValue Lo, Hi;
11881 for (int i = 0; i < NumElts; ++i) {
11882 int M = Mask[i];
11883 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11884 "Unexpected mask index.");
11885 if (M < 0)
11886 continue;
11887
11888 // Determine where a rotated vector would have started.
11889 int StartIdx = i - (M % NumElts);
11890 if (StartIdx == 0)
11891 // The identity rotation isn't interesting, stop.
11892 return -1;
11893
11894 // If we found the tail of a vector the rotation must be the missing
11895 // front. If we found the head of a vector, it must be how much of the
11896 // head.
11897 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11898
11899 if (Rotation == 0)
11900 Rotation = CandidateRotation;
11901 else if (Rotation != CandidateRotation)
11902 // The rotations don't match, so we can't match this mask.
11903 return -1;
11904
11905 // Compute which value this mask is pointing at.
11906 SDValue MaskV = M < NumElts ? V1 : V2;
11907
11908 // Compute which of the two target values this index should be assigned
11909 // to. This reflects whether the high elements are remaining or the low
11910 // elements are remaining.
11911 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11912
11913 // Either set up this value if we've not encountered it before, or check
11914 // that it remains consistent.
11915 if (!TargetV)
11916 TargetV = MaskV;
11917 else if (TargetV != MaskV)
11918 // This may be a rotation, but it pulls from the inputs in some
11919 // unsupported interleaving.
11920 return -1;
11921 }
11922
11923 // Check that we successfully analyzed the mask, and normalize the results.
11924 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11925 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11926 if (!Lo)
11927 Lo = Hi;
11928 else if (!Hi)
11929 Hi = Lo;
11930
11931 V1 = Lo;
11932 V2 = Hi;
11933
11934 return Rotation;
11935}
11936
11937/// Try to lower a vector shuffle as a byte rotation.
11938///
11939/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11940/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11941/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11942 /// try to generically lower a vector shuffle through such a pattern. It
11943/// does not check for the profitability of lowering either as PALIGNR or
11944/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11945/// This matches shuffle vectors that look like:
11946///
11947/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11948///
11949/// Essentially it concatenates V1 and V2, shifts right by some number of
11950/// elements, and takes the low elements as the result. Note that while this is
11951/// specified as a *right shift* because x86 is little-endian, it is a *left
11952/// rotate* of the vector lanes.
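/// For example, the v8i16 mask <3,4,5,6,7,8,9,10> is an element rotation of
/// 3, which scales to a PALIGNR byte immediate of 3 * 2 == 6.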
11953 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11954 ArrayRef<int> Mask) {
11955 // Don't accept any shuffles with zero elements.
11956 if (isAnyZero(Mask))
11957 return -1;
11958
11959 // PALIGNR works on 128-bit lanes.
11960 SmallVector<int, 16> RepeatedMask;
11961 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11962 return -1;
11963
11964 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11965 if (Rotation <= 0)
11966 return -1;
11967
11968 // PALIGNR rotates bytes, so we need to scale the
11969 // rotation based on how many bytes are in the vector lane.
11970 int NumElts = RepeatedMask.size();
11971 int Scale = 16 / NumElts;
11972 return Rotation * Scale;
11973}
11974
11975 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11976 SDValue V2, ArrayRef<int> Mask,
11977 const X86Subtarget &Subtarget,
11978 SelectionDAG &DAG) {
11979 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11980
11981 SDValue Lo = V1, Hi = V2;
11982 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11983 if (ByteRotation <= 0)
11984 return SDValue();
11985
11986 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11987 // PSLLDQ/PSRLDQ.
11988 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11989 Lo = DAG.getBitcast(ByteVT, Lo);
11990 Hi = DAG.getBitcast(ByteVT, Hi);
11991
11992 // SSSE3 targets can use the palignr instruction.
11993 if (Subtarget.hasSSSE3()) {
11994 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11995 "512-bit PALIGNR requires BWI instructions");
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11998 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11999 }
12000
12001 assert(VT.is128BitVector() &&
12002 "Rotate-based lowering only supports 128-bit lowering!");
12003 assert(Mask.size() <= 16 &&
12004 "Can shuffle at most 16 bytes in a 128-bit vector!");
12005 assert(ByteVT == MVT::v16i8 &&
12006 "SSE2 rotate lowering only needed for v16i8!");
12007
12008 // Default SSE2 implementation
12009 int LoByteShift = 16 - ByteRotation;
12010 int HiByteShift = ByteRotation;
12011
12012 SDValue LoShift =
12013 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12014 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12015 SDValue HiShift =
12016 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12017 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12018 return DAG.getBitcast(VT,
12019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12020}
12021
12022/// Try to lower a vector shuffle as a dword/qword rotation.
12023///
12024 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12025 /// rotation of the concatenation of two vectors; this routine will
12026 /// try to generically lower a vector shuffle through such a pattern.
12027///
12028/// Essentially it concatenates V1 and V2, shifts right by some number of
12029/// elements, and takes the low elements as the result. Note that while this is
12030/// specified as a *right shift* because x86 is little-endian, it is a *left
12031/// rotate* of the vector lanes.
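/// For example, the v8i32 shuffle <3,4,5,6,7,8,9,10> lowers to VALIGND with
/// an immediate of 3, and <2,3,4,5,6,7,z,z> (z == zeroable) lowers to a
/// VALIGND against the zero vector.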
12032 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12033 SDValue V2, ArrayRef<int> Mask,
12034 const APInt &Zeroable,
12035 const X86Subtarget &Subtarget,
12036 SelectionDAG &DAG) {
12037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12038 "Only 32-bit and 64-bit elements are supported!");
12039
12040 // 128/256-bit vectors are only supported with VLX.
12041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12042 && "VLX required for 128/256-bit vectors");
12043
12044 SDValue Lo = V1, Hi = V2;
12045 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12046 if (0 < Rotation)
12047 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12048 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12049
12050 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12051 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12052 // TODO: We can probably make this more aggressive and use shift-pairs like
12053 // lowerShuffleAsByteShiftMask.
12054 unsigned NumElts = Mask.size();
12055 unsigned ZeroLo = Zeroable.countr_one();
12056 unsigned ZeroHi = Zeroable.countl_one();
12057 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12058 if (!ZeroLo && !ZeroHi)
12059 return SDValue();
12060
12061 if (ZeroLo) {
12062 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12063 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12064 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12065 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12066 getZeroVector(VT, Subtarget, DAG, DL),
12067 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12068 }
12069
12070 if (ZeroHi) {
12071 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12072 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12073 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12074 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12075 getZeroVector(VT, Subtarget, DAG, DL), Src,
12076 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12077 }
12078
12079 return SDValue();
12080}
12081
12082/// Try to lower a vector shuffle as a byte shift sequence.
12083 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12084 SDValue V2, ArrayRef<int> Mask,
12085 const APInt &Zeroable,
12086 const X86Subtarget &Subtarget,
12087 SelectionDAG &DAG) {
12088 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12089 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12090
12091 // We need a shuffle that has zeros at one/both ends and a sequential
12092 // shuffle from one source within.
12093 unsigned ZeroLo = Zeroable.countr_one();
12094 unsigned ZeroHi = Zeroable.countl_one();
12095 if (!ZeroLo && !ZeroHi)
12096 return SDValue();
12097
12098 unsigned NumElts = Mask.size();
12099 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12100 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12101 return SDValue();
12102
12103 unsigned Scale = VT.getScalarSizeInBits() / 8;
12104 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12105 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12106 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12107 return SDValue();
12108
12109 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12110 Res = DAG.getBitcast(MVT::v16i8, Res);
12111
12112 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12113 // inner sequential set of elements, possibly offset:
12114 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12115 // 01234567 --> 4567zzzz --> zzzzz456
12116 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12117 if (ZeroLo == 0) {
12118 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12119 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12120 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12123 } else if (ZeroHi == 0) {
12124 unsigned Shift = Mask[ZeroLo] % NumElts;
12125 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12129 } else if (!Subtarget.hasSSSE3()) {
12130 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12131 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12132 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12133 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12134 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Shift += Mask[ZeroLo] % NumElts;
12137 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12138 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12139 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12140 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12141 } else
12142 return SDValue();
12143
12144 return DAG.getBitcast(VT, Res);
12145}
12146
12147/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12148///
12149/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12150/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12151/// matches elements from one of the input vectors shuffled to the left or
12152/// right with zeroable elements 'shifted in'. It handles both the strictly
12153/// bit-wise element shifts and the byte shift across an entire 128-bit double
12154/// quad word lane.
12155///
12156/// PSHL : (little-endian) left bit shift.
12157/// [ zz, 0, zz, 2 ]
12158/// [ -1, 4, zz, -1 ]
12159/// PSRL : (little-endian) right bit shift.
12160/// [ 1, zz, 3, zz]
12161/// [ -1, -1, 7, zz]
12162/// PSLLDQ : (little-endian) left byte shift
12163/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12164/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12165/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12166/// PSRLDQ : (little-endian) right byte shift
12167/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12168/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12169/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12170static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12171 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12172 int MaskOffset, const APInt &Zeroable,
12173 const X86Subtarget &Subtarget) {
12174 int Size = Mask.size();
12175 unsigned SizeInBits = Size * ScalarSizeInBits;
12176
12177 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12178 for (int i = 0; i < Size; i += Scale)
12179 for (int j = 0; j < Shift; ++j)
12180 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12181 return false;
12182
12183 return true;
12184 };
12185
12186 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12187 for (int i = 0; i != Size; i += Scale) {
12188 unsigned Pos = Left ? i + Shift : i;
12189 unsigned Low = Left ? i : i + Shift;
12190 unsigned Len = Scale - Shift;
12191 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12192 return -1;
12193 }
12194
12195 int ShiftEltBits = ScalarSizeInBits * Scale;
12196 bool ByteShift = ShiftEltBits > 64;
12197 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12198 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12199 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12200
12201 // Normalize the scale for byte shifts to still produce an i64 element
12202 // type.
12203 Scale = ByteShift ? Scale / 2 : Scale;
12204
12205 // We need to round trip through the appropriate type for the shift.
12206 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12207 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12208 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12209 return (int)ShiftAmt;
12210 };
12211
12212 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12213 // keep doubling the size of the integer elements up to that. We can
12214 // then shift the elements of the integer vector by whole multiples of
12215 // their width within the elements of the larger integer vector. Test each
12216 // multiple to see if we can find a match with the moved element indices
12217 // and that the shifted in elements are all zeroable.
12218 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12219 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12220 for (int Shift = 1; Shift != Scale; ++Shift)
12221 for (bool Left : {true, false})
12222 if (CheckZeros(Shift, Scale, Left)) {
12223 int ShiftAmt = MatchShift(Shift, Scale, Left);
12224 if (0 < ShiftAmt)
12225 return ShiftAmt;
12226 }
12227
12228 // no match
12229 return -1;
12230}
12231
12232 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12233 SDValue V2, ArrayRef<int> Mask,
12234 const APInt &Zeroable,
12235 const X86Subtarget &Subtarget,
12236 SelectionDAG &DAG, bool BitwiseOnly) {
12237 int Size = Mask.size();
12238 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12239
12240 MVT ShiftVT;
12241 SDValue V = V1;
12242 unsigned Opcode;
12243
12244 // Try to match shuffle against V1 shift.
12245 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12246 Mask, 0, Zeroable, Subtarget);
12247
12248 // If V1 failed, try to match shuffle against V2 shift.
12249 if (ShiftAmt < 0) {
12250 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12251 Mask, Size, Zeroable, Subtarget);
12252 V = V2;
12253 }
12254
12255 if (ShiftAmt < 0)
12256 return SDValue();
12257
12258 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12259 return SDValue();
12260
12261 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12262 "Illegal integer vector type");
12263 V = DAG.getBitcast(ShiftVT, V);
12264 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12265 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12266 return DAG.getBitcast(VT, V);
12267}
12268
12269// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12270// Remainder of lower half result is zero and upper half is all undef.
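// For example, the v8i16 mask <1,2,3,z,u,u,u,u> (z == zeroable) extracts
// Len == 3 elements starting at Idx == 1, giving BitLen == 48 and
// BitIdx == 16.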
12271static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12272 ArrayRef<int> Mask, uint64_t &BitLen,
12273 uint64_t &BitIdx, const APInt &Zeroable) {
12274 int Size = Mask.size();
12275 int HalfSize = Size / 2;
12276 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12277 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12278
12279 // Upper half must be undefined.
12280 if (!isUndefUpperHalf(Mask))
12281 return false;
12282
12283 // Determine the extraction length from the part of the
12284 // lower half that isn't zeroable.
12285 int Len = HalfSize;
12286 for (; Len > 0; --Len)
12287 if (!Zeroable[Len - 1])
12288 break;
12289 assert(Len > 0 && "Zeroable shuffle mask");
12290
12291 // Attempt to match first Len sequential elements from the lower half.
12292 SDValue Src;
12293 int Idx = -1;
12294 for (int i = 0; i != Len; ++i) {
12295 int M = Mask[i];
12296 if (M == SM_SentinelUndef)
12297 continue;
12298 SDValue &V = (M < Size ? V1 : V2);
12299 M = M % Size;
12300
12301 // The extracted elements must start at a valid index and all mask
12302 // elements must be in the lower half.
12303 if (i > M || M >= HalfSize)
12304 return false;
12305
12306 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12307 Src = V;
12308 Idx = M - i;
12309 continue;
12310 }
12311 return false;
12312 }
12313
12314 if (!Src || Idx < 0)
12315 return false;
12316
12317 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12318 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12319 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12320 V1 = Src;
12321 return true;
12322}
12323
12324// INSERTQ: Extract lowest Len elements from lower half of second source and
12325// insert over first source, starting at Idx.
12326// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12327static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12328 ArrayRef<int> Mask, uint64_t &BitLen,
12329 uint64_t &BitIdx) {
12330 int Size = Mask.size();
12331 int HalfSize = Size / 2;
12332 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12333
12334 // Upper half must be undefined.
12335 if (!isUndefUpperHalf(Mask))
12336 return false;
12337
12338 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12339 SDValue Base;
12340
12341 // Attempt to match first source from mask before insertion point.
12342 if (isUndefInRange(Mask, 0, Idx)) {
12343 /* EMPTY */
12344 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12345 Base = V1;
12346 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12347 Base = V2;
12348 } else {
12349 continue;
12350 }
12351
12352 // Extend the extraction length looking to match both the insertion of
12353 // the second source and the remaining elements of the first.
12354 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12355 SDValue Insert;
12356 int Len = Hi - Idx;
12357
12358 // Match insertion.
12359 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12360 Insert = V1;
12361 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12362 Insert = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 // Match the remaining elements of the lower half.
12368 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12369 /* EMPTY */
12370 } else if ((!Base || (Base == V1)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12372 Base = V1;
12373 } else if ((!Base || (Base == V2)) &&
12374 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12375 Size + Hi)) {
12376 Base = V2;
12377 } else {
12378 continue;
12379 }
12380
12381 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12382 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12383 V1 = Base;
12384 V2 = Insert;
12385 return true;
12386 }
12387 }
12388
12389 return false;
12390}
12391
12392/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12393 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12394 SDValue V2, ArrayRef<int> Mask,
12395 const APInt &Zeroable, SelectionDAG &DAG) {
12396 uint64_t BitLen, BitIdx;
12397 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12398 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12399 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12400 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12401
12402 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12403 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12404 V2 ? V2 : DAG.getUNDEF(VT),
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 return SDValue();
12409}
12410
12411/// Lower a vector shuffle as an any/signed/zero extension.
12412///
12413/// Given a specific number of elements, element bit width, and extension
12414 /// stride, produce an extension based on the available
12415 /// features of the subtarget. The extended elements are consecutive and
12416 /// can start from an offsetted element index in the input; to
12417 /// avoid excess shuffling the offset must either be in the bottom lane
12418/// or at the start of a higher lane. All extended elements must be from
12419/// the same lane.
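/// For example, with Scale == 4 and ISD::ZERO_EXTEND, a v16i8 input is
/// extended via a v4i32 ZERO_EXTEND_VECTOR_INREG (PMOVZXBD with SSE4.1);
/// without SSE4.1 the same extension is built from two unpacks with zero.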
12420 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12421 int Scale, int Offset,
12422 unsigned ExtOpc, SDValue InputV,
12423 ArrayRef<int> Mask,
12424 const X86Subtarget &Subtarget,
12425 SelectionDAG &DAG) {
12426 assert(Scale > 1 && "Need a scale to extend.");
12427 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12428 int EltBits = VT.getScalarSizeInBits();
12429 int NumElements = VT.getVectorNumElements();
12430 int NumEltsPerLane = 128 / EltBits;
12431 int OffsetLane = Offset / NumEltsPerLane;
12432 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12433 "Only 8, 16, and 32 bit elements can be extended.");
12434 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12435 assert(0 <= Offset && "Extension offset must be positive.");
12436 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12437 "Extension offset must be in the first lane or start an upper lane.");
12438
12439 // Check that an index is in same lane as the base offset.
12440 auto SafeOffset = [&](int Idx) {
12441 return OffsetLane == (Idx / NumEltsPerLane);
12442 };
12443
12444 // Shift along an input so that the offset base moves to the first element.
12445 auto ShuffleOffset = [&](SDValue V) {
12446 if (!Offset)
12447 return V;
12448
12449 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12450 for (int i = 0; i * Scale < NumElements; ++i) {
12451 int SrcIdx = i + Offset;
12452 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12453 }
12454 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12455 };
12456
12457 // Found a valid a/zext mask! Try various lowering strategies based on the
12458 // input type and available ISA extensions.
12459 if (Subtarget.hasSSE41()) {
12460 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12461 // PUNPCK will catch this in a later shuffle match.
12462 if (Offset && Scale == 2 && VT.is128BitVector())
12463 return SDValue();
12464 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12465 NumElements / Scale);
12466 InputV = DAG.getBitcast(VT, InputV);
12467 InputV = ShuffleOffset(InputV);
12468 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12469 return DAG.getBitcast(VT, InputV);
12470 }
12471
12472 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12473 InputV = DAG.getBitcast(VT, InputV);
12474 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12475
12476 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12477 if (ExtOpc == ISD::SIGN_EXTEND)
12478 return SDValue();
12479
12480 // For any extends we can cheat for larger element sizes and use shuffle
12481 // instructions that can fold with a load and/or copy.
12482 if (AnyExt && EltBits == 32) {
12483 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12484 -1};
12485 return DAG.getBitcast(
12486 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12487 DAG.getBitcast(MVT::v4i32, InputV),
12488 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12489 }
12490 if (AnyExt && EltBits == 16 && Scale > 2) {
12491 int PSHUFDMask[4] = {Offset / 2, -1,
12492 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12493 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12494 DAG.getBitcast(MVT::v4i32, InputV),
12495 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12496 int PSHUFWMask[4] = {1, -1, -1, -1};
12497 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12498 return DAG.getBitcast(
12499 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12500 DAG.getBitcast(MVT::v8i16, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12502 }
12503
12504 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12505 // to 64-bits.
12506 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12507 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12508 assert(VT.is128BitVector() && "Unexpected vector width!");
12509
12510 int LoIdx = Offset * EltBits;
12511 SDValue Lo = DAG.getBitcast(
12512 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12513 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12514 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12515
12516 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12517 return DAG.getBitcast(VT, Lo);
12518
12519 int HiIdx = (Offset + 1) * EltBits;
12520 SDValue Hi = DAG.getBitcast(
12521 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12522 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12523 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12524 return DAG.getBitcast(VT,
12525 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12526 }
12527
12528 // If this would require more than 2 unpack instructions to expand, use
12529 // pshufb when available. We can only use more than 2 unpack instructions
12530 // when zero extending i8 elements which also makes it easier to use pshufb.
12531 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12532 assert(NumElements == 16 && "Unexpected byte vector width!");
12533 SDValue PSHUFBMask[16];
12534 for (int i = 0; i < 16; ++i) {
12535 int Idx = Offset + (i / Scale);
12536 if ((i % Scale == 0 && SafeOffset(Idx))) {
12537 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12538 continue;
12539 }
12540 PSHUFBMask[i] =
12541 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12542 }
12543 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12544 return DAG.getBitcast(
12545 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12546 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12547 }
12548
12549 // If we are extending from an offset, ensure we start on a boundary that
12550 // we can unpack from.
12551 int AlignToUnpack = Offset % (NumElements / Scale);
12552 if (AlignToUnpack) {
12553 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12554 for (int i = AlignToUnpack; i < NumElements; ++i)
12555 ShMask[i - AlignToUnpack] = i;
12556 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12557 Offset -= AlignToUnpack;
12558 }
12559
12560 // Otherwise emit a sequence of unpacks.
12561 do {
12562 unsigned UnpackLoHi = X86ISD::UNPCKL;
12563 if (Offset >= (NumElements / 2)) {
12564 UnpackLoHi = X86ISD::UNPCKH;
12565 Offset -= (NumElements / 2);
12566 }
12567
12568 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12569 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12570 : getZeroVector(InputVT, Subtarget, DAG, DL);
12571 InputV = DAG.getBitcast(InputVT, InputV);
12572 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12573 Scale /= 2;
12574 EltBits *= 2;
12575 NumElements /= 2;
12576 } while (Scale > 1);
12577 return DAG.getBitcast(VT, InputV);
12578}
12579
12580/// Try to lower a vector shuffle as a zero extension on any microarch.
12581///
12582/// This routine will try to do everything in its power to cleverly lower
12583/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12584/// check for the profitability of this lowering, it tries to aggressively
12585/// match this pattern. It will use all of the micro-architectural details it
12586/// can to emit an efficient lowering. It handles both blends with all-zero
12587/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12588/// masking out later).
12589///
12590/// The reason we have dedicated lowering for zext-style shuffles is that they
12591/// are both incredibly common and often quite performance sensitive.
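/// For example, the v16i8 shuffle <0,z,z,z,1,z,z,z,2,z,z,z,3,z,z,z> (z ==
/// zeroable) matches a zero extension of the low four bytes of V1 with
/// Scale == 4.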
12592 static SDValue lowerShuffleAsZeroOrAnyExtend(
12593 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12594 const APInt &Zeroable, const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 int Bits = VT.getSizeInBits();
12597 int NumLanes = Bits / 128;
12598 int NumElements = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElements / NumLanes;
12600 assert(VT.getScalarSizeInBits() <= 32 &&
12601 "Exceeds 32-bit integer zero extension limit");
12602 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12603
12604 // Define a helper function to check a particular ext-scale and lower to it if
12605 // valid.
12606 auto Lower = [&](int Scale) -> SDValue {
12607 SDValue InputV;
12608 bool AnyExt = true;
12609 int Offset = 0;
12610 int Matches = 0;
12611 for (int i = 0; i < NumElements; ++i) {
12612 int M = Mask[i];
12613 if (M < 0)
12614 continue; // Valid anywhere but doesn't tell us anything.
12615 if (i % Scale != 0) {
12616 // Each of the extended elements need to be zeroable.
12617 if (!Zeroable[i])
12618 return SDValue();
12619
12620 // We no longer are in the anyext case.
12621 AnyExt = false;
12622 continue;
12623 }
12624
12625 // Each of the base elements needs to be consecutive indices into the
12626 // same input vector.
12627 SDValue V = M < NumElements ? V1 : V2;
12628 M = M % NumElements;
12629 if (!InputV) {
12630 InputV = V;
12631 Offset = M - (i / Scale);
12632 } else if (InputV != V)
12633 return SDValue(); // Flip-flopping inputs.
12634
12635 // Offset must start in the lowest 128-bit lane or at the start of an
12636 // upper lane.
12637 // FIXME: Is it ever worth allowing a negative base offset?
12638 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12639 (Offset % NumEltsPerLane) == 0))
12640 return SDValue();
12641
12642 // If we are offsetting, all referenced entries must come from the same
12643 // lane.
12644 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12645 return SDValue();
12646
12647 if ((M % NumElements) != (Offset + (i / Scale)))
12648 return SDValue(); // Non-consecutive strided elements.
12649 Matches++;
12650 }
12651
12652 // If we fail to find an input, we have a zero-shuffle which should always
12653 // have already been handled.
12654 // FIXME: Maybe handle this here in case during blending we end up with one?
12655 if (!InputV)
12656 return SDValue();
12657
12658 // If we are offsetting, don't extend if we only match a single input, we
12659 // can always do better by using a basic PSHUF or PUNPCK.
12660 if (Offset != 0 && Matches < 2)
12661 return SDValue();
12662
12663 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12664 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12665 InputV, Mask, Subtarget, DAG);
12666 };
12667
12668 // The widest scale possible for extending is to a 64-bit integer.
12669 assert(Bits % 64 == 0 &&
12670 "The number of bits in a vector must be divisible by 64 on x86!");
12671 int NumExtElements = Bits / 64;
12672
12673 // Each iteration, try extending the elements half as much, but into twice as
12674 // many elements.
12675 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12676 assert(NumElements % NumExtElements == 0 &&
12677 "The input vector size must be divisible by the extended size.");
12678 if (SDValue V = Lower(NumElements / NumExtElements))
12679 return V;
12680 }
12681
12682 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12683 if (Bits != 128)
12684 return SDValue();
12685
12686 // Returns one of the source operands if the shuffle can be reduced to a
12687 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12688 auto CanZExtLowHalf = [&]() {
12689 for (int i = NumElements / 2; i != NumElements; ++i)
12690 if (!Zeroable[i])
12691 return SDValue();
12692 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12693 return V1;
12694 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12695 return V2;
12696 return SDValue();
12697 };
12698
12699 if (SDValue V = CanZExtLowHalf()) {
12700 V = DAG.getBitcast(MVT::v2i64, V);
12701 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12702 return DAG.getBitcast(VT, V);
12703 }
12704
12705 // No viable ext lowering found.
12706 return SDValue();
12707}
12708
12709/// Try to get a scalar value for a specific element of a vector.
12710///
12711/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12712 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12713 SelectionDAG &DAG) {
12714 MVT VT = V.getSimpleValueType();
12715 MVT EltVT = VT.getVectorElementType();
12716 V = peekThroughBitcasts(V);
12717
12718 // If the bitcasts shift the element size, we can't extract an equivalent
12719 // element from it.
12720 MVT NewVT = V.getSimpleValueType();
12721 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12722 return SDValue();
12723
12724 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12725 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12726 // Ensure the scalar operand is the same size as the destination.
12727 // FIXME: Add support for scalar truncation where possible.
12728 SDValue S = V.getOperand(Idx);
12729 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12730 return DAG.getBitcast(EltVT, S);
12731 }
12732
12733 return SDValue();
12734}
12735
12736/// Helper to test for a load that can be folded with x86 shuffles.
12737///
12738/// This is particularly important because the set of instructions varies
12739/// significantly based on whether the operand is a load or not.
12740 static bool isShuffleFoldableLoad(SDValue V) {
12741 return V.hasOneUse() &&
12742 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12743 }
12744
12745template<typename T>
12746static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12747 T EltVT = VT.getScalarType();
12748 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12749 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12750}
12751
12752/// Try to lower insertion of a single element into a zero vector.
12753///
12754/// This is a common pattern that we have especially efficient patterns to lower
12755/// across all subtarget feature sets.
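/// For example, the v4f32 shuffle <4,1,2,3> inserts V2[0] over the low
/// element of V1 and can lower to MOVSS; if V1 is zeroable the insertion
/// lowers to VZEXT_MOVL instead.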
12756 static SDValue lowerShuffleAsElementInsertion(
12757 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12758 const APInt &Zeroable, const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 MVT ExtVT = VT;
12761 MVT EltVT = VT.getVectorElementType();
12762 unsigned NumElts = VT.getVectorNumElements();
12763 unsigned EltBits = VT.getScalarSizeInBits();
12764
12765 if (isSoftF16(EltVT, Subtarget))
12766 return SDValue();
12767
12768 int V2Index =
12769 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12770 Mask.begin();
12771 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12772 bool IsV1Zeroable = true;
12773 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12774 if (i != V2Index && !Zeroable[i]) {
12775 IsV1Zeroable = false;
12776 break;
12777 }
12778
12779 // Bail if a non-zero V1 isn't used in place.
12780 if (!IsV1Zeroable) {
12781 SmallVector<int, 8> V1Mask(Mask);
12782 V1Mask[V2Index] = -1;
12783 if (!isNoopShuffleMask(V1Mask))
12784 return SDValue();
12785 }
12786
12787 // Check for a single input from a SCALAR_TO_VECTOR node.
12788 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12789 // all the smarts here sunk into that routine. However, the current
12790 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12791 // vector shuffle lowering is dead.
12792 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12793 DAG);
12794 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12795 // We need to zext the scalar if it is smaller than an i32.
12796 V2S = DAG.getBitcast(EltVT, V2S);
12797 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12798 // Using zext to expand a narrow element won't work for non-zero
12799 // insertions. But we can use a masked constant vector if we're
12800 // inserting V2 into the bottom of V1.
12801 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12802 return SDValue();
12803
12804 // Zero-extend directly to i32.
12805 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12806 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12807
12808 // If we're inserting into a constant, mask off the inserted index
12809 // and OR with the zero-extended scalar.
12810 if (!IsV1Zeroable) {
12811 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12812 Bits[V2Index] = APInt::getZero(EltBits);
12813 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12814 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12815 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12816 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12817 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12818 }
12819 }
12820 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12821 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12822 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12823 // Either not inserting from the low element of the input or the input
12824 // element size is too small to use VZEXT_MOVL to clear the high bits.
12825 return SDValue();
12826 }
12827
12828 if (!IsV1Zeroable) {
12829 // If V1 can't be treated as a zero vector we have fewer options to lower
12830 // this. We can't support integer vectors or non-zero targets cheaply.
12831 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12832 if (!VT.isFloatingPoint() || V2Index != 0)
12833 return SDValue();
12834 if (!VT.is128BitVector())
12835 return SDValue();
12836
12837 // Otherwise, use MOVSD, MOVSS or MOVSH.
12838 unsigned MovOpc = 0;
12839 if (EltVT == MVT::f16)
12840 MovOpc = X86ISD::MOVSH;
12841 else if (EltVT == MVT::f32)
12842 MovOpc = X86ISD::MOVSS;
12843 else if (EltVT == MVT::f64)
12844 MovOpc = X86ISD::MOVSD;
12845 else
12846 llvm_unreachable("Unsupported floating point element type to handle!");
12847 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12848 }
12849
12850 // This lowering only works for the low element with floating point vectors.
12851 if (VT.isFloatingPoint() && V2Index != 0)
12852 return SDValue();
12853
12854 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12855 if (ExtVT != VT)
12856 V2 = DAG.getBitcast(VT, V2);
12857
12858 if (V2Index != 0) {
12859 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12860 // the desired position. Otherwise it is more efficient to do a vector
12861 // shift left. We know that we can do a vector shift left because all
12862 // the inputs are zero.
12863 if (VT.isFloatingPoint() || NumElts <= 4) {
12864 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12865 V2Shuffle[V2Index] = 0;
12866 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12867 } else {
12868 V2 = DAG.getBitcast(MVT::v16i8, V2);
12869 V2 = DAG.getNode(
12870 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12871 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12872 V2 = DAG.getBitcast(VT, V2);
12873 }
12874 }
12875 return V2;
12876}
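// [Editorial illustration -- not part of the original source.] When the
// inserted element must land in lane V2Index and every other lane is zero,
// the code above either shuffles it into place (<= 4 lanes or FP types) or
// performs a whole-register byte shift left (X86ISD::VSHLDQ). This
// hypothetical stand-alone helper mirrors the shift-count expression
// `V2Index * EltBits / 8` used for the target constant.
constexpr unsigned vshldqByteCount(unsigned V2Index, unsigned EltBits) {
  return V2Index * EltBits / 8;
}
// e.g. placing a 16-bit element into lane 5 of a v8i16 needs a 10-byte shift.
static_assert(vshldqByteCount(5, 16) == 10, "lane 5 of v8i16");
static_assert(vshldqByteCount(3, 8) == 3, "lane 3 of v16i8");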
12877
12878/// Try to lower broadcast of a single - truncated - integer element,
12879/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12880///
12881/// This assumes we have AVX2.
12882 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12883 int BroadcastIdx,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(Subtarget.hasAVX2() &&
12887 "We can only lower integer broadcasts with AVX2!");
12888
12889 MVT EltVT = VT.getVectorElementType();
12890 MVT V0VT = V0.getSimpleValueType();
12891
12892 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12893 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12894
12895 MVT V0EltVT = V0VT.getVectorElementType();
12896 if (!V0EltVT.isInteger())
12897 return SDValue();
12898
12899 const unsigned EltSize = EltVT.getSizeInBits();
12900 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12901
12902 // This is only a truncation if the original element type is larger.
12903 if (V0EltSize <= EltSize)
12904 return SDValue();
12905
12906 assert(((V0EltSize % EltSize) == 0) &&
12907 "Scalar type sizes must all be powers of 2 on x86!");
12908
12909 const unsigned V0Opc = V0.getOpcode();
12910 const unsigned Scale = V0EltSize / EltSize;
12911 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12912
12913 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12914 V0Opc != ISD::BUILD_VECTOR)
12915 return SDValue();
12916
12917 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12918
12919 // If we're extracting non-least-significant bits, shift so we can truncate.
12920 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12921 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12922 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12923 if (const int OffsetIdx = BroadcastIdx % Scale)
12924 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12925 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12926
12927 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12928 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12929}
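// [Editorial illustration -- not part of the original source.] The index and
// shift arithmetic above in one place: for a broadcast of narrow element
// BroadcastIdx taken from a source with wider elements, the wide element that
// holds it is BroadcastIdx / Scale and the bits to drop before truncating are
// (BroadcastIdx % Scale) * EltSize. Stand-alone sketch, hypothetical names,
// no LLVM types.
struct TruncBroadcastSrc {
  unsigned WideElt;   // which wide element holds the value
  unsigned ShiftBits; // SRL amount needed before the TRUNCATE
};
constexpr TruncBroadcastSrc truncBroadcastSource(unsigned BroadcastIdx,
                                                 unsigned EltSize,
                                                 unsigned V0EltSize) {
  return {BroadcastIdx / (V0EltSize / EltSize),
          (BroadcastIdx % (V0EltSize / EltSize)) * EltSize};
}
// Broadcasting i8 element 5 out of a build_vector of i32s: it lives in the
// second i32 (index 1) and needs a shift right by 8 bits first.
static_assert(truncBroadcastSource(5, 8, 32).WideElt == 1, "");
static_assert(truncBroadcastSource(5, 8, 32).ShiftBits == 8, "");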
12930
12931/// Test whether this can be lowered with a single SHUFPS instruction.
12932///
12933/// This is used to disable more specialized lowerings when the shufps lowering
12934/// will happen to be efficient.
12935 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12936 // This routine only handles 128-bit shufps.
12937 assert(Mask.size() == 4 && "Unsupported mask size!");
12938 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12939 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12940 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12941 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12942
12943 // To lower with a single SHUFPS we need to have the low half and high half
12944 // each requiring a single input.
12945 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12946 return false;
12947 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12948 return false;
12949
12950 return true;
12951}
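// [Editorial illustration -- not part of the original source.] A stand-alone
// restatement of the check above on plain ints: SHUFPS takes its low two
// result lanes from one register and its high two from another, so each half
// of the mask may reference only one input. Helper names are hypothetical.
constexpr bool halfUsesSingleInput(int M0, int M1) {
  return M0 < 0 || M1 < 0 || (M0 < 4) == (M1 < 4);
}
constexpr bool singleSHUFPS(int M0, int M1, int M2, int M3) {
  return halfUsesSingleInput(M0, M1) && halfUsesSingleInput(M2, M3);
}
static_assert(singleSHUFPS(0, 1, 6, 7), "low half from V1, high half from V2");
static_assert(!singleSHUFPS(0, 4, 1, 5), "each half mixes V1 and V2");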
12952
12953/// Test whether the specified input (0 or 1) is in-place blended by the
12954/// given mask.
12955///
12956/// This returns true if the elements from a particular input are already in the
12957/// slot required by the given mask and require no permutation.
12958 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12963 return false;
12964
12965 return true;
12966}
12967
12968/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12969/// the given mask.
12970///
12971 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12972 int BroadcastableElement = 0) {
12973 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12974 int Size = Mask.size();
12975 for (int i = 0; i < Size; ++i)
12976 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12977 Mask[i] % Size != BroadcastableElement)
12978 return false;
12979 return true;
12980}
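// [Editorial illustration -- not part of the original source.] The two
// predicates above differ only in which source lane every selected element
// must come from: its own slot (in place) versus one fixed broadcastable
// slot. A 4-element sketch of the in-place test, hypothetical name:
constexpr bool inPlace4(int Input, int M0, int M1, int M2, int M3) {
  const int M[4] = {M0, M1, M2, M3};
  for (int i = 0; i < 4; ++i)
    if (M[i] >= 0 && M[i] / 4 == Input && M[i] % 4 != i)
      return false; // element from `Input` is not already in its own slot
  return true;
}
// Mask <0, 5, 2, 7>: input 1's elements 1 and 3 already sit in slots 1 and 3.
static_assert(inPlace4(1, 0, 5, 2, 7), "input 1 is in place");
// Mask <4, 0, 4, 4>: every input-1 reference reads element 0, so input 1 is
// broadcastable from element 0 but not in place.
static_assert(!inPlace4(1, 4, 0, 4, 4), "input 1 is only broadcastable");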
12981
12982/// If we are extracting two 128-bit halves of a vector and shuffling the
12983/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12984/// multi-shuffle lowering.
12985 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12986 SDValue N1, ArrayRef<int> Mask,
12987 SelectionDAG &DAG) {
12988 MVT VT = N0.getSimpleValueType();
12989 assert((VT.is128BitVector() &&
12990 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12991 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12992
12993 // Check that both sources are extracts of the same source vector.
12994 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12995 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12996 N0.getOperand(0) != N1.getOperand(0) ||
12997 !N0.hasOneUse() || !N1.hasOneUse())
12998 return SDValue();
12999
13000 SDValue WideVec = N0.getOperand(0);
13001 MVT WideVT = WideVec.getSimpleValueType();
13002 if (!WideVT.is256BitVector())
13003 return SDValue();
13004
13005 // Match extracts of each half of the wide source vector. Commute the shuffle
13006 // if the extract of the low half is N1.
13007 unsigned NumElts = VT.getVectorNumElements();
13008 SmallVector<int, 4> NewMask(Mask);
13009 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13010 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13011 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13012 ShuffleVectorSDNode::commuteMask(NewMask);
13013 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13014 return SDValue();
13015
13016 // Final bailout: if the mask is simple, we are better off using an extract
13017 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13018 // because that avoids a constant load from memory.
13019 if (NumElts == 4 &&
13020 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13021 return SDValue();
13022
13023 // Extend the shuffle mask with undef elements.
13024 NewMask.append(NumElts, -1);
13025
13026 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13027 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13028 NewMask);
13029 // This is free: ymm -> xmm.
13030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13031 DAG.getVectorIdxConstant(0, DL));
13032}
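// [Editorial illustration -- not part of the original source.] The narrow
// two-input mask already indexes 0..2*NumElts-1 across the low and high
// halves of the wide source, so the wide single-input mask is just the old
// mask padded with undef (-1) lanes, which is what NewMask.append(NumElts, -1)
// above does. Hypothetical stand-alone helper on plain ints:
inline void widenHalfShuffleMask(const int *NarrowMask, unsigned NarrowElts,
                                 int *WideMask) {
  for (unsigned i = 0; i != NarrowElts; ++i)
    WideMask[i] = NarrowMask[i]; // indices already span both halves
  for (unsigned i = NarrowElts; i != 2 * NarrowElts; ++i)
    WideMask[i] = -1;            // upper half of the wide result is undef
}
// e.g. shuffle (extract X, 0), (extract X, 4), <1, 6, 2, 5>
//        --> extract (shuffle X, undef, <1, 6, 2, 5, -1, -1, -1, -1>), 0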
13033
13034/// Try to lower broadcast of a single element.
13035///
13036/// For convenience, this code also bundles all of the subtarget feature set
13037/// filtering. While a little annoying to re-dispatch on type here, there isn't
13038/// a convenient way to factor it out.
13039 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13040 SDValue V2, ArrayRef<int> Mask,
13041 const X86Subtarget &Subtarget,
13042 SelectionDAG &DAG) {
13043 MVT EltVT = VT.getVectorElementType();
13044 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13045 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13046 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13047 return SDValue();
13048
13049 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13050 // we can only broadcast from a register with AVX2.
13051 unsigned NumEltBits = VT.getScalarSizeInBits();
13052 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13053 ? X86ISD::MOVDDUP
13054 : X86ISD::VBROADCAST;
13055 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13056
13057 // Check that the mask is a broadcast.
13058 int BroadcastIdx = getSplatIndex(Mask);
13059 if (BroadcastIdx < 0) {
13060 // Check for hidden broadcast.
13061 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13062 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13063 return SDValue();
13064 BroadcastIdx = 0;
13065 }
13066 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13067 "a sorted mask where the broadcast "
13068 "comes from V1.");
13069 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13070
13071 // Go up the chain of (vector) values to find a scalar load that we can
13072 // combine with the broadcast.
13073 // TODO: Combine this logic with findEltLoadSrc() used by
13074 // EltsFromConsecutiveLoads().
13075 int BitOffset = BroadcastIdx * NumEltBits;
13076 SDValue V = V1;
13077 for (;;) {
13078 switch (V.getOpcode()) {
13079 case ISD::BITCAST: {
13080 V = V.getOperand(0);
13081 continue;
13082 }
13083 case ISD::CONCAT_VECTORS: {
13084 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13085 int OpIdx = BitOffset / OpBitWidth;
13086 V = V.getOperand(OpIdx);
13087 BitOffset %= OpBitWidth;
13088 continue;
13089 }
13090 case ISD::EXTRACT_SUBVECTOR: {
13091 // The extraction index adds to the existing offset.
13092 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13093 unsigned Idx = V.getConstantOperandVal(1);
13094 unsigned BeginOffset = Idx * EltBitWidth;
13095 BitOffset += BeginOffset;
13096 V = V.getOperand(0);
13097 continue;
13098 }
13099 case ISD::INSERT_SUBVECTOR: {
13100 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13101 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13102 int Idx = (int)V.getConstantOperandVal(2);
13103 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13104 int BeginOffset = Idx * EltBitWidth;
13105 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13106 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13107 BitOffset -= BeginOffset;
13108 V = VInner;
13109 } else {
13110 V = VOuter;
13111 }
13112 continue;
13113 }
13114 }
13115 break;
13116 }
13117 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13118 BroadcastIdx = BitOffset / NumEltBits;
13119
13120 // Do we need to bitcast the source to retrieve the original broadcast index?
13121 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13122
13123 // Check if this is a broadcast of a scalar. We special case lowering
13124 // for scalars so that we can more effectively fold with loads.
13125 // If the original value has a larger element type than the shuffle, the
13126 // broadcast element is in essence truncated. Make that explicit to ease
13127 // folding.
13128 if (BitCastSrc && VT.isInteger())
13129 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13130 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13131 return TruncBroadcast;
13132
13133 // Also check the simpler case, where we can directly reuse the scalar.
13134 if (!BitCastSrc &&
13135 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13136 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13137 V = V.getOperand(BroadcastIdx);
13138
13139 // If we can't broadcast from a register, check that the input is a load.
13140 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13141 return SDValue();
13142 } else if (ISD::isNormalLoad(V.getNode()) &&
13143 cast<LoadSDNode>(V)->isSimple()) {
13144 // We do not check for one-use of the vector load because a broadcast load
13145 // is expected to be a win for code size, register pressure, and possibly
13146 // uops even if the original vector load is not eliminated.
13147
13148 // Reduce the vector load and shuffle to a broadcasted scalar load.
13149 auto *Ld = cast<LoadSDNode>(V);
13150 SDValue BaseAddr = Ld->getBasePtr();
13151 MVT SVT = VT.getScalarType();
13152 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13153 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13154 SDValue NewAddr =
13155 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13156
13157 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13158 // than MOVDDUP.
13159 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13160 if (Opcode == X86ISD::VBROADCAST) {
13161 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13162 SDValue Ops[] = {Ld->getChain(), NewAddr};
13163 V = DAG.getMemIntrinsicNode(
13164 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13165 DAG.getMachineFunction().getMachineMemOperand(
13166 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13167 DAG.makeEquivalentMemoryOrdering(Ld, V);
13168 return DAG.getBitcast(VT, V);
13169 }
13170 assert(SVT == MVT::f64 && "Unexpected VT!");
13171 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13172 DAG.getMachineFunction().getMachineMemOperand(
13173 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13174 DAG.makeEquivalentMemoryOrdering(Ld, V);
13175 } else if (!BroadcastFromReg) {
13176 // We can't broadcast from a vector register.
13177 return SDValue();
13178 } else if (BitOffset != 0) {
13179 // We can only broadcast from the zero-element of a vector register,
13180 // but it can be advantageous to broadcast from the zero-element of a
13181 // subvector.
13182 if (!VT.is256BitVector() && !VT.is512BitVector())
13183 return SDValue();
13184
13185 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13186 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13187 return SDValue();
13188
13189 // If we are broadcasting an element from the lowest 128-bit subvector, try
13190 // to move the element in position.
13191 if (BitOffset < 128 && NumActiveElts > 1 &&
13192 V.getScalarValueSizeInBits() == NumEltBits) {
13193 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13194 "Unexpected bit-offset");
13195 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13196 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13197 V = extractSubVector(V, 0, DAG, DL, 128);
13198 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13199 } else {
13200 // Only broadcast the zero-element of a 128-bit subvector.
13201 if ((BitOffset % 128) != 0)
13202 return SDValue();
13203
13204 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13205 "Unexpected bit-offset");
13206 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13207 "Unexpected vector size");
13208 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13209 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13210 }
13211 }
13212
13213 // On AVX we can use VBROADCAST directly for scalar sources.
13214 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13215 V = DAG.getBitcast(MVT::f64, V);
13216 if (Subtarget.hasAVX()) {
13217 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13218 return DAG.getBitcast(VT, V);
13219 }
13220 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13221 }
13222
13223 // If this is a scalar, do the broadcast on this type and bitcast.
13224 if (!V.getValueType().isVector()) {
13225 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13226 "Unexpected scalar size");
13227 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13228 VT.getVectorNumElements());
13229 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13230 }
13231
13232 // We only support broadcasting from 128-bit vectors to minimize the
13233 // number of patterns we need to deal with in isel. So extract down to
13234 // 128-bits, removing as many bitcasts as possible.
13235 if (V.getValueSizeInBits() > 128)
13236 V = extract128BitVector(V, 0, DAG, DL);
13237
13238 // Otherwise cast V to a vector with the same element type as VT, but
13239 // possibly narrower than VT. Then perform the broadcast.
13240 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13241 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13242 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13243}
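// [Editorial illustration -- not part of the original source.] The loop above
// tracks the broadcast element as a bit offset while walking through vector
// nodes. For CONCAT_VECTORS the offset selects one operand and is then taken
// modulo that operand's width. Hypothetical stand-alone helper:
struct ConcatStep {
  unsigned OperandIdx;   // which concat operand the element lives in
  unsigned NewBitOffset; // offset of the element within that operand
};
constexpr ConcatStep stepThroughConcat(unsigned BitOffset,
                                       unsigned OpBitWidth) {
  return {BitOffset / OpBitWidth, BitOffset % OpBitWidth};
}
// Broadcasting element 5 of a v8i32 built as a concat of two v4i32s
// (bit offset 160): it lives in operand 1 at bit offset 32, i.e. element 1.
static_assert(stepThroughConcat(160, 128).OperandIdx == 1, "");
static_assert(stepThroughConcat(160, 128).NewBitOffset == 32, "");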
13244
13245// Check for whether we can use INSERTPS to perform the shuffle. We only use
13246// INSERTPS when the V1 elements are already in the correct locations
13247// because otherwise we can just always use two SHUFPS instructions which
13248// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13249// perform INSERTPS if a single V1 element is out of place and all V2
13250// elements are zeroable.
13251 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13252 unsigned &InsertPSMask,
13253 const APInt &Zeroable,
13254 ArrayRef<int> Mask, SelectionDAG &DAG) {
13255 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13257 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13258
13259 // Attempt to match INSERTPS with one element from VA or VB being
13260 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13261 // are updated.
13262 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13263 ArrayRef<int> CandidateMask) {
13264 unsigned ZMask = 0;
13265 int VADstIndex = -1;
13266 int VBDstIndex = -1;
13267 bool VAUsedInPlace = false;
13268
13269 for (int i = 0; i < 4; ++i) {
13270 // Synthesize a zero mask from the zeroable elements (includes undefs).
13271 if (Zeroable[i]) {
13272 ZMask |= 1 << i;
13273 continue;
13274 }
13275
13276 // Flag if we use any VA inputs in place.
13277 if (i == CandidateMask[i]) {
13278 VAUsedInPlace = true;
13279 continue;
13280 }
13281
13282 // We can only insert a single non-zeroable element.
13283 if (VADstIndex >= 0 || VBDstIndex >= 0)
13284 return false;
13285
13286 if (CandidateMask[i] < 4) {
13287 // VA input out of place for insertion.
13288 VADstIndex = i;
13289 } else {
13290 // VB input for insertion.
13291 VBDstIndex = i;
13292 }
13293 }
13294
13295 // Don't bother if we have no (non-zeroable) element for insertion.
13296 if (VADstIndex < 0 && VBDstIndex < 0)
13297 return false;
13298
13299 // Determine element insertion src/dst indices. The src index is from the
13300 // start of the inserted vector, not the start of the concatenated vector.
13301 unsigned VBSrcIndex = 0;
13302 if (VADstIndex >= 0) {
13303 // If we have a VA input out of place, we use VA as the V2 element
13304 // insertion and don't use the original V2 at all.
13305 VBSrcIndex = CandidateMask[VADstIndex];
13306 VBDstIndex = VADstIndex;
13307 VB = VA;
13308 } else {
13309 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13310 }
13311
13312 // If no V1 inputs are used in place, then the result is created only from
13313 // the zero mask and the V2 insertion - so remove V1 dependency.
13314 if (!VAUsedInPlace)
13315 VA = DAG.getUNDEF(MVT::v4f32);
13316
13317 // Update V1, V2 and InsertPSMask accordingly.
13318 V1 = VA;
13319 V2 = VB;
13320
13321 // Insert the V2 element into the desired position.
13322 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13323 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13324 return true;
13325 };
13326
13327 if (matchAsInsertPS(V1, V2, Mask))
13328 return true;
13329
13330 // Commute and try again.
13331 SmallVector<int, 4> CommutedMask(Mask);
13332 ShuffleVectorSDNode::commuteMask(CommutedMask);
13333 if (matchAsInsertPS(V2, V1, CommutedMask))
13334 return true;
13335
13336 return false;
13337}
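// [Editorial illustration -- not part of the original source.] The INSERTPS
// immediate assembled above packs the source lane into bits 7:6, the
// destination lane into bits 5:4 and the zero mask into bits 3:0. A
// hypothetical helper mirroring `VBSrcIndex << 6 | VBDstIndex << 4 | ZMask`:
constexpr unsigned insertPSImm(unsigned SrcLane, unsigned DstLane,
                               unsigned ZMask) {
  return (SrcLane << 6) | (DstLane << 4) | ZMask;
}
// Insert V2[2] into lane 1 of the result while also zeroing lane 3:
static_assert(insertPSImm(2, 1, 0x8) == 0x98, "0b10'01'1000");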
13338
13339 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13340 ArrayRef<int> Mask, const APInt &Zeroable,
13341 SelectionDAG &DAG) {
13342 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13344
13345 // Attempt to match the insertps pattern.
13346 unsigned InsertPSMask = 0;
13347 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13348 return SDValue();
13349
13350 // Insert the V2 element into the desired position.
13351 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13352 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13353}
13354
13355/// Handle lowering of 2-lane 64-bit floating point shuffles.
13356///
13357/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13358/// support for floating point shuffles but not integer shuffles. These
13359/// instructions will incur a domain crossing penalty on some chips though so
13360/// it is better to avoid lowering through this for integer vectors where
13361/// possible.
13362 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363 const APInt &Zeroable, SDValue V1, SDValue V2,
13364 const X86Subtarget &Subtarget,
13365 SelectionDAG &DAG) {
13366 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13368 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13369
13370 if (V2.isUndef()) {
13371 // Check for being able to broadcast a single element.
13372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13373 Mask, Subtarget, DAG))
13374 return Broadcast;
13375
13376 // Straight shuffle of a single input vector. Simulate this by using the
13377 // single input as both of the "inputs" to this instruction.
13378 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13379
13380 if (Subtarget.hasAVX()) {
13381 // If we have AVX, we can use VPERMILPS which will allow folding a load
13382 // into the shuffle.
13383 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13385 }
13386
13387 return DAG.getNode(
13388 X86ISD::SHUFP, DL, MVT::v2f64,
13389 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13391 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13392 }
13393 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13395 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13396 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13397
13398 if (Subtarget.hasAVX2())
13399 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13400 return Extract;
13401
13402 // When loading a scalar and then shuffling it into a vector we can often do
13403 // the insertion cheaply.
13404 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13405 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13406 return Insertion;
13407 // Try inverting the insertion since for v2 masks it is easy to do and we
13408 // can't reliably sort the mask one way or the other.
13409 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13410 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // Try to use one of the special instruction patterns to handle two common
13416 // blend patterns if a zero-blend above didn't work.
13417 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13418 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13419 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13420 // We can either use a special instruction to load over the low double or
13421 // to move just the low double.
13422 return DAG.getNode(
13423 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13424 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13425
13426 if (Subtarget.hasSSE41())
13427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13429 return Blend;
13430
13431 // Use dedicated unpack instructions for masks that match their pattern.
13432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13433 return V;
13434
13435 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13436 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13438}
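// [Editorial illustration -- not part of the original source.] The final
// SHUFPD immediate above uses one bit per result lane: bit 0 selects the low
// or high element of V1, bit 1 selects the low or high element of V2, after
// the mask has been canonicalized so lane 0 reads V1 and lane 1 reads V2.
// Hypothetical helper mirroring `(Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1)`:
constexpr unsigned shufpdImm(int M0, int M1) {
  return unsigned(M0 == 1) | (unsigned((M1 - 2) == 1) << 1);
}
static_assert(shufpdImm(0, 3) == 2, "low of V1, high of V2");
static_assert(shufpdImm(1, 2) == 1, "high of V1, low of V2");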
13439
13440/// Handle lowering of 2-lane 64-bit integer shuffles.
13441///
13442/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13443/// the integer unit to minimize domain crossing penalties. However, for blends
13444/// it falls back to the floating point shuffle operation with appropriate bit
13445/// casting.
13446 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable, SDValue V1, SDValue V2,
13448 const X86Subtarget &Subtarget,
13449 SelectionDAG &DAG) {
13450 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13452 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13453
13454 if (V2.isUndef()) {
13455 // Check for being able to broadcast a single element.
13456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13457 Mask, Subtarget, DAG))
13458 return Broadcast;
13459
13460 // Straight shuffle of a single input vector. For everything from SSE2
13461 // onward this has a single fast instruction with no scary immediates.
13462 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13463 V1 = DAG.getBitcast(MVT::v4i32, V1);
13464 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13465 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13466 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13467 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13468 return DAG.getBitcast(
13469 MVT::v2i64,
13470 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13471 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13472 }
13473 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13475 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13476 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13477
13478 if (Subtarget.hasAVX2())
13479 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13480 return Extract;
13481
13482 // Try to use shift instructions.
13483 if (SDValue Shift =
13484 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13485 DAG, /*BitwiseOnly*/ false))
13486 return Shift;
13487
13488 // When loading a scalar and then shuffling it into a vector we can often do
13489 // the insertion cheaply.
13490 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13491 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13492 return Insertion;
13493 // Try inverting the insertion since for v2 masks it is easy to do and we
13494 // can't reliably sort the mask one way or the other.
13495 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499
13500 // We have different paths for blend lowering, but they all must use the
13501 // *exact* same predicate.
13502 bool IsBlendSupported = Subtarget.hasSSE41();
13503 if (IsBlendSupported)
13504 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13505 Zeroable, Subtarget, DAG))
13506 return Blend;
13507
13508 // Use dedicated unpack instructions for masks that match their pattern.
13509 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13510 return V;
13511
13512 // Try to use byte rotation instructions.
13513 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13514 if (Subtarget.hasSSSE3()) {
13515 if (Subtarget.hasVLX())
13516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13517 Zeroable, Subtarget, DAG))
13518 return Rotate;
13519
13520 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13521 Subtarget, DAG))
13522 return Rotate;
13523 }
13524
13525 // If we have direct support for blends, we should lower by decomposing into
13526 // a permute. That will be faster than the domain cross.
13527 if (IsBlendSupported)
13528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13529 Zeroable, Subtarget, DAG);
13530
13531 // We implement this with SHUFPD which is pretty lame because it will likely
13532 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13533 // However, all the alternatives are still more cycles and newer chips don't
13534 // have this problem. It would be really nice if x86 had better shuffles here.
13535 V1 = DAG.getBitcast(MVT::v2f64, V1);
13536 V2 = DAG.getBitcast(MVT::v2f64, V2);
13537 return DAG.getBitcast(MVT::v2i64,
13538 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13539}
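// [Editorial illustration -- not part of the original source.] A single-input
// v2i64 shuffle is performed as a v4i32 PSHUFD by splitting each 64-bit lane
// index m into the i32 pair {2m, 2m+1} (undef stays undef), exactly as the
// WidenedMask above is built. Hypothetical helper:
constexpr int widenLane(int M, bool High) {
  return M < 0 ? -1 : 2 * M + (High ? 1 : 0);
}
// Mask <1, 0> (swap the two i64 lanes) becomes the v4i32 mask <2, 3, 0, 1>.
static_assert(widenLane(1, false) == 2 && widenLane(1, true) == 3, "");
static_assert(widenLane(0, false) == 0 && widenLane(0, true) == 1, "");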
13540
13541/// Lower a vector shuffle using the SHUFPS instruction.
13542///
13543/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13544/// It makes no assumptions about whether this is the *best* lowering, it simply
13545/// uses it.
13546 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13547 ArrayRef<int> Mask, SDValue V1,
13548 SDValue V2, SelectionDAG &DAG) {
13549 SDValue LowV = V1, HighV = V2;
13550 SmallVector<int, 4> NewMask(Mask);
13551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13552
13553 if (NumV2Elements == 1) {
13554 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13555
13556 // Compute the index adjacent to V2Index and in the same half by toggling
13557 // the low bit.
13558 int V2AdjIndex = V2Index ^ 1;
13559
13560 if (Mask[V2AdjIndex] < 0) {
13561 // Handles all the cases where we have a single V2 element and an undef.
13562 // This will only ever happen in the high lanes because we commute the
13563 // vector otherwise.
13564 if (V2Index < 2)
13565 std::swap(LowV, HighV);
13566 NewMask[V2Index] -= 4;
13567 } else {
13568 // Handle the case where the V2 element ends up adjacent to a V1 element.
13569 // To make this work, blend them together as the first step.
13570 int V1Index = V2AdjIndex;
13571 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13572 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13573 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13574
13575 // Now proceed to reconstruct the final blend as we have the necessary
13576 // high or low half formed.
13577 if (V2Index < 2) {
13578 LowV = V2;
13579 HighV = V1;
13580 } else {
13581 HighV = V2;
13582 }
13583 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13584 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13585 }
13586 } else if (NumV2Elements == 2) {
13587 if (Mask[0] < 4 && Mask[1] < 4) {
13588 // Handle the easy case where we have V1 in the low lanes and V2 in the
13589 // high lanes.
13590 NewMask[2] -= 4;
13591 NewMask[3] -= 4;
13592 } else if (Mask[2] < 4 && Mask[3] < 4) {
13593 // We also handle the reversed case because this utility may get called
13594 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13595 // arrange things in the right direction.
13596 NewMask[0] -= 4;
13597 NewMask[1] -= 4;
13598 HighV = V1;
13599 LowV = V2;
13600 } else {
13601 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13602 // trying to place elements directly, just blend them and set up the final
13603 // shuffle to place them.
13604
13605 // The first two blend mask elements are for V1, the second two are for
13606 // V2.
13607 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13608 Mask[2] < 4 ? Mask[2] : Mask[3],
13609 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13610 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13611 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13612 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13613
13614 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13615 // a blend.
13616 LowV = HighV = V1;
13617 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13618 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13619 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13620 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13621 }
13622 } else if (NumV2Elements == 3) {
13623 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13624 // we can get here due to other paths (e.g. repeated mask matching) that we
13625 // don't want to do another round of lowerVECTOR_SHUFFLE.
13626 ShuffleVectorSDNode::commuteMask(NewMask);
13627 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13628 }
13629 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13630 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13631}
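// [Editorial illustration -- not part of the original source.] The 4-element
// masks built above are ultimately encoded as an 8-bit immediate with two
// bits per result lane (lane i in bits 2i+1:2i), which is what
// getV4X86ShuffleImm8ForMask produces for defined mask elements. Hypothetical
// helper for fully-defined masks:
constexpr unsigned v4ShuffleImm8(unsigned M0, unsigned M1, unsigned M2,
                                 unsigned M3) {
  return (M0 & 3) | ((M1 & 3) << 2) | ((M2 & 3) << 4) | ((M3 & 3) << 6);
}
// The identity mask <0, 1, 2, 3> encodes as 0xE4.
static_assert(v4ShuffleImm8(0, 1, 2, 3) == 0xE4, "");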
13632
13633/// Lower 4-lane 32-bit floating point shuffles.
13634///
13635/// Uses instructions exclusively from the floating point unit to minimize
13636/// domain crossing penalties, as these are sufficient to implement all v4f32
13637/// shuffles.
13638 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13639 const APInt &Zeroable, SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 if (Subtarget.hasSSE41())
13647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13648 Zeroable, Subtarget, DAG))
13649 return Blend;
13650
13651 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13652
13653 if (NumV2Elements == 0) {
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13656 Mask, Subtarget, DAG))
13657 return Broadcast;
13658
13659 // Use even/odd duplicate instructions for masks that match their pattern.
13660 if (Subtarget.hasSSE3()) {
13661 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13662 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13663 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13665 }
13666
13667 if (Subtarget.hasAVX()) {
13668 // If we have AVX, we can use VPERMILPS which will allow folding a load
13669 // into the shuffle.
13670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13671 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13672 }
13673
13674 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13675 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13676 if (!Subtarget.hasSSE2()) {
13677 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13678 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13679 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13680 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13681 }
13682
13683 // Otherwise, use a straight shuffle of a single input vector. We pass the
13684 // input vector to both operands to simulate this with a SHUFPS.
13685 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13687 }
13688
13689 if (Subtarget.hasSSE2())
13690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13691 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13692 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13693 return ZExt;
13694 }
13695
13696 if (Subtarget.hasAVX2())
13697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13698 return Extract;
13699
13700 // There are special ways we can lower some single-element blends. However, we
13701 // have custom ways we can lower more complex single-element blends below that
13702 // we defer to if both this and BLENDPS fail to match, so restrict this to
13703 // when the V2 input is targeting element 0 of the mask -- that is the fast
13704 // case here.
13705 if (NumV2Elements == 1 && Mask[0] >= 4)
13706 if (SDValue V = lowerShuffleAsElementInsertion(
13707 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13708 return V;
13709
13710 if (Subtarget.hasSSE41()) {
13711 // Use INSERTPS if we can complete the shuffle efficiently.
13712 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13713 return V;
13714
13715 if (!isSingleSHUFPSMask(Mask))
13716 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13717 V2, Mask, DAG))
13718 return BlendPerm;
13719 }
13720
13721 // Use low/high mov instructions. These are only valid in SSE1 because
13722 // otherwise they are widened to v2f64 and never get here.
13723 if (!Subtarget.hasSSE2()) {
13724 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13725 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13726 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13727 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13728 }
13729
13730 // Use dedicated unpack instructions for masks that match their pattern.
13731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13732 return V;
13733
13734 // Otherwise fall back to a SHUFPS lowering strategy.
13735 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13736}
13737
13738/// Lower 4-lane i32 vector shuffles.
13739///
13740/// We try to handle these with integer-domain shuffles where we can, but for
13741/// blends we use the floating point domain blend instructions.
13742 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13743 const APInt &Zeroable, SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749
13750 // Whenever we can lower this as a zext, that instruction is strictly faster
13751 // than any alternative. It also allows us to fold memory operands into the
13752 // shuffle in many cases.
13753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13754 Zeroable, Subtarget, DAG))
13755 return ZExt;
13756
13757 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13758
13759 // Try to use shift instructions if fast.
13760 if (Subtarget.preferLowerShuffleAsShift()) {
13761 if (SDValue Shift =
13762 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13763 Subtarget, DAG, /*BitwiseOnly*/ true))
13764 return Shift;
13765 if (NumV2Elements == 0)
13766 if (SDValue Rotate =
13767 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13768 return Rotate;
13769 }
13770
13771 if (NumV2Elements == 0) {
13772 // Try to use broadcast unless the mask only has one non-undef element.
13773 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13774 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13775 Mask, Subtarget, DAG))
13776 return Broadcast;
13777 }
13778
13779 // Straight shuffle of a single input vector. For everything from SSE2
13780 // onward this has a single fast instruction with no scary immediates.
13781 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13782 // but we aren't actually going to use the UNPCK instruction because doing
13783 // so prevents folding a load into this instruction or making a copy.
13784 const int UnpackLoMask[] = {0, 0, 1, 1};
13785 const int UnpackHiMask[] = {2, 2, 3, 3};
13786 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13787 Mask = UnpackLoMask;
13788 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13789 Mask = UnpackHiMask;
13790
13791 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13792 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13793 }
13794
13795 if (Subtarget.hasAVX2())
13796 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13797 return Extract;
13798
13799 // Try to use shift instructions.
13800 if (SDValue Shift =
13801 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13802 DAG, /*BitwiseOnly*/ false))
13803 return Shift;
13804
13805 // There are special ways we can lower some single-element blends.
13806 if (NumV2Elements == 1)
13807 if (SDValue V = lowerShuffleAsElementInsertion(
13808 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13809 return V;
13810
13811 // We have different paths for blend lowering, but they all must use the
13812 // *exact* same predicate.
13813 bool IsBlendSupported = Subtarget.hasSSE41();
13814 if (IsBlendSupported)
13815 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13816 Zeroable, Subtarget, DAG))
13817 return Blend;
13818
13819 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13820 Zeroable, Subtarget, DAG))
13821 return Masked;
13822
13823 // Use dedicated unpack instructions for masks that match their pattern.
13824 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13825 return V;
13826
13827 // Try to use byte rotation instructions.
13828 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13829 if (Subtarget.hasSSSE3()) {
13830 if (Subtarget.hasVLX())
13831 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13832 Zeroable, Subtarget, DAG))
13833 return Rotate;
13834
13835 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13836 Subtarget, DAG))
13837 return Rotate;
13838 }
13839
13840 // Assume that a single SHUFPS is faster than an alternative sequence of
13841 // multiple instructions (even if the CPU has a domain penalty).
13842 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13843 if (!isSingleSHUFPSMask(Mask)) {
13844 // If we have direct support for blends, we should lower by decomposing into
13845 // a permute. That will be faster than the domain cross.
13846 if (IsBlendSupported)
13847 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13848 Zeroable, Subtarget, DAG);
13849
13850 // Try to lower by permuting the inputs into an unpack instruction.
13851 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13852 Mask, Subtarget, DAG))
13853 return Unpack;
13854 }
13855
13856 // We implement this with SHUFPS because it can blend from two vectors.
13857 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13858 // up the inputs, bypassing domain shift penalties that we would incur if we
13859 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13860 // relevant.
13861 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13862 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13863 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13864 return DAG.getBitcast(MVT::v4i32, ShufPS);
13865}
13866
13867/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13868/// shuffle lowering, and the most complex part.
13869///
13870/// The lowering strategy is to try to form pairs of input lanes which are
13871/// targeted at the same half of the final vector, and then use a dword shuffle
13872/// to place them onto the right half, and finally unpack the paired lanes into
13873/// their final position.
13874///
13875/// The exact breakdown of how to form these dword pairs and align them on the
13876/// correct sides is really tricky. See the comments within the function for
13877/// more of the details.
13878///
13879/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13880/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13881/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13882/// vector, form the analogous 128-bit 8-element Mask.
13883 static SDValue lowerV8I16GeneralSingleInputShuffle(
13884 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13885 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13886 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13887 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13888
13889 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13890 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13891 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13892
13893 // Attempt to directly match PSHUFLW or PSHUFHW.
13894 if (isUndefOrInRange(LoMask, 0, 4) &&
13895 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13896 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13897 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13898 }
13899 if (isUndefOrInRange(HiMask, 4, 8) &&
13900 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13901 for (int i = 0; i != 4; ++i)
13902 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13903 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13905 }
13906
13907 SmallVector<int, 4> LoInputs;
13908 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13909 array_pod_sort(LoInputs.begin(), LoInputs.end());
13910 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13911 SmallVector<int, 4> HiInputs;
13912 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13913 array_pod_sort(HiInputs.begin(), HiInputs.end());
13914 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13915 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13916 int NumHToL = LoInputs.size() - NumLToL;
13917 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13918 int NumHToH = HiInputs.size() - NumLToH;
13919 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13920 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13921 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13922 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13923
13924 // If we are shuffling values from one half - check how many different DWORD
13925 // pairs we need to create. If only 1 or 2 then we can perform this as a
13926 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13927 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13928 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13929 V = DAG.getNode(ShufWOp, DL, VT, V,
13930 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13931 V = DAG.getBitcast(PSHUFDVT, V);
13932 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13933 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13934 return DAG.getBitcast(VT, V);
13935 };
13936
13937 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13938 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13939 SmallVector<std::pair<int, int>, 4> DWordPairs;
13940 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13941
13942 // Collect the different DWORD pairs.
13943 for (int DWord = 0; DWord != 4; ++DWord) {
13944 int M0 = Mask[2 * DWord + 0];
13945 int M1 = Mask[2 * DWord + 1];
13946 M0 = (M0 >= 0 ? M0 % 4 : M0);
13947 M1 = (M1 >= 0 ? M1 % 4 : M1);
13948 if (M0 < 0 && M1 < 0)
13949 continue;
13950
13951 bool Match = false;
13952 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13953 auto &DWordPair = DWordPairs[j];
13954 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13955 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13956 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13957 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13958 PSHUFDMask[DWord] = DOffset + j;
13959 Match = true;
13960 break;
13961 }
13962 }
13963 if (!Match) {
13964 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13965 DWordPairs.push_back(std::make_pair(M0, M1));
13966 }
13967 }
13968
13969 if (DWordPairs.size() <= 2) {
13970 DWordPairs.resize(2, std::make_pair(-1, -1));
13971 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13972 DWordPairs[1].first, DWordPairs[1].second};
13973 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13974 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13975 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13976 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13977 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13978 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13979 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13980 }
13981 if ((NumHToL + NumHToH) == 0)
13982 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13983 if ((NumLToL + NumLToH) == 0)
13984 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13985 }
13986 }
13987
13988 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13989 // such inputs we can swap two of the dwords across the half mark and end up
13990 // with <=2 inputs to each half in each half. Once there, we can fall through
13991 // to the generic code below. For example:
13992 //
13993 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13994 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13995 //
13996 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13997 // and an existing 2-into-2 on the other half. In this case we may have to
13998 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13999 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14000 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14001 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14002 // half than the one we target for fixing) will be fixed when we re-enter this
14003 // path. We will also combine away any sequence of PSHUFD instructions that
14004 // result into a single instruction. Here is an example of the tricky case:
14005 //
14006 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14007 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14008 //
14009 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14010 //
14011 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14012 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14013 //
14014 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14015 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14016 //
14017 // The result is fine to be handled by the generic logic.
14018 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14019 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14020 int AOffset, int BOffset) {
14021 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14022 "Must call this with A having 3 or 1 inputs from the A half.");
14023 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14024 "Must call this with B having 1 or 3 inputs from the B half.");
14025 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14026 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14027
14028 bool ThreeAInputs = AToAInputs.size() == 3;
14029
14030 // Compute the index of dword with only one word among the three inputs in
14031 // a half by taking the sum of the half with three inputs and subtracting
14032 // the sum of the actual three inputs. The difference is the remaining
14033 // slot.
14034 int ADWord = 0, BDWord = 0;
14035 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14036 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14037 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14038 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14039 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14040 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14041 int TripleNonInputIdx =
14042 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14043 TripleDWord = TripleNonInputIdx / 2;
14044
14045 // We use xor with one to compute the adjacent DWord to whichever one the
14046 // OneInput is in.
14047 OneInputDWord = (OneInput / 2) ^ 1;
14048
14049 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14050 // and BToA inputs. If there is also such a problem with the BToB and AToB
14051 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14052 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14053 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14054 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14055 // Compute how many inputs will be flipped by swapping these DWords. We
14056 // need
14057 // to balance this to ensure we don't form a 3-1 shuffle in the other
14058 // half.
14059 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14060 llvm::count(AToBInputs, 2 * ADWord + 1);
14061 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14062 llvm::count(BToBInputs, 2 * BDWord + 1);
14063 if ((NumFlippedAToBInputs == 1 &&
14064 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14065 (NumFlippedBToBInputs == 1 &&
14066 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14067 // We choose whether to fix the A half or B half based on whether that
14068 // half has zero flipped inputs. At zero, we may not be able to fix it
14069 // with that half. We also bias towards fixing the B half because that
14070 // will more commonly be the high half, and we have to bias one way.
14071 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14072 ArrayRef<int> Inputs) {
14073 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14074 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14075 // Determine whether the free index is in the flipped dword or the
14076 // unflipped dword based on where the pinned index is. We use this bit
14077 // in an xor to conditionally select the adjacent dword.
14078 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14079 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14080 if (IsFixIdxInput == IsFixFreeIdxInput)
14081 FixFreeIdx += 1;
14082 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14083 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14084 "We need to be changing the number of flipped inputs!");
14085 int PSHUFHalfMask[] = {0, 1, 2, 3};
14086 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14087 V = DAG.getNode(
14088 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14089 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14090 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14091
14092 for (int &M : Mask)
14093 if (M >= 0 && M == FixIdx)
14094 M = FixFreeIdx;
14095 else if (M >= 0 && M == FixFreeIdx)
14096 M = FixIdx;
14097 };
14098 if (NumFlippedBToBInputs != 0) {
14099 int BPinnedIdx =
14100 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14101 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14102 } else {
14103 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14104 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14105 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14106 }
14107 }
14108 }
14109
14110 int PSHUFDMask[] = {0, 1, 2, 3};
14111 PSHUFDMask[ADWord] = BDWord;
14112 PSHUFDMask[BDWord] = ADWord;
14113 V = DAG.getBitcast(
14114 VT,
14115 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14116 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14117
14118 // Adjust the mask to match the new locations of A and B.
14119 for (int &M : Mask)
14120 if (M >= 0 && M/2 == ADWord)
14121 M = 2 * BDWord + M % 2;
14122 else if (M >= 0 && M/2 == BDWord)
14123 M = 2 * ADWord + M % 2;
14124
14125 // Recurse back into this routine to re-compute state now that this isn't
14126 // a 3 and 1 problem.
14127 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14128 };
14129 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14130 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14131 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14132 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14133
14134 // At this point there are at most two inputs to the low and high halves from
14135 // each half. That means the inputs can always be grouped into dwords and
14136 // those dwords can then be moved to the correct half with a dword shuffle.
14137 // We use at most one low and one high word shuffle to collect these paired
14138 // inputs into dwords, and finally a dword shuffle to place them.
14139 int PSHUFLMask[4] = {-1, -1, -1, -1};
14140 int PSHUFHMask[4] = {-1, -1, -1, -1};
14141 int PSHUFDMask[4] = {-1, -1, -1, -1};
14142
14143 // First fix the masks for all the inputs that are staying in their
14144 // original halves. This will then dictate the targets of the cross-half
14145 // shuffles.
14146 auto fixInPlaceInputs =
14147 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14148 MutableArrayRef<int> SourceHalfMask,
14149 MutableArrayRef<int> HalfMask, int HalfOffset) {
14150 if (InPlaceInputs.empty())
14151 return;
14152 if (InPlaceInputs.size() == 1) {
14153 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14154 InPlaceInputs[0] - HalfOffset;
14155 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14156 return;
14157 }
14158 if (IncomingInputs.empty()) {
14159 // Just fix all of the in place inputs.
14160 for (int Input : InPlaceInputs) {
14161 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14162 PSHUFDMask[Input / 2] = Input / 2;
14163 }
14164 return;
14165 }
14166
14167 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14168 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14169 InPlaceInputs[0] - HalfOffset;
14170 // Put the second input next to the first so that they are packed into
14171 // a dword. We find the adjacent index by toggling the low bit.
14172 int AdjIndex = InPlaceInputs[0] ^ 1;
14173 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14174 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14175 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14176 };
14177 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14178 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14179
14180 // Now gather the cross-half inputs and place them into a free dword of
14181 // their target half.
14182 // FIXME: This operation could almost certainly be simplified dramatically to
14183 // look more like the 3-1 fixing operation.
14184 auto moveInputsToRightHalf = [&PSHUFDMask](
14185 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14186 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14187 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14188 int DestOffset) {
14189 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14190 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14191 };
14192 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14193 int Word) {
14194 int LowWord = Word & ~1;
14195 int HighWord = Word | 1;
14196 return isWordClobbered(SourceHalfMask, LowWord) ||
14197 isWordClobbered(SourceHalfMask, HighWord);
14198 };
14199
14200 if (IncomingInputs.empty())
14201 return;
14202
14203 if (ExistingInputs.empty()) {
14204 // Map any dwords with inputs from them into the right half.
14205 for (int Input : IncomingInputs) {
14206 // If the source half mask maps over the inputs, turn those into
14207 // swaps and use the swapped lane.
14208 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14209 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14210 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14211 Input - SourceOffset;
14212 // We have to swap the uses in our half mask in one sweep.
14213 for (int &M : HalfMask)
14214 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14215 M = Input;
14216 else if (M == Input)
14217 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14218 } else {
14219 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14220 Input - SourceOffset &&
14221 "Previous placement doesn't match!");
14222 }
14223 // Note that this correctly re-maps both when we do a swap and when
14224 // we observe the other side of the swap above. We rely on that to
14225 // avoid swapping the members of the input list directly.
14226 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14227 }
14228
14229 // Map the input's dword into the correct half.
14230 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14231 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14232 else
14233 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14234 Input / 2 &&
14235 "Previous placement doesn't match!");
14236 }
14237
14238 // And just directly shift any other-half mask elements to be same-half
14239 // as we will have mirrored the dword containing the element into the
14240 // same position within that half.
14241 for (int &M : HalfMask)
14242 if (M >= SourceOffset && M < SourceOffset + 4) {
14243 M = M - SourceOffset + DestOffset;
14244 assert(M >= 0 && "This should never wrap below zero!");
14245 }
14246 return;
14247 }
14248
14249 // Ensure we have the input in a viable dword of its current half. This
14250 // is particularly tricky because the original position may be clobbered
14251 // by inputs being moved and *staying* in that half.
14252 if (IncomingInputs.size() == 1) {
14253 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14254 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14255 SourceOffset;
14256 SourceHalfMask[InputFixed - SourceOffset] =
14257 IncomingInputs[0] - SourceOffset;
14258 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14259 IncomingInputs[0] = InputFixed;
14260 }
14261 } else if (IncomingInputs.size() == 2) {
14262 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14263 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14264 // We have two non-adjacent or clobbered inputs we need to extract from
14265 // the source half. To do this, we need to map them into some adjacent
14266 // dword slot in the source mask.
14267 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14268 IncomingInputs[1] - SourceOffset};
14269
14270 // If there is a free slot in the source half mask adjacent to one of
14271 // the inputs, place the other input in it. We use (Index XOR 1) to
14272 // compute an adjacent index.
14273 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14274 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14275 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14276 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14277 InputsFixed[1] = InputsFixed[0] ^ 1;
14278 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14279 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14280 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14281 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14282 InputsFixed[0] = InputsFixed[1] ^ 1;
14283 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14284 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14285 // The two inputs are in the same DWord but it is clobbered and the
14286 // adjacent DWord isn't used at all. Move both inputs to the free
14287 // slot.
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14289 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14290 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14291 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14292 } else {
14293 // The only way we hit this point is if there is no clobbering
14294 // (because there are no off-half inputs to this half) and there is no
14295 // free slot adjacent to one of the inputs. In this case, we have to
14296 // swap an input with a non-input.
14297 for (int i = 0; i < 4; ++i)
14298 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14299 "We can't handle any clobbers here!");
14300 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14301 "Cannot have adjacent inputs here!");
14302
14303 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14304 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14305
14306 // We also have to update the final source mask in this case because
14307 // it may need to undo the above swap.
14308 for (int &M : FinalSourceHalfMask)
14309 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14310 M = InputsFixed[1] + SourceOffset;
14311 else if (M == InputsFixed[1] + SourceOffset)
14312 M = (InputsFixed[0] ^ 1) + SourceOffset;
14313
14314 InputsFixed[1] = InputsFixed[0] ^ 1;
14315 }
14316
14317 // Point everything at the fixed inputs.
14318 for (int &M : HalfMask)
14319 if (M == IncomingInputs[0])
14320 M = InputsFixed[0] + SourceOffset;
14321 else if (M == IncomingInputs[1])
14322 M = InputsFixed[1] + SourceOffset;
14323
14324 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14325 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14326 }
14327 } else {
14328 llvm_unreachable("Unhandled input size!");
14329 }
14330
14331 // Now hoist the DWord down to the right half.
14332 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14333 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14334 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14335 for (int &M : HalfMask)
14336 for (int Input : IncomingInputs)
14337 if (M == Input)
14338 M = FreeDWord * 2 + Input % 2;
14339 };
14340 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14341 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14342 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14343 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14344
14345 // Now enact all the shuffles we've computed to move the inputs into their
14346 // target half.
14347 if (!isNoopShuffleMask(PSHUFLMask))
14348 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14349 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14350 if (!isNoopShuffleMask(PSHUFHMask))
14351 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14352 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14353 if (!isNoopShuffleMask(PSHUFDMask))
14354 V = DAG.getBitcast(
14355 VT,
14356 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14357 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14358
14359 // At this point, each half should contain all its inputs, and we can then
14360 // just shuffle them into their final position.
14361 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14362 "Failed to lift all the high half inputs to the low mask!");
14363 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14364 "Failed to lift all the low half inputs to the high mask!");
14365
14366 // Do a half shuffle for the low mask.
14367 if (!isNoopShuffleMask(LoMask))
14368 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14369 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14370
14371 // Do a half shuffle with the high mask after shifting its values down.
14372 for (int &M : HiMask)
14373 if (M >= 0)
14374 M -= 4;
14375 if (!isNoopShuffleMask(HiMask))
14376 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14377 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14378
14379 return V;
14380}
14381
14382/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14383/// blend if only one input is used.
14384static SDValue lowerShuffleAsBlendOfPSHUFBs(
14385 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14386 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14387  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14388 "Lane crossing shuffle masks not supported");
14389
14390 int NumBytes = VT.getSizeInBits() / 8;
14391 int Size = Mask.size();
14392 int Scale = NumBytes / Size;
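  // Each shuffle mask element covers Scale adjacent bytes (e.g. Scale == 2 for
  // a v8i16 mask), so the byte-level PSHUFB control vectors below expand each
  // mask element into Scale consecutive byte indices.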
14393
14394 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14396 V1InUse = false;
14397 V2InUse = false;
14398
14399 for (int i = 0; i < NumBytes; ++i) {
14400 int M = Mask[i / Scale];
14401 if (M < 0)
14402 continue;
14403
14404 const int ZeroMask = 0x80;
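    // PSHUFB writes zero to a destination byte whenever bit 7 of its control
    // byte is set, so 0x80 marks bytes that must not come from this input.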
14405 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14406 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14407 if (Zeroable[i / Scale])
14408 V1Idx = V2Idx = ZeroMask;
14409
14410 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14411 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14412 V1InUse |= (ZeroMask != V1Idx);
14413 V2InUse |= (ZeroMask != V2Idx);
14414 }
14415
14416 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14417 if (V1InUse)
14418 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14419 DAG.getBuildVector(ShufVT, DL, V1Mask));
14420 if (V2InUse)
14421 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14422 DAG.getBuildVector(ShufVT, DL, V2Mask));
14423
14424 // If we need shuffled inputs from both, blend the two.
14425 SDValue V;
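  // Every byte not sourced from an input was zeroed by that input's PSHUFB
  // control mask, so a plain OR of the two shuffled vectors acts as the blend.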
14426 if (V1InUse && V2InUse)
14427 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14428 else
14429 V = V1InUse ? V1 : V2;
14430
14431 // Cast the result back to the correct type.
14432 return DAG.getBitcast(VT, V);
14433}
14434
14435/// Generic lowering of 8-lane i16 shuffles.
14436///
14437/// This handles both single-input shuffles and combined shuffle/blends with
14438/// two inputs. The single input shuffles are immediately delegated to
14439/// a dedicated lowering routine.
14440///
14441/// The blends are lowered in one of three fundamental ways. If there are few
14442/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14443/// of the input is significantly cheaper when lowered as an interleaving of
14444/// the two inputs, try to interleave them. Otherwise, blend the low and high
14445/// halves of the inputs separately (making them have relatively few inputs)
14446/// and then concatenate them.
14447static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14448 const APInt &Zeroable, SDValue V1, SDValue V2,
14449 const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14453 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14454
14455 // Whenever we can lower this as a zext, that instruction is strictly faster
14456 // than any alternative.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461  // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14467
14468 if (NumV2Inputs == 0) {
14469 // Try to use shift instructions.
14470 if (SDValue Shift =
14471 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14472 Subtarget, DAG, /*BitwiseOnly*/ false))
14473 return Shift;
14474
14475 // Check for being able to broadcast a single element.
14476 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14477 Mask, Subtarget, DAG))
14478 return Broadcast;
14479
14480 // Try to use bit rotation instructions.
14481 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14482 Subtarget, DAG))
14483 return Rotate;
14484
14485 // Use dedicated unpack instructions for masks that match their pattern.
14486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14487 return V;
14488
14489 // Use dedicated pack instructions for masks that match their pattern.
14490 if (SDValue V =
14491 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14492 return V;
14493
14494 // Try to use byte rotation instructions.
14495 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14496 Subtarget, DAG))
14497 return Rotate;
14498
14499 // Make a copy of the mask so it can be modified.
14500 SmallVector<int, 8> MutableMask(Mask);
14501 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14502 Subtarget, DAG);
14503 }
14504
14505 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14506 "All single-input shuffles should be canonicalized to be V1-input "
14507 "shuffles.");
14508
14509 // Try to use shift instructions.
14510 if (SDValue Shift =
14511 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14512 DAG, /*BitwiseOnly*/ false))
14513 return Shift;
14514
14515 // See if we can use SSE4A Extraction / Insertion.
14516 if (Subtarget.hasSSE4A())
14517 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14518 Zeroable, DAG))
14519 return V;
14520
14521 // There are special ways we can lower some single-element blends.
14522 if (NumV2Inputs == 1)
14523    if (SDValue V = lowerShuffleAsElementInsertion(
14524 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14525 return V;
14526
14527 // We have different paths for blend lowering, but they all must use the
14528 // *exact* same predicate.
14529 bool IsBlendSupported = Subtarget.hasSSE41();
14530 if (IsBlendSupported)
14531 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14532 Zeroable, Subtarget, DAG))
14533 return Blend;
14534
14535 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14536 Zeroable, Subtarget, DAG))
14537 return Masked;
14538
14539 // Use dedicated unpack instructions for masks that match their pattern.
14540 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14541 return V;
14542
14543 // Use dedicated pack instructions for masks that match their pattern.
14544 if (SDValue V =
14545 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14546 return V;
14547
14548  // Try to lower using a truncation.
14549 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14550 Subtarget, DAG))
14551 return V;
14552
14553 // Try to use byte rotation instructions.
14554 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14555 Subtarget, DAG))
14556 return Rotate;
14557
14558 if (SDValue BitBlend =
14559 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14560 return BitBlend;
14561
14562 // Try to use byte shift instructions to mask.
14563 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14564 Zeroable, Subtarget, DAG))
14565 return V;
14566
14567  // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
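  // (VLX targets are skipped here since the VPMOV/VTRUNC based lowerings
  // attempted above already handle these truncation patterns directly.)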
14568 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14569 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14570 !Subtarget.hasVLX()) {
14571 // Check if this is part of a 256-bit vector truncation.
14572 unsigned PackOpc = 0;
14573 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14574        V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14575        V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14576 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14577 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14578 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14579 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14580 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14581 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14582 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14583 PackOpc = X86ISD::PACKUS;
14584 } else if (Subtarget.hasSSE41()) {
14585 SmallVector<SDValue, 4> DWordClearOps(4,
14586 DAG.getConstant(0, DL, MVT::i32));
14587 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14588 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14589 SDValue DWordClearMask =
14590 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14591 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14592 DWordClearMask);
14593 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14594 DWordClearMask);
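      // With the upper word of every dword cleared, PACKUSDW cannot saturate,
      // so the pack simply keeps the low 16 bits of each dword, i.e. exactly
      // the even-index i16 elements being selected.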
14595 PackOpc = X86ISD::PACKUS;
14596 } else if (!Subtarget.hasSSSE3()) {
14597 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14598 V1 = DAG.getBitcast(MVT::v4i32, V1);
14599 V2 = DAG.getBitcast(MVT::v4i32, V2);
14600 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14601 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14602 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14603 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14604 PackOpc = X86ISD::PACKSS;
14605 }
14606 if (PackOpc) {
14607 // Now pack things back together.
14608 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14609 if (NumEvenDrops == 2) {
14610 Result = DAG.getBitcast(MVT::v4i32, Result);
14611 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14612 }
14613 return Result;
14614 }
14615 }
14616
14617 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14618 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14619 if (NumOddDrops == 1) {
14620 bool HasSSE41 = Subtarget.hasSSE41();
14621 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14622 DAG.getBitcast(MVT::v4i32, V1),
14623 DAG.getTargetConstant(16, DL, MVT::i8));
14624 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14625 DAG.getBitcast(MVT::v4i32, V2),
14626 DAG.getTargetConstant(16, DL, MVT::i8));
14627 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14628 MVT::v8i16, V1, V2);
14629 }
14630
14631 // Try to lower by permuting the inputs into an unpack instruction.
14632 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14633 Mask, Subtarget, DAG))
14634 return Unpack;
14635
14636 // If we can't directly blend but can use PSHUFB, that will be better as it
14637 // can both shuffle and set up the inefficient blend.
14638 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14639 bool V1InUse, V2InUse;
14640 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14641 Zeroable, DAG, V1InUse, V2InUse);
14642 }
14643
14644  // We can always bit-blend if we have to, so the fallback strategy is to
14645 // decompose into single-input permutes and blends/unpacks.
14646 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14647 Zeroable, Subtarget, DAG);
14648}
14649
14650/// Lower 8-lane 16-bit floating point shuffles.
14651static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14652 const APInt &Zeroable, SDValue V1, SDValue V2,
14653 const X86Subtarget &Subtarget,
14654 SelectionDAG &DAG) {
14655 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14657 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14659
14660 if (Subtarget.hasFP16()) {
14661 if (NumV2Elements == 0) {
14662 // Check for being able to broadcast a single element.
14663 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14664 Mask, Subtarget, DAG))
14665 return Broadcast;
14666 }
14667 if (NumV2Elements == 1 && Mask[0] >= 8)
14668      if (SDValue V = lowerShuffleAsElementInsertion(
14669 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14670 return V;
14671 }
14672
14673 V1 = DAG.getBitcast(MVT::v8i16, V1);
14674 V2 = DAG.getBitcast(MVT::v8i16, V2);
14675 return DAG.getBitcast(MVT::v8f16,
14676 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14677}
14678
14679// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14680// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14681// the active subvector is extracted.
14682static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14683 ArrayRef<int> OriginalMask, SDValue V1,
14684 SDValue V2, const X86Subtarget &Subtarget,
14685 SelectionDAG &DAG) {
14686 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14687 SmallVector<int, 32> Mask(OriginalMask);
14688 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14689 !isShuffleFoldableLoad(V2)) {
14690    ShuffleVectorSDNode::commuteMask(Mask);
14691 std::swap(V1, V2);
14692 }
14693
14694 MVT MaskVT = VT.changeTypeToInteger();
14695 SDValue MaskNode;
14696 MVT ShuffleVT = VT;
14697 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14698 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14699 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14700 ShuffleVT = V1.getSimpleValueType();
14701
14702 // Adjust mask to correct indices for the second input.
14703 int NumElts = VT.getVectorNumElements();
14704 unsigned Scale = 512 / VT.getSizeInBits();
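    // E.g. a v16i8 shuffle widened to 512 bits has Scale == 4, so an index of
    // 16 (the first element of V2) becomes 16 + 3 * 16 == 64, which is where
    // the widened V2 starts in the combined VPERMV3 index space.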
14705 SmallVector<int, 32> AdjustedMask(Mask);
14706 for (int &M : AdjustedMask)
14707 if (NumElts <= M)
14708 M += (Scale - 1) * NumElts;
14709 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14710 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14711 } else {
14712 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14713 }
14714
14715 SDValue Result;
14716 if (V2.isUndef())
14717 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14718 else
14719 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14720
14721 if (VT != ShuffleVT)
14722 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14723
14724 return Result;
14725}
14726
14727/// Generic lowering of v16i8 shuffles.
14728///
14729/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14730/// detect any complexity reducing interleaving. If that doesn't help, it uses
14731/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14732/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14733/// back together.
14734static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14735 const APInt &Zeroable, SDValue V1, SDValue V2,
14736 const X86Subtarget &Subtarget,
14737 SelectionDAG &DAG) {
14738 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14740 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14741
14742 // Try to use shift instructions.
14743 if (SDValue Shift =
14744 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14745 DAG, /*BitwiseOnly*/ false))
14746 return Shift;
14747
14748 // Try to use byte rotation instructions.
14749 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14750 Subtarget, DAG))
14751 return Rotate;
14752
14753 // Use dedicated pack instructions for masks that match their pattern.
14754 if (SDValue V =
14755 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14756 return V;
14757
14758 // Try to use a zext lowering.
14759 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14760 Zeroable, Subtarget, DAG))
14761 return ZExt;
14762
14763  // Try to lower using a truncation.
14764 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14765 Subtarget, DAG))
14766 return V;
14767
14768 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14769 Subtarget, DAG))
14770 return V;
14771
14772 // See if we can use SSE4A Extraction / Insertion.
14773 if (Subtarget.hasSSE4A())
14774 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14775 Zeroable, DAG))
14776 return V;
14777
14778 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14779
14780 // For single-input shuffles, there are some nicer lowering tricks we can use.
14781 if (NumV2Elements == 0) {
14782 // Check for being able to broadcast a single element.
14783 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14784 Mask, Subtarget, DAG))
14785 return Broadcast;
14786
14787 // Try to use bit rotation instructions.
14788 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14789 Subtarget, DAG))
14790 return Rotate;
14791
14792 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14793 return V;
14794
14795 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14796 // Notably, this handles splat and partial-splat shuffles more efficiently.
14797 // However, it only makes sense if the pre-duplication shuffle simplifies
14798 // things significantly. Currently, this means we need to be able to
14799 // express the pre-duplication shuffle as an i16 shuffle.
14800 //
14801 // FIXME: We should check for other patterns which can be widened into an
14802 // i16 shuffle as well.
14803 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14804 for (int i = 0; i < 16; i += 2)
14805 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14806 return false;
14807
14808 return true;
14809 };
14810 auto tryToWidenViaDuplication = [&]() -> SDValue {
14811 if (!canWidenViaDuplication(Mask))
14812 return SDValue();
14813 SmallVector<int, 4> LoInputs;
14814 copy_if(Mask, std::back_inserter(LoInputs),
14815 [](int M) { return M >= 0 && M < 8; });
14816 array_pod_sort(LoInputs.begin(), LoInputs.end());
14817 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14818 SmallVector<int, 4> HiInputs;
14819 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14820 array_pod_sort(HiInputs.begin(), HiInputs.end());
14821 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14822
14823 bool TargetLo = LoInputs.size() >= HiInputs.size();
14824 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14825 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14826
14827 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14828      SmallDenseMap<int, int, 8> LaneMap;
14829 for (int I : InPlaceInputs) {
14830 PreDupI16Shuffle[I/2] = I/2;
14831 LaneMap[I] = I;
14832 }
14833 int j = TargetLo ? 0 : 4, je = j + 4;
14834 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14835 // Check if j is already a shuffle of this input. This happens when
14836 // there are two adjacent bytes after we move the low one.
14837 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14838 // If we haven't yet mapped the input, search for a slot into which
14839 // we can map it.
14840 while (j < je && PreDupI16Shuffle[j] >= 0)
14841 ++j;
14842
14843 if (j == je)
14844 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14845 return SDValue();
14846
14847 // Map this input with the i16 shuffle.
14848 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14849 }
14850
14851 // Update the lane map based on the mapping we ended up with.
14852 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14853 }
14854 V1 = DAG.getBitcast(
14855 MVT::v16i8,
14856 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14857 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14858
14859 // Unpack the bytes to form the i16s that will be shuffled into place.
14860 bool EvenInUse = false, OddInUse = false;
14861 for (int i = 0; i < 16; i += 2) {
14862 EvenInUse |= (Mask[i + 0] >= 0);
14863 OddInUse |= (Mask[i + 1] >= 0);
14864 if (EvenInUse && OddInUse)
14865 break;
14866 }
14867 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14868 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14869 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
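      // The unpack spreads each selected byte into an i16 lane (duplicating it
      // when both the even and odd byte positions are used), so the v8i16
      // shuffle below can move whole lanes into their final place.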
14870
14871 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14872 for (int i = 0; i < 16; ++i)
14873 if (Mask[i] >= 0) {
14874 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14875 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14876 if (PostDupI16Shuffle[i / 2] < 0)
14877 PostDupI16Shuffle[i / 2] = MappedMask;
14878 else
14879 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14880 "Conflicting entries in the original shuffle!");
14881 }
14882 return DAG.getBitcast(
14883 MVT::v16i8,
14884 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14885 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14886 };
14887 if (SDValue V = tryToWidenViaDuplication())
14888 return V;
14889 }
14890
14891 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14892 Zeroable, Subtarget, DAG))
14893 return Masked;
14894
14895 // Use dedicated unpack instructions for masks that match their pattern.
14896 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14897 return V;
14898
14899 // Try to use byte shift instructions to mask.
14900 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14901 Zeroable, Subtarget, DAG))
14902 return V;
14903
14904 // Check for compaction patterns.
14905 bool IsSingleInput = V2.isUndef();
14906 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14907
14908 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14909 // with PSHUFB. It is important to do this before we attempt to generate any
14910 // blends but after all of the single-input lowerings. If the single input
14911 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14912 // want to preserve that and we can DAG combine any longer sequences into
14913 // a PSHUFB in the end. But once we start blending from multiple inputs,
14914 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14915 // and there are *very* few patterns that would actually be faster than the
14916 // PSHUFB approach because of its ability to zero lanes.
14917 //
14918 // If the mask is a binary compaction, we can more efficiently perform this
14919 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14920 //
14921 // FIXME: The only exceptions to the above are blends which are exact
14922 // interleavings with direct instructions supporting them. We currently don't
14923 // handle those well here.
14924 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14925 bool V1InUse = false;
14926 bool V2InUse = false;
14927
14928    SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14929 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14930
14931 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14932 // do so. This avoids using them to handle blends-with-zero which is
14933 // important as a single pshufb is significantly faster for that.
14934 if (V1InUse && V2InUse) {
14935 if (Subtarget.hasSSE41())
14936 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14937 Zeroable, Subtarget, DAG))
14938 return Blend;
14939
14940 // We can use an unpack to do the blending rather than an or in some
14941 // cases. Even though the or may be (very minorly) more efficient, we
14942 // preference this lowering because there are common cases where part of
14943      // prefer this lowering because there are common cases where part of
14944 // an unpack.
14945 // FIXME: It might be worth trying to detect if the unpack-feeding
14946 // shuffles will both be pshufb, in which case we shouldn't bother with
14947 // this.
14948      if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14949 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14950 return Unpack;
14951
14952 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14953 if (Subtarget.hasVBMI())
14954 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14955 DAG);
14956
14957 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14958 if (Subtarget.hasXOP()) {
14959 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14960 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14961 }
14962
14963 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14964 // PALIGNR will be cheaper than the second PSHUFB+OR.
14965      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14966 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14967 return V;
14968 }
14969
14970 return PSHUFB;
14971 }
14972
14973 // There are special ways we can lower some single-element blends.
14974 if (NumV2Elements == 1)
14975    if (SDValue V = lowerShuffleAsElementInsertion(
14976 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14977 return V;
14978
14979 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14980 return Blend;
14981
14982 // Check whether a compaction lowering can be done. This handles shuffles
14983 // which take every Nth element for some even N. See the helper function for
14984 // details.
14985 //
14986 // We special case these as they can be particularly efficiently handled with
14987  // the PACKUSWB instruction on x86 and they show up in common patterns of
14988 // rearranging bytes to truncate wide elements.
14989 if (NumEvenDrops) {
14990 // NumEvenDrops is the power of two stride of the elements. Another way of
14991 // thinking about it is that we need to drop the even elements this many
14992 // times to get the original input.
14993
14994 // First we need to zero all the dropped bytes.
14995 assert(NumEvenDrops <= 3 &&
14996 "No support for dropping even elements more than 3 times.");
14997 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14998 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14999 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15000 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15001 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15002 WordClearMask);
15003 if (!IsSingleInput)
15004 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15005 WordClearMask);
15006
15007 // Now pack things back together.
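    // With the high byte of every word cleared, PACKUSWB cannot saturate and
    // simply keeps the low byte of each word, dropping the unwanted elements.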
15008 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15009 IsSingleInput ? V1 : V2);
15010 for (int i = 1; i < NumEvenDrops; ++i) {
15011 Result = DAG.getBitcast(MVT::v8i16, Result);
15012 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15013 }
15014 return Result;
15015 }
15016
15017 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15018 if (NumOddDrops == 1) {
15019 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15020 DAG.getBitcast(MVT::v8i16, V1),
15021 DAG.getTargetConstant(8, DL, MVT::i8));
15022 if (!IsSingleInput)
15023 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15024 DAG.getBitcast(MVT::v8i16, V2),
15025 DAG.getTargetConstant(8, DL, MVT::i8));
15026 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15027 IsSingleInput ? V1 : V2);
15028 }
15029
15030 // Handle multi-input cases by blending/unpacking single-input shuffles.
15031 if (NumV2Elements > 0)
15032 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15033 Zeroable, Subtarget, DAG);
15034
15035 // The fallback path for single-input shuffles widens this into two v8i16
15036 // vectors with unpacks, shuffles those, and then pulls them back together
15037 // with a pack.
15038 SDValue V = V1;
15039
15040 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15042 for (int i = 0; i < 16; ++i)
15043 if (Mask[i] >= 0)
15044 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15045
15046 SDValue VLoHalf, VHiHalf;
15047 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15048 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15049 // i16s.
15050 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15051 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15052 // Use a mask to drop the high bytes.
15053 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15054 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15055 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15056
15057 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15058 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15059
15060 // Squash the masks to point directly into VLoHalf.
15061 for (int &M : LoBlendMask)
15062 if (M >= 0)
15063 M /= 2;
15064 for (int &M : HiBlendMask)
15065 if (M >= 0)
15066 M /= 2;
15067 } else {
15068 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15069 // VHiHalf so that we can blend them as i16s.
15070 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15071
15072 VLoHalf = DAG.getBitcast(
15073 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15074 VHiHalf = DAG.getBitcast(
15075 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15076 }
15077
15078 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15079 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15080
15081 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15082}
15083
15084/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15085///
15086/// This routine breaks down the specific type of 128-bit shuffle and
15087/// dispatches to the lowering routines accordingly.
15088static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15089 MVT VT, SDValue V1, SDValue V2,
15090 const APInt &Zeroable,
15091 const X86Subtarget &Subtarget,
15092 SelectionDAG &DAG) {
15093 if (VT == MVT::v8bf16) {
15094 V1 = DAG.getBitcast(MVT::v8i16, V1);
15095 V2 = DAG.getBitcast(MVT::v8i16, V2);
15096 return DAG.getBitcast(VT,
15097 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15098 }
15099
15100 switch (VT.SimpleTy) {
15101 case MVT::v2i64:
15102 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15103 case MVT::v2f64:
15104 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v4i32:
15106 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v4f32:
15108 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v8i16:
15110 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v8f16:
15112 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v16i8:
15114 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115
15116 default:
15117 llvm_unreachable("Unimplemented!");
15118 }
15119}
15120
15121/// Generic routine to split vector shuffle into half-sized shuffles.
15122///
15123/// This routine just extracts two subvectors, shuffles them independently, and
15124/// then concatenates them back together. This should work effectively with all
15125/// AVX vector shuffle types.
15126static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15127 SDValue V2, ArrayRef<int> Mask,
15128 SelectionDAG &DAG, bool SimpleOnly) {
15129 assert(VT.getSizeInBits() >= 256 &&
15130 "Only for 256-bit or wider vector shuffles!");
15131 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15132 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15133
15134 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15135 if (VT == MVT::v8f32) {
15136 SDValue BC1 = peekThroughBitcasts(V1);
15137 SDValue BC2 = peekThroughBitcasts(V2);
15138 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15139 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15140 DAG, SimpleOnly))
15141 return DAG.getBitcast(VT, Split);
15142 }
15143 }
15144
15145 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15146 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15147
15148 int NumElements = VT.getVectorNumElements();
15149 int SplitNumElements = NumElements / 2;
15150 MVT ScalarVT = VT.getVectorElementType();
15151 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15152
15153 // Use splitVector/extractSubVector so that split build-vectors just build two
15154 // narrower build vectors. This helps shuffling with splats and zeros.
15155 auto SplitVector = [&](SDValue V) {
15156 SDValue LoV, HiV;
15157 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15158 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15159 DAG.getBitcast(SplitVT, HiV));
15160 };
15161
15162 SDValue LoV1, HiV1, LoV2, HiV2;
15163 std::tie(LoV1, HiV1) = SplitVector(V1);
15164 std::tie(LoV2, HiV2) = SplitVector(V2);
15165
15166 // Now create two 4-way blends of these half-width vectors.
15167 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15168 bool &UseHiV1, bool &UseLoV2,
15169 bool &UseHiV2) {
15170 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15171 for (int i = 0; i < SplitNumElements; ++i) {
15172 int M = HalfMask[i];
15173 if (M >= NumElements) {
15174 if (M >= NumElements + SplitNumElements)
15175 UseHiV2 = true;
15176 else
15177 UseLoV2 = true;
15178 } else if (M >= 0) {
15179 if (M >= SplitNumElements)
15180 UseHiV1 = true;
15181 else
15182 UseLoV1 = true;
15183 }
15184 }
15185 };
15186
15187 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15188 if (!SimpleOnly)
15189 return true;
15190
15191 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15192 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15193
15194 return !(UseHiV1 || UseHiV2);
15195 };
15196
15197 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15198 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15200 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15201 for (int i = 0; i < SplitNumElements; ++i) {
15202 int M = HalfMask[i];
15203 if (M >= NumElements) {
15204 V2BlendMask[i] = M - NumElements;
15205 BlendMask[i] = SplitNumElements + i;
15206 } else if (M >= 0) {
15207 V1BlendMask[i] = M;
15208 BlendMask[i] = i;
15209 }
15210 }
15211
15212 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15213 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15214
15215 // Because the lowering happens after all combining takes place, we need to
15216 // manually combine these blend masks as much as possible so that we create
15217 // a minimal number of high-level vector shuffle nodes.
15218 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15219
15220 // First try just blending the halves of V1 or V2.
15221 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15222 return DAG.getUNDEF(SplitVT);
15223 if (!UseLoV2 && !UseHiV2)
15224 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15225 if (!UseLoV1 && !UseHiV1)
15226 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15227
15228 SDValue V1Blend, V2Blend;
15229 if (UseLoV1 && UseHiV1) {
15230 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15231 } else {
15232 // We only use half of V1 so map the usage down into the final blend mask.
15233 V1Blend = UseLoV1 ? LoV1 : HiV1;
15234 for (int i = 0; i < SplitNumElements; ++i)
15235 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15236 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15237 }
15238 if (UseLoV2 && UseHiV2) {
15239 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15240 } else {
15241 // We only use half of V2 so map the usage down into the final blend mask.
15242 V2Blend = UseLoV2 ? LoV2 : HiV2;
15243 for (int i = 0; i < SplitNumElements; ++i)
15244 if (BlendMask[i] >= SplitNumElements)
15245 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15246 }
15247 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15248 };
15249
15250 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15251 return SDValue();
15252
15253 SDValue Lo = HalfBlend(LoMask);
15254 SDValue Hi = HalfBlend(HiMask);
15255 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15256}
15257
15258/// Either split a vector in halves or decompose the shuffles and the
15259/// blend/unpack.
15260///
15261/// This is provided as a good fallback for many lowerings of non-single-input
15262/// shuffles with more than one 128-bit lane. In those cases, we want to select
15263/// between splitting the shuffle into 128-bit components and stitching those
15264/// back together vs. extracting the single-input shuffles and blending those
15265/// results.
15266static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15267 SDValue V2, ArrayRef<int> Mask,
15268 const APInt &Zeroable,
15269 const X86Subtarget &Subtarget,
15270 SelectionDAG &DAG) {
15271 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15272 "shuffles as it could then recurse on itself.");
15273 int Size = Mask.size();
15274
15275 // If this can be modeled as a broadcast of two elements followed by a blend,
15276 // prefer that lowering. This is especially important because broadcasts can
15277 // often fold with memory operands.
15278 auto DoBothBroadcast = [&] {
15279 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15280 for (int M : Mask)
15281 if (M >= Size) {
15282 if (V2BroadcastIdx < 0)
15283 V2BroadcastIdx = M - Size;
15284 else if ((M - Size) != V2BroadcastIdx &&
15285 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15286 return false;
15287 } else if (M >= 0) {
15288 if (V1BroadcastIdx < 0)
15289 V1BroadcastIdx = M;
15290 else if (M != V1BroadcastIdx &&
15291 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15292 return false;
15293 }
15294 return true;
15295 };
15296 if (DoBothBroadcast())
15297 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15298 Subtarget, DAG);
15299
15300 // If the inputs all stem from a single 128-bit lane of each input, then we
15301 // split them rather than blending because the split will decompose to
15302 // unusually few instructions.
15303 int LaneCount = VT.getSizeInBits() / 128;
15304 int LaneSize = Size / LaneCount;
15305 SmallBitVector LaneInputs[2];
15306 LaneInputs[0].resize(LaneCount, false);
15307 LaneInputs[1].resize(LaneCount, false);
15308 for (int i = 0; i < Size; ++i)
15309 if (Mask[i] >= 0)
15310 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15311 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15312 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15313 /*SimpleOnly*/ false);
15314
15315 // Without AVX2, if we can freely split the subvectors then we're better off
15316 // performing half width shuffles.
15317 if (!Subtarget.hasAVX2()) {
15318 SDValue BC1 = peekThroughBitcasts(V1);
15319 SDValue BC2 = peekThroughBitcasts(V2);
15320 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15321 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15322 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15323 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15324 if (SplatOrSplitV1 && SplatOrSplitV2)
15325 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15326 /*SimpleOnly*/ false);
15327 }
15328
15329 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15330 // requires that the decomposed single-input shuffles don't end up here.
15331 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15332 Subtarget, DAG);
15333}
15334
15335// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15336// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15337static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15338 SDValue V1, SDValue V2,
15339 ArrayRef<int> Mask,
15340 SelectionDAG &DAG) {
15341 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15342
15343 int LHSMask[4] = {-1, -1, -1, -1};
15344 int RHSMask[4] = {-1, -1, -1, -1};
15345 int SHUFPDMask[4] = {-1, -1, -1, -1};
15346
15347 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15348 // perform the shuffle once the lanes have been shuffled in place.
15349 for (int i = 0; i != 4; ++i) {
15350 int M = Mask[i];
15351 if (M < 0)
15352 continue;
15353 int LaneBase = i & ~1;
15354 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15355 LaneMask[LaneBase + (M & 1)] = M;
15356 SHUFPDMask[i] = M & 1;
15357 }
15358
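  // SHUFPD takes even result elements from LHS and odd ones from RHS, with
  // each immediate bit selecting one of the two f64s in the matching 128-bit
  // lane, so the two shuffles below only need to move each required element
  // into that lane.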
15359 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15360 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15361 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15362 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15363}
15364
15365/// Lower a vector shuffle crossing multiple 128-bit lanes as
15366/// a lane permutation followed by a per-lane permutation.
15367///
15368/// This is mainly for cases where we can have non-repeating permutes
15369/// in each lane.
15370///
15371/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15372/// we should investigate merging them.
15373static SDValue lowerShuffleAsLanePermuteAndPermute(
15374 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15375 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15376 int NumElts = VT.getVectorNumElements();
15377 int NumLanes = VT.getSizeInBits() / 128;
15378 int NumEltsPerLane = NumElts / NumLanes;
15379 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15380
15381 /// Attempts to find a sublane permute with the given size
15382 /// that gets all elements into their target lanes.
15383 ///
15384  /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
15385  /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
15386 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15387 int NumSublanesPerLane = NumSublanes / NumLanes;
15388 int NumEltsPerSublane = NumElts / NumSublanes;
15389
15390 SmallVector<int, 16> CrossLaneMask;
15391 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15392 // CrossLaneMask but one entry == one sublane.
15393 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15394 APInt DemandedCrossLane = APInt::getZero(NumElts);
15395
15396 for (int i = 0; i != NumElts; ++i) {
15397 int M = Mask[i];
15398 if (M < 0)
15399 continue;
15400
15401 int SrcSublane = M / NumEltsPerSublane;
15402 int DstLane = i / NumEltsPerLane;
15403
15404 // We only need to get the elements into the right lane, not sublane.
15405 // So search all sublanes that make up the destination lane.
15406 bool Found = false;
15407 int DstSubStart = DstLane * NumSublanesPerLane;
15408 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15409 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15410 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15411 continue;
15412
15413 Found = true;
15414 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15415 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15416 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15417 DemandedCrossLane.setBit(InLaneMask[i]);
15418 break;
15419 }
15420 if (!Found)
15421 return SDValue();
15422 }
15423
15424 // Fill CrossLaneMask using CrossLaneMaskLarge.
15425 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15426
15427 if (!CanUseSublanes) {
15428 // If we're only shuffling a single lowest lane and the rest are identity
15429 // then don't bother.
15430 // TODO - isShuffleMaskInputInPlace could be extended to something like
15431 // this.
15432 int NumIdentityLanes = 0;
15433 bool OnlyShuffleLowestLane = true;
15434 for (int i = 0; i != NumLanes; ++i) {
15435 int LaneOffset = i * NumEltsPerLane;
15436 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15437 i * NumEltsPerLane))
15438 NumIdentityLanes++;
15439 else if (CrossLaneMask[LaneOffset] != 0)
15440 OnlyShuffleLowestLane = false;
15441 }
15442 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15443 return SDValue();
15444 }
15445
15446 // Simplify CrossLaneMask based on the actual demanded elements.
15447 if (V1.hasOneUse())
15448 for (int i = 0; i != NumElts; ++i)
15449 if (!DemandedCrossLane[i])
15450 CrossLaneMask[i] = SM_SentinelUndef;
15451
15452 // Avoid returning the same shuffle operation. For example,
15453 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15454 // undef:v16i16
15455 if (CrossLaneMask == Mask || InLaneMask == Mask)
15456 return SDValue();
15457
15458 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15459 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15460 InLaneMask);
15461 };
15462
15463 // First attempt a solution with full lanes.
15464 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15465 return V;
15466
15467 // The rest of the solutions use sublanes.
15468 if (!CanUseSublanes)
15469 return SDValue();
15470
15471 // Then attempt a solution with 64-bit sublanes (vpermq).
15472 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15473 return V;
15474
15475 // If that doesn't work and we have fast variable cross-lane shuffle,
15476 // attempt 32-bit sublanes (vpermd).
15477 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15478 return SDValue();
15479
15480 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15481}
15482
15483/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15484static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15485 SmallVector<int> &InLaneMask) {
15486 int Size = Mask.size();
15487 InLaneMask.assign(Mask.begin(), Mask.end());
15488 for (int i = 0; i < Size; ++i) {
15489 int &M = InLaneMask[i];
15490 if (M < 0)
15491 continue;
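    // Elements sourced from the other 128-bit lane are redirected to the same
    // in-lane position of the second shuffle operand (offset by Size); the
    // caller (lowerShuffleAsLanePermuteAndShuffle) passes a lane-swapped copy
    // of the source as that operand.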
15492 if (((M % Size) / LaneSize) != (i / LaneSize))
15493 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15494 }
15495}
15496
15497/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15498/// source with a lane permutation.
15499///
15500/// This lowering strategy results in four instructions in the worst case for a
15501/// single-input cross lane shuffle which is lower than any other fully general
15502/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15503/// shuffle pattern should be handled prior to trying this lowering.
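/// The cross-lane permute swaps the 128-bit halves of the input (e.g. a
/// vperm2f128 with immediate 0x01); the result is then shuffled in-lane
/// against the original vector using the in-lane mask computed below.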
15504 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15505 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15506 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15507 // FIXME: This should probably be generalized for 512-bit vectors as well.
15508 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15509 int Size = Mask.size();
15510 int LaneSize = Size / 2;
15511
15512 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15513 // Only do this if the elements aren't all from the lower lane,
15514 // otherwise we're (probably) better off doing a split.
15515 if (VT == MVT::v4f64 &&
15516 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15517 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15518
15519 // If there are only inputs from one 128-bit lane, splitting will in fact be
15520 // less expensive. The flags track whether the given lane contains an element
15521 // that crosses to another lane.
15522 bool AllLanes;
15523 if (!Subtarget.hasAVX2()) {
15524 bool LaneCrossing[2] = {false, false};
15525 for (int i = 0; i < Size; ++i)
15526 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15527 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15528 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15529 } else {
15530 bool LaneUsed[2] = {false, false};
15531 for (int i = 0; i < Size; ++i)
15532 if (Mask[i] >= 0)
15533 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15534 AllLanes = LaneUsed[0] && LaneUsed[1];
15535 }
15536
15537 // TODO - we could support shuffling V2 in the Flipped input.
15538 assert(V2.isUndef() &&
15539 "This last part of this routine only works on single input shuffles");
15540
15541 SmallVector<int> InLaneMask;
15542 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15543
15544 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15545 "In-lane shuffle mask expected");
15546
15547 // If we aren't using elements from both 128-bit lanes and the in-lane
15548 // mask is not repeating, then we're better off splitting.
15549 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15550 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15551 /*SimpleOnly*/ false);
15552
15553 // Flip the lanes, and shuffle the results which should now be in-lane.
15554 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15555 SDValue Flipped = DAG.getBitcast(PVT, V1);
15556 Flipped =
15557 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15558 Flipped = DAG.getBitcast(VT, Flipped);
15559 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15560}
15561
15562/// Handle lowering 2-lane 128-bit shuffles.
15563 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15564 SDValue V2, ArrayRef<int> Mask,
15565 const APInt &Zeroable,
15566 const X86Subtarget &Subtarget,
15567 SelectionDAG &DAG) {
15568 if (V2.isUndef()) {
15569 // Attempt to match VBROADCAST*128 subvector broadcast load.
15570 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15571 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15572 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15573 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15574 MVT MemVT = VT.getHalfNumVectorElementsVT();
15575 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15576 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15577 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15578 VT, MemVT, Ld, Ofs, DAG))
15579 return BcstLd;
15580 }
15581
15582 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15583 if (Subtarget.hasAVX2())
15584 return SDValue();
15585 }
15586
15587 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15588
15589 SmallVector<int, 4> WidenedMask;
15590 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15591 return SDValue();
15592
15593 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15594 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15595
15596 // Try to use an insert into a zero vector.
15597 if (WidenedMask[0] == 0 && IsHighZero) {
15598 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15599 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15600 DAG.getVectorIdxConstant(0, DL));
15601 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15602 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15603 DAG.getVectorIdxConstant(0, DL));
15604 }
15605
15606 // TODO: If minimizing size and one of the inputs is a zero vector and the
15607 // zero vector has only one use, we could use a VPERM2X128 to save the
15608 // instruction bytes needed to explicitly generate the zero vector.
15609
15610 // Blends are faster and handle all the non-lane-crossing cases.
15611 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15612 Subtarget, DAG))
15613 return Blend;
15614
15615 // If either input operand is a zero vector, use VPERM2X128 because its mask
15616 // allows us to replace the zero input with an implicit zero.
15617 if (!IsLowZero && !IsHighZero) {
15618 // Check for patterns which can be matched with a single insert of a 128-bit
15619 // subvector.
15620 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15621 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15622
15623 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15624 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15625 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15626 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15627 SDValue SubVec =
15628 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15629 DAG.getVectorIdxConstant(0, DL));
15630 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15631 DAG.getVectorIdxConstant(2, DL));
15632 }
15633 }
15634
15635 // Try to use SHUF128 if possible.
15636 if (Subtarget.hasVLX()) {
15637 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15638 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15639 ((WidenedMask[1] % 2) << 1);
15640 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15641 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15642 }
15643 }
15644 }
15645
15646 // Otherwise form a 128-bit permutation. After accounting for undefs,
15647 // convert the 64-bit shuffle mask selection values into 128-bit
15648 // selection bits by dividing the indexes by 2 and shifting into positions
15649 // defined by a vperm2*128 instruction's immediate control byte.
15650
15651 // The immediate permute control byte looks like this:
15652 // [1:0] - select 128 bits from sources for low half of destination
15653 // [2] - ignore
15654 // [3] - zero low half of destination
15655 // [5:4] - select 128 bits from sources for high half of destination
15656 // [6] - ignore
15657 // [7] - zero high half of destination
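// e.g. a widened mask <1, 2> (upper half of V1, lower half of V2) gives
// PermMask = (1 << 0) | (2 << 4) = 0x21.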
15658
15659 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15660 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15661
15662 unsigned PermMask = 0;
15663 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15664 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15665
15666 // Check the immediate mask and replace unused sources with undef.
15667 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15668 V1 = DAG.getUNDEF(VT);
15669 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15670 V2 = DAG.getUNDEF(VT);
15671
15672 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15673 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15674}
15675
15676/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15677/// shuffling each lane.
15678///
15679/// This attempts to create a repeated lane shuffle where each lane uses one
15680/// or two of the lanes of the inputs. The lanes of the input vectors are
15681/// shuffled in one or two independent shuffles to get the lanes into the
15682/// position needed by the final shuffle.
15683 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15684 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15685 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15686 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15687
15688 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15689 return SDValue();
15690
15691 int NumElts = Mask.size();
15692 int NumLanes = VT.getSizeInBits() / 128;
15693 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15694 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15695 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15696
15697 // First pass will try to fill in the RepeatMask from lanes that need two
15698 // sources.
15699 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15700 int Srcs[2] = {-1, -1};
15701 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15702 for (int i = 0; i != NumLaneElts; ++i) {
15703 int M = Mask[(Lane * NumLaneElts) + i];
15704 if (M < 0)
15705 continue;
15706 // Determine which of the possible input lanes (NumLanes from each source)
15707 // this element comes from. Assign that as one of the sources for this
15708 // lane. We can assign up to 2 sources for this lane. If we run out
15709 // of sources we can't do anything.
15710 int LaneSrc = M / NumLaneElts;
15711 int Src;
15712 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15713 Src = 0;
15714 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15715 Src = 1;
15716 else
15717 return SDValue();
15718
15719 Srcs[Src] = LaneSrc;
15720 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15721 }
15722
15723 // If this lane has two sources, see if it fits with the repeat mask so far.
15724 if (Srcs[1] < 0)
15725 continue;
15726
15727 LaneSrcs[Lane][0] = Srcs[0];
15728 LaneSrcs[Lane][1] = Srcs[1];
15729
15730 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15731 assert(M1.size() == M2.size() && "Unexpected mask size");
15732 for (int i = 0, e = M1.size(); i != e; ++i)
15733 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15734 return false;
15735 return true;
15736 };
15737
15738 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15739 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15740 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15741 int M = Mask[i];
15742 if (M < 0)
15743 continue;
15744 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15745 "Unexpected mask element");
15746 MergedMask[i] = M;
15747 }
15748 };
15749
15750 if (MatchMasks(InLaneMask, RepeatMask)) {
15751 // Merge this lane mask into the final repeat mask.
15752 MergeMasks(InLaneMask, RepeatMask);
15753 continue;
15754 }
15755
15756 // Didn't find a match. Swap the operands and try again.
15757 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15758 ShuffleVectorSDNode::commuteMask(InLaneMask);
15759
15760 if (MatchMasks(InLaneMask, RepeatMask)) {
15761 // Merge this lane mask into the final repeat mask.
15762 MergeMasks(InLaneMask, RepeatMask);
15763 continue;
15764 }
15765
15766 // Couldn't find a match with the operands in either order.
15767 return SDValue();
15768 }
15769
15770 // Now handle any lanes with only one source.
15771 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15772 // If this lane has already been processed, skip it.
15773 if (LaneSrcs[Lane][0] >= 0)
15774 continue;
15775
15776 for (int i = 0; i != NumLaneElts; ++i) {
15777 int M = Mask[(Lane * NumLaneElts) + i];
15778 if (M < 0)
15779 continue;
15780
15781 // If RepeatMask isn't defined yet we can define it ourselves.
15782 if (RepeatMask[i] < 0)
15783 RepeatMask[i] = M % NumLaneElts;
15784
15785 if (RepeatMask[i] < NumElts) {
15786 if (RepeatMask[i] != M % NumLaneElts)
15787 return SDValue();
15788 LaneSrcs[Lane][0] = M / NumLaneElts;
15789 } else {
15790 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15791 return SDValue();
15792 LaneSrcs[Lane][1] = M / NumLaneElts;
15793 }
15794 }
15795
15796 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15797 return SDValue();
15798 }
15799
15800 SmallVector<int, 16> NewMask(NumElts, -1);
15801 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15802 int Src = LaneSrcs[Lane][0];
15803 for (int i = 0; i != NumLaneElts; ++i) {
15804 int M = -1;
15805 if (Src >= 0)
15806 M = Src * NumLaneElts + i;
15807 NewMask[Lane * NumLaneElts + i] = M;
15808 }
15809 }
15810 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15811 // Ensure we didn't get back the shuffle we started with.
15812 // FIXME: This is a hack to make up for some splat handling code in
15813 // getVectorShuffle.
15814 if (isa<ShuffleVectorSDNode>(NewV1) &&
15815 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15816 return SDValue();
15817
15818 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15819 int Src = LaneSrcs[Lane][1];
15820 for (int i = 0; i != NumLaneElts; ++i) {
15821 int M = -1;
15822 if (Src >= 0)
15823 M = Src * NumLaneElts + i;
15824 NewMask[Lane * NumLaneElts + i] = M;
15825 }
15826 }
15827 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15828 // Ensure we didn't get back the shuffle we started with.
15829 // FIXME: This is a hack to make up for some splat handling code in
15830 // getVectorShuffle.
15831 if (isa<ShuffleVectorSDNode>(NewV2) &&
15832 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15833 return SDValue();
15834
15835 for (int i = 0; i != NumElts; ++i) {
15836 if (Mask[i] < 0) {
15837 NewMask[i] = -1;
15838 continue;
15839 }
15840 NewMask[i] = RepeatMask[i % NumLaneElts];
15841 if (NewMask[i] < 0)
15842 continue;
15843
15844 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15845 }
15846 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15847}
15848
15849/// If the input shuffle mask results in a vector that is undefined in all upper
15850/// or lower half elements and that mask accesses only 2 halves of the
15851/// shuffle's operands, return true. A mask of half the width with mask indexes
15852/// adjusted to access the extracted halves of the original shuffle operands is
15853/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15854/// lower half of each input operand is accessed.
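/// e.g. for the v8 mask <6, 1, u, 3, u, u, u, u> (upper half undef) this
/// returns HalfMask = <2, 5, u, 7> with HalfIdx1 = 1 (upper half of V1) and
/// HalfIdx2 = 0 (lower half of V1).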
15855static bool
15856 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15857 int &HalfIdx1, int &HalfIdx2) {
15858 assert((Mask.size() == HalfMask.size() * 2) &&
15859 "Expected input mask to be twice as long as output");
15860
15861 // Exactly one half of the result must be undef to allow narrowing.
15862 bool UndefLower = isUndefLowerHalf(Mask);
15863 bool UndefUpper = isUndefUpperHalf(Mask);
15864 if (UndefLower == UndefUpper)
15865 return false;
15866
15867 unsigned HalfNumElts = HalfMask.size();
15868 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15869 HalfIdx1 = -1;
15870 HalfIdx2 = -1;
15871 for (unsigned i = 0; i != HalfNumElts; ++i) {
15872 int M = Mask[i + MaskIndexOffset];
15873 if (M < 0) {
15874 HalfMask[i] = M;
15875 continue;
15876 }
15877
15878 // Determine which of the 4 half vectors this element is from.
15879 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15880 int HalfIdx = M / HalfNumElts;
15881
15882 // Determine the element index into its half vector source.
15883 int HalfElt = M % HalfNumElts;
15884
15885 // We can shuffle with up to 2 half vectors, set the new 'half'
15886 // shuffle mask accordingly.
15887 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15888 HalfMask[i] = HalfElt;
15889 HalfIdx1 = HalfIdx;
15890 continue;
15891 }
15892 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15893 HalfMask[i] = HalfElt + HalfNumElts;
15894 HalfIdx2 = HalfIdx;
15895 continue;
15896 }
15897
15898 // Too many half vectors referenced.
15899 return false;
15900 }
15901
15902 return true;
15903}
15904
15905/// Given the output values from getHalfShuffleMask(), create a half width
15906/// shuffle of extracted vectors followed by an insert back to full width.
15907 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15908 ArrayRef<int> HalfMask, int HalfIdx1,
15909 int HalfIdx2, bool UndefLower,
15910 SelectionDAG &DAG, bool UseConcat = false) {
15911 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15912 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15913
15914 MVT VT = V1.getSimpleValueType();
15915 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15916 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15917
15918 auto getHalfVector = [&](int HalfIdx) {
15919 if (HalfIdx < 0)
15920 return DAG.getUNDEF(HalfVT);
15921 SDValue V = (HalfIdx < 2 ? V1 : V2);
15922 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15923 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15924 DAG.getVectorIdxConstant(HalfIdx, DL));
15925 };
15926
15927 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15928 SDValue Half1 = getHalfVector(HalfIdx1);
15929 SDValue Half2 = getHalfVector(HalfIdx2);
15930 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15931 if (UseConcat) {
15932 SDValue Op0 = V;
15933 SDValue Op1 = DAG.getUNDEF(HalfVT);
15934 if (UndefLower)
15935 std::swap(Op0, Op1);
15936 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15937 }
15938
15939 unsigned Offset = UndefLower ? HalfNumElts : 0;
15940 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15941 DAG.getVectorIdxConstant(Offset, DL));
15942}
15943
15944/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15945/// This allows for fast cases such as subvector extraction/insertion
15946/// or shuffling smaller vector types which can lower more efficiently.
15947 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15948 SDValue V2, ArrayRef<int> Mask,
15949 const X86Subtarget &Subtarget,
15950 SelectionDAG &DAG) {
15951 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15952 "Expected 256-bit or 512-bit vector");
15953
15954 bool UndefLower = isUndefLowerHalf(Mask);
15955 if (!UndefLower && !isUndefUpperHalf(Mask))
15956 return SDValue();
15957
15958 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15959 "Completely undef shuffle mask should have been simplified already");
15960
15961 // Upper half is undef and lower half is whole upper subvector.
15962 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15963 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15964 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15965 if (!UndefLower &&
15966 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15967 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15968 DAG.getVectorIdxConstant(HalfNumElts, DL));
15969 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15970 DAG.getVectorIdxConstant(0, DL));
15971 }
15972
15973 // Lower half is undef and upper half is whole lower subvector.
15974 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15975 if (UndefLower &&
15976 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15977 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15978 DAG.getVectorIdxConstant(0, DL));
15979 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15980 DAG.getVectorIdxConstant(HalfNumElts, DL));
15981 }
15982
15983 int HalfIdx1, HalfIdx2;
15984 SmallVector<int, 8> HalfMask(HalfNumElts);
15985 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15986 return SDValue();
15987
15988 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15989
15990 // Only shuffle the halves of the inputs when useful.
15991 unsigned NumLowerHalves =
15992 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15993 unsigned NumUpperHalves =
15994 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15995 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15996
15997 // Determine the larger pattern of undef/halves, then decide if it's worth
15998 // splitting the shuffle based on subtarget capabilities and types.
15999 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16000 if (!UndefLower) {
16001 // XXXXuuuu: no insert is needed.
16002 // Always extract lowers when setting lower - these are all free subreg ops.
16003 if (NumUpperHalves == 0)
16004 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16005 UndefLower, DAG);
16006
16007 if (NumUpperHalves == 1) {
16008 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16009 if (Subtarget.hasAVX2()) {
16010 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16011 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16012 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16013 (!isSingleSHUFPSMask(HalfMask) ||
16014 Subtarget.hasFastVariableCrossLaneShuffle()))
16015 return SDValue();
16016 // If this is a unary shuffle (assume that the 2nd operand is
16017 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16018 // are better off extracting the upper half of 1 operand and using a
16019 // narrow shuffle.
16020 if (EltWidth == 64 && V2.isUndef())
16021 return SDValue();
16022 // If this is a unary vXi8 shuffle with in-place halves, then perform as
16023 // full width pshufb, and then merge.
16024 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16025 return SDValue();
16026 }
16027 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16028 if (Subtarget.hasAVX512() && VT.is512BitVector())
16029 return SDValue();
16030 // Extract + narrow shuffle is better than the wide alternative.
16031 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16032 UndefLower, DAG);
16033 }
16034
16035 // Don't extract both uppers, instead shuffle and then extract.
16036 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16037 return SDValue();
16038 }
16039
16040 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16041 if (NumUpperHalves == 0) {
16042 // AVX2 has efficient 64-bit element cross-lane shuffles.
16043 // TODO: Refine to account for unary shuffle, splat, and other masks?
16044 if (Subtarget.hasAVX2() && EltWidth == 64)
16045 return SDValue();
16046 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16047 if (Subtarget.hasAVX512() && VT.is512BitVector())
16048 return SDValue();
16049 // Narrow shuffle + insert is better than the wide alternative.
16050 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16051 UndefLower, DAG);
16052 }
16053
16054 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16055 return SDValue();
16056}
16057
16058/// Handle case where shuffle sources are coming from the same 128-bit lane and
16059/// every lane can be represented as the same repeating mask - allowing us to
16060/// shuffle the sources with the repeating shuffle and then permute the result
16061/// to the destination lanes.
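/// e.g. the single-input v8f32 mask <5,4,7,6,1,0,3,2> becomes the repeated
/// in-lane shuffle <1,0,3,2,5,4,7,6> followed by the lane permute
/// <4,5,6,7,0,1,2,3>.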
16062 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16063 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16064 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16065 int NumElts = VT.getVectorNumElements();
16066 int NumLanes = VT.getSizeInBits() / 128;
16067 int NumLaneElts = NumElts / NumLanes;
16068
16069 // On AVX2 we may be able to just shuffle the lowest elements and then
16070 // broadcast the result.
16071 if (Subtarget.hasAVX2()) {
16072 for (unsigned BroadcastSize : {16, 32, 64}) {
16073 if (BroadcastSize <= VT.getScalarSizeInBits())
16074 continue;
16075 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16076
16077 // Attempt to match a repeating pattern every NumBroadcastElts,
16078 // accounting for UNDEFs, that only references the lowest 128-bit
16079 // lane of the inputs.
16080 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16081 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16082 for (int j = 0; j != NumBroadcastElts; ++j) {
16083 int M = Mask[i + j];
16084 if (M < 0)
16085 continue;
16086 int &R = RepeatMask[j];
16087 if (0 != ((M % NumElts) / NumLaneElts))
16088 return false;
16089 if (0 <= R && R != M)
16090 return false;
16091 R = M;
16092 }
16093 return true;
16094 };
16095
16096 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16097 if (!FindRepeatingBroadcastMask(RepeatMask))
16098 continue;
16099
16100 // Shuffle the (lowest) repeated elements in place for broadcast.
16101 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16102
16103 // Shuffle the actual broadcast.
16104 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16105 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16106 for (int j = 0; j != NumBroadcastElts; ++j)
16107 BroadcastMask[i + j] = j;
16108
16109 // Avoid returning the same shuffle operation. For example,
16110 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16111 if (BroadcastMask == Mask)
16112 return SDValue();
16113
16114 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16115 BroadcastMask);
16116 }
16117 }
16118
16119 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16120 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16121 return SDValue();
16122
16123 // Bail if we already have a repeated lane shuffle mask.
16124 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16125 return SDValue();
16126
16127 // Helper to check for a repeated mask in each split sublane, and that those
16128 // sublanes can then be permuted into place.
16129 auto ShuffleSubLanes = [&](int SubLaneScale) {
16130 int NumSubLanes = NumLanes * SubLaneScale;
16131 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16132
16133 // Check that all the sources are coming from the same lane and see if we
16134 // can form a repeating shuffle mask (local to each sub-lane). At the same
16135 // time, determine the source sub-lane for each destination sub-lane.
16136 int TopSrcSubLane = -1;
16137 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16138 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16139 SubLaneScale,
16140 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16141
16142 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16143 // Extract the sub-lane mask, check that it all comes from the same lane
16144 // and normalize the mask entries to come from the first lane.
16145 int SrcLane = -1;
16146 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16147 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16148 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16149 if (M < 0)
16150 continue;
16151 int Lane = (M % NumElts) / NumLaneElts;
16152 if ((0 <= SrcLane) && (SrcLane != Lane))
16153 return SDValue();
16154 SrcLane = Lane;
16155 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16156 SubLaneMask[Elt] = LocalM;
16157 }
16158
16159 // Whole sub-lane is UNDEF.
16160 if (SrcLane < 0)
16161 continue;
16162
16163 // Attempt to match against the candidate repeated sub-lane masks.
16164 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16165 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16166 for (int i = 0; i != NumSubLaneElts; ++i) {
16167 if (M1[i] < 0 || M2[i] < 0)
16168 continue;
16169 if (M1[i] != M2[i])
16170 return false;
16171 }
16172 return true;
16173 };
16174
16175 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16176 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16177 continue;
16178
16179 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16180 for (int i = 0; i != NumSubLaneElts; ++i) {
16181 int M = SubLaneMask[i];
16182 if (M < 0)
16183 continue;
16184 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16185 "Unexpected mask element");
16186 RepeatedSubLaneMask[i] = M;
16187 }
16188
16189 // Track the top most source sub-lane - by setting the remaining to
16190 // UNDEF we can greatly simplify shuffle matching.
16191 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16192 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16193 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16194 break;
16195 }
16196
16197 // Bail if we failed to find a matching repeated sub-lane mask.
16198 if (Dst2SrcSubLanes[DstSubLane] < 0)
16199 return SDValue();
16200 }
16201 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16202 "Unexpected source lane");
16203
16204 // Create a repeating shuffle mask for the entire vector.
16205 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16206 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16207 int Lane = SubLane / SubLaneScale;
16208 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16209 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16210 int M = RepeatedSubLaneMask[Elt];
16211 if (M < 0)
16212 continue;
16213 int Idx = (SubLane * NumSubLaneElts) + Elt;
16214 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16215 }
16216 }
16217
16218 // Shuffle each source sub-lane to its destination.
16219 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16220 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16221 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16222 if (SrcSubLane < 0)
16223 continue;
16224 for (int j = 0; j != NumSubLaneElts; ++j)
16225 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16226 }
16227
16228 // Avoid returning the same shuffle operation.
16229 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16230 if (RepeatedMask == Mask || SubLaneMask == Mask)
16231 return SDValue();
16232
16233 SDValue RepeatedShuffle =
16234 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16235
16236 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16237 SubLaneMask);
16238 };
16239
16240 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16241 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16242 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16243 // Otherwise we can only permute whole 128-bit lanes.
16244 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16245 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16246 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16247 MinSubLaneScale = 2;
16248 MaxSubLaneScale =
16249 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16250 }
16251 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16252 MinSubLaneScale = MaxSubLaneScale = 4;
16253
16254 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16255 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16256 return Shuffle;
16257
16258 return SDValue();
16259}
16260
16261 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16262 bool &ForceV1Zero, bool &ForceV2Zero,
16263 unsigned &ShuffleImm, ArrayRef<int> Mask,
16264 const APInt &Zeroable) {
16265 int NumElts = VT.getVectorNumElements();
16266 assert(VT.getScalarSizeInBits() == 64 &&
16267 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16268 "Unexpected data type for VSHUFPD");
16269 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16270 "Illegal shuffle mask");
16271
16272 bool ZeroLane[2] = { true, true };
16273 for (int i = 0; i < NumElts; ++i)
16274 ZeroLane[i & 1] &= Zeroable[i];
16275
16276 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16277 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
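// e.g. the v4f64 mask <1, 5, 2, 7> fits this pattern and produces
// SHUFPDMask <1, 1, 0, 1>.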
16278 bool IsSHUFPD = true;
16279 bool IsCommutable = true;
16280 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16281 for (int i = 0; i < NumElts; ++i) {
16282 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16283 continue;
16284 if (Mask[i] < 0)
16285 return false;
16286 int Val = (i & 6) + NumElts * (i & 1);
16287 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16288 if (Mask[i] < Val || Mask[i] > Val + 1)
16289 IsSHUFPD = false;
16290 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16291 IsCommutable = false;
16292 SHUFPDMask[i] = Mask[i] % 2;
16293 }
16294
16295 if (!IsSHUFPD && !IsCommutable)
16296 return false;
16297
16298 if (!IsSHUFPD && IsCommutable)
16299 std::swap(V1, V2);
16300
16301 ForceV1Zero = ZeroLane[0];
16302 ForceV2Zero = ZeroLane[1];
16303 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16304 return true;
16305}
16306
16307 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16308 SDValue V2, ArrayRef<int> Mask,
16309 const APInt &Zeroable,
16310 const X86Subtarget &Subtarget,
16311 SelectionDAG &DAG) {
16312 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16313 "Unexpected data type for VSHUFPD");
16314
16315 unsigned Immediate = 0;
16316 bool ForceV1Zero = false, ForceV2Zero = false;
16317 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16318 Mask, Zeroable))
16319 return SDValue();
16320
16321 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16322 if (ForceV1Zero)
16323 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16324 if (ForceV2Zero)
16325 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16326
16327 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16328 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16329}
16330
16331 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16332 // by zeroable elements in the remaining 24 elements. Turn this into two
16333// vmovqb instructions shuffled together.
16334 static SDValue lowerShuffleWithVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16335 SDValue V1, SDValue V2,
16336 ArrayRef<int> Mask,
16337 const APInt &Zeroable,
16338 SelectionDAG &DAG) {
16339 assert(VT == MVT::v32i8 && "Unexpected type!");
16340
16341 // The first 8 indices should be every 8th element.
16342 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16343 return SDValue();
16344
16345 // Remaining elements need to be zeroable.
16346 if (Zeroable.countl_one() < (Mask.size() - 8))
16347 return SDValue();
16348
16349 V1 = DAG.getBitcast(MVT::v4i64, V1);
16350 V2 = DAG.getBitcast(MVT::v4i64, V2);
16351
16352 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16353 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16354
16355 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16356 // the upper bits of the result using an unpckldq.
16357 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16358 { 0, 1, 2, 3, 16, 17, 18, 19,
16359 4, 5, 6, 7, 20, 21, 22, 23 });
16360 // Insert the unpckldq into a zero vector to widen to v32i8.
16361 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16362 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16363 DAG.getVectorIdxConstant(0, DL));
16364}
16365
16366// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16367// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16368// =>
16369// ul = unpckl v1, v2
16370// uh = unpckh v1, v2
16371// a = vperm ul, uh
16372// b = vperm ul, uh
16373//
16374// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16375// and permute. We cannot directly match v3 because it is split into two
16376// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16377// pair of 256-bit shuffles and makes sure the masks are consecutive.
16378//
16379// Once unpck and permute nodes are created, the permute corresponding to this
16380// shuffle is returned, while the other permute replaces the other half of the
16381// shuffle in the selection dag.
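// For v8i32 the two matched halves are the masks <0,8,1,9,2,10,3,11> and
// <4,12,5,13,6,14,7,15>.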
16382 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16383 SDValue V1, SDValue V2,
16384 ArrayRef<int> Mask,
16385 SelectionDAG &DAG) {
16386 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16387 VT != MVT::v32i8)
16388 return SDValue();
16389 // <B0, B1, B0+1, B1+1, ..., >
16390 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16391 unsigned Begin1) {
16392 size_t Size = Mask.size();
16393 assert(Size % 2 == 0 && "Expected even mask size");
16394 for (unsigned I = 0; I < Size; I += 2) {
16395 if (Mask[I] != (int)(Begin0 + I / 2) ||
16396 Mask[I + 1] != (int)(Begin1 + I / 2))
16397 return false;
16398 }
16399 return true;
16400 };
16401 // Check which half of the interleave this shuffle node is
16402 int NumElts = VT.getVectorNumElements();
16403 size_t FirstQtr = NumElts / 2;
16404 size_t ThirdQtr = NumElts + NumElts / 2;
16405 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16406 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16407 if (!IsFirstHalf && !IsSecondHalf)
16408 return SDValue();
16409
16410 // Find the intersection between shuffle users of V1 and V2.
16411 SmallVector<SDNode *, 2> Shuffles;
16412 for (SDNode *User : V1->users())
16413 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16414 User->getOperand(1) == V2)
16415 Shuffles.push_back(User);
16416 // Limit user size to two for now.
16417 if (Shuffles.size() != 2)
16418 return SDValue();
16419 // Find out which half of the 512-bit shuffle each smaller shuffle is
16420 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16421 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16422 SDNode *FirstHalf;
16423 SDNode *SecondHalf;
16424 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16425 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16426 FirstHalf = Shuffles[0];
16427 SecondHalf = Shuffles[1];
16428 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16429 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16430 FirstHalf = Shuffles[1];
16431 SecondHalf = Shuffles[0];
16432 } else {
16433 return SDValue();
16434 }
16435 // Lower into unpck and perm. Return the perm of this shuffle and replace
16436 // the other.
16437 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16438 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16439 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16440 DAG.getTargetConstant(0x20, DL, MVT::i8));
16441 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16442 DAG.getTargetConstant(0x31, DL, MVT::i8));
16443 if (IsFirstHalf) {
16444 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16445 return Perm1;
16446 }
16447 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16448 return Perm2;
16449}
16450
16451/// Handle lowering of 4-lane 64-bit floating point shuffles.
16452///
16453/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16454/// isn't available.
16455 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16456 const APInt &Zeroable, SDValue V1, SDValue V2,
16457 const X86Subtarget &Subtarget,
16458 SelectionDAG &DAG) {
16459 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16461 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16462
16463 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG))
16465 return V;
16466
16467 if (V2.isUndef()) {
16468 // Check for being able to broadcast a single element.
16469 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16470 Mask, Subtarget, DAG))
16471 return Broadcast;
16472
16473 // Use low duplicate instructions for masks that match their pattern.
16474 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16475 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16476
16477 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16478 // Non-half-crossing single input shuffles can be lowered with an
16479 // interleaved permutation.
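// e.g. the mask <1, 0, 3, 2> yields VPERMILPMask = 0b0101.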
16480 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16481 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16482 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16483 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16484 }
16485
16486 // With AVX2 we have direct support for this permutation.
16487 if (Subtarget.hasAVX2())
16488 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16489 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16490
16491 // Try to create an in-lane repeating shuffle mask and then shuffle the
16492 // results into the target lanes.
16493 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16494 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16495 return V;
16496
16497 // Try to permute the lanes and then use a per-lane permute.
16498 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16499 Mask, DAG, Subtarget))
16500 return V;
16501
16502 // Otherwise, fall back.
16503 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16504 DAG, Subtarget);
16505 }
16506
16507 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16508 Zeroable, Subtarget, DAG))
16509 return Blend;
16510
16511 // Use dedicated unpack instructions for masks that match their pattern.
16512 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16513 return V;
16514
16515 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16516 Zeroable, Subtarget, DAG))
16517 return Op;
16518
16519 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16520 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16521 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16522 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16523
16524 // If we have lane crossing shuffles AND they don't all come from the lower
16525 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16526 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16527 // canonicalize to a blend of splat which isn't necessary for this combine.
16528 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16529 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16530 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16532 (!Subtarget.hasAVX2() ||
16533 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16534 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16535
16536 // If we have one input in place, then we can permute the other input and
16537 // blend the result.
16538 if (V1IsInPlace || V2IsInPlace)
16539 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16540 Zeroable, Subtarget, DAG);
16541
16542 // Try to create an in-lane repeating shuffle mask and then shuffle the
16543 // results into the target lanes.
16544 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16545 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16546 return V;
16547
16548 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16549 // shuffle. However, if we have AVX2 and either input is already in place,
16550 // we will be able to shuffle the other input even across lanes in a single
16551 // instruction, so skip this pattern.
16552 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16554 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16555 return V;
16556
16557 // If we have VLX support, we can use VEXPAND.
16558 if (Subtarget.hasVLX())
16559 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16560 Zeroable, Subtarget, DAG))
16561 return V;
16562
16563 // If we have AVX2 then we always want to lower with a blend because at v4 we
16564 // can fully permute the elements.
16565 if (Subtarget.hasAVX2())
16566 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16567 Zeroable, Subtarget, DAG);
16568
16569 // Otherwise fall back on generic lowering.
16570 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16571 Subtarget, DAG);
16572}
16573
16574/// Handle lowering of 4-lane 64-bit integer shuffles.
16575///
16576/// This routine is only called when we have AVX2 and thus a reasonable
16577 /// instruction set for v4i64 shuffling.
16578 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16579 const APInt &Zeroable, SDValue V1, SDValue V2,
16580 const X86Subtarget &Subtarget,
16581 SelectionDAG &DAG) {
16582 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16585 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16586
16587 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16588 Subtarget, DAG))
16589 return V;
16590
16591 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16592 Zeroable, Subtarget, DAG))
16593 return Blend;
16594
16595 // Check for being able to broadcast a single element.
16596 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16597 Subtarget, DAG))
16598 return Broadcast;
16599
16600 // Try to use shift instructions if fast.
16601 if (Subtarget.preferLowerShuffleAsShift())
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16604 Subtarget, DAG, /*BitwiseOnly*/ true))
16605 return Shift;
16606
16607 if (V2.isUndef()) {
16608 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16609 // can use lower latency instructions that will operate on both lanes.
16610 SmallVector<int, 2> RepeatedMask;
16611 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16612 SmallVector<int, 4> PSHUFDMask;
16613 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16614 return DAG.getBitcast(
16615 MVT::v4i64,
16616 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16617 DAG.getBitcast(MVT::v8i32, V1),
16618 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16619 }
16620
16621 // AVX2 provides a direct instruction for permuting a single input across
16622 // lanes.
16623 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16624 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16625 }
16626
16627 // Try to use shift instructions.
16628 if (SDValue Shift =
16629 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16630 DAG, /*BitwiseOnly*/ false))
16631 return Shift;
16632
16633 // If we have VLX support, we can use VALIGN or VEXPAND.
16634 if (Subtarget.hasVLX()) {
16635 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16636 Zeroable, Subtarget, DAG))
16637 return Rotate;
16638
16639 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16640 Zeroable, Subtarget, DAG))
16641 return V;
16642 }
16643
16644 // Try to use PALIGNR.
16645 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16646 Subtarget, DAG))
16647 return Rotate;
16648
16649 // Use dedicated unpack instructions for masks that match their pattern.
16650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16651 return V;
16652
16653 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16654 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16655
16656 // If we have one input in place, then we can permute the other input and
16657 // blend the result.
16658 if (V1IsInPlace || V2IsInPlace)
16659 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16660 Zeroable, Subtarget, DAG);
16661
16662 // Try to create an in-lane repeating shuffle mask and then shuffle the
16663 // results into the target lanes.
16664 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16665 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16666 return V;
16667
16668 // Try to lower to PERMQ(BLENDD(V1,V2)).
16669 if (SDValue V =
16670 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16671 return V;
16672
16673 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16674 // shuffle. However, if we have AVX2 and either input is already in place,
16675 // we will be able to shuffle the other input even across lanes in a single
16676 // instruction, so skip this pattern.
16677 if (!V1IsInPlace && !V2IsInPlace)
16678 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16679 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16680 return Result;
16681
16682 // Otherwise fall back on generic blend lowering.
16683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16684 Zeroable, Subtarget, DAG);
16685}
16686
16687/// Handle lowering of 8-lane 32-bit floating point shuffles.
16688///
16689/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16690/// isn't available.
16691 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16692 const APInt &Zeroable, SDValue V1, SDValue V2,
16693 const X86Subtarget &Subtarget,
16694 SelectionDAG &DAG) {
16695 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16697 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16698
16699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16700 Zeroable, Subtarget, DAG))
16701 return Blend;
16702
16703 // Check for being able to broadcast a single element.
16704 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16705 Subtarget, DAG))
16706 return Broadcast;
16707
16708 if (!Subtarget.hasAVX2()) {
16709 SmallVector<int> InLaneMask;
16710 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16711
16712 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16713 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16714 /*SimpleOnly*/ true))
16715 return R;
16716 }
16717 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16718 Zeroable, Subtarget, DAG))
16719 return DAG.getBitcast(MVT::v8f32, ZExt);
16720
16721 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16722 // options to efficiently lower the shuffle.
16723 SmallVector<int, 4> RepeatedMask;
16724 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16725 assert(RepeatedMask.size() == 4 &&
16726 "Repeated masks must be half the mask width!");
16727
16728 // Use even/odd duplicate instructions for masks that match their pattern.
16729 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16730 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16731 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16732 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16733
16734 if (V2.isUndef())
16735 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16736 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16737
16738 // Use dedicated unpack instructions for masks that match their pattern.
16739 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16740 return V;
16741
16742 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16743 // have already handled any direct blends.
16744 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16745 }
16746
16747 // Try to create an in-lane repeating shuffle mask and then shuffle the
16748 // results into the target lanes.
16749 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16750 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return V;
16752
16753 // If we have a single input shuffle with different shuffle patterns in the
16754 // two 128-bit lanes use the variable mask to VPERMILPS.
16755 if (V2.isUndef()) {
16756 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16757 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16758 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16759 }
16760 if (Subtarget.hasAVX2()) {
16761 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16762 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16763 }
16764 // Otherwise, fall back.
16765 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16766 DAG, Subtarget);
16767 }
16768
16769 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16770 // shuffle.
16771 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16772 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16773 return Result;
16774
16775 // If we have VLX support, we can use VEXPAND.
16776 if (Subtarget.hasVLX())
16777 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG))
16779 return V;
16780
16781 // Try to match an interleave of two v8f32s and lower them as unpck and
16782 // permutes using ymms. This needs to go before we try to split the vectors.
16783 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16784 if ((Subtarget.hasAVX2() ||
16787 !Subtarget.hasAVX512())
16788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16789 Mask, DAG))
16790 return V;
16791
16792 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16793 // split, since after the split we get more efficient code using vpunpcklwd
16794 // and vpunpckhwd instructions than with vblend.
16795 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16796 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16797 Subtarget, DAG);
16798
16799 // If we have AVX2 then we always want to lower with a blend because at v8 we
16800 // can fully permute the elements.
16801 if (Subtarget.hasAVX2())
16802 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG);
16804
16805 // Otherwise fall back on generic lowering.
16806 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16807 Subtarget, DAG);
16808}
16809
16810/// Handle lowering of 8-lane 32-bit integer shuffles.
16811///
16812/// This routine is only called when we have AVX2 and thus a reasonable
16813 /// instruction set for v8i32 shuffling.
16814 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16815 const APInt &Zeroable, SDValue V1, SDValue V2,
16816 const X86Subtarget &Subtarget,
16817 SelectionDAG &DAG) {
16818 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16820 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16821 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16822
16823 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16824
16825 // Whenever we can lower this as a zext, that instruction is strictly faster
16826 // than any alternative. It also allows us to fold memory operands into the
16827 // shuffle in many cases.
16828 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16829 Zeroable, Subtarget, DAG))
16830 return ZExt;
16831
16832 // Try to match an interleave of two v8i32s and lower them as unpck and
16833 // permutes using ymms. This needs to go before we try to split the vectors.
16834 if (!Subtarget.hasAVX512())
16835 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16836 Mask, DAG))
16837 return V;
16838
16839 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16840 // split, since after the split we get more efficient code than vblend by
16841 // using vpunpcklwd and vpunpckhwd instructions.
16842 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16843 !Subtarget.hasAVX512())
16844 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16845 Subtarget, DAG);
16846
16847 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16848 Zeroable, Subtarget, DAG))
16849 return Blend;
16850
16851 // Check for being able to broadcast a single element.
16852 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16853 Subtarget, DAG))
16854 return Broadcast;
16855
16856 // Try to use shift instructions if fast.
16857 if (Subtarget.preferLowerShuffleAsShift()) {
16858 if (SDValue Shift =
16859 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG, /*BitwiseOnly*/ true))
16861 return Shift;
16862 if (NumV2Elements == 0)
16863 if (SDValue Rotate =
16864 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16865 return Rotate;
16866 }
16867
16868 // If the shuffle mask is repeated in each 128-bit lane we can use more
16869 // efficient instructions that mirror the shuffles across the two 128-bit
16870 // lanes.
16871 SmallVector<int, 4> RepeatedMask;
16872 bool Is128BitLaneRepeatedShuffle =
16873 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16874 if (Is128BitLaneRepeatedShuffle) {
16875 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16876 if (V2.isUndef())
16877 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16878 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16879
16880 // Use dedicated unpack instructions for masks that match their pattern.
16881 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16882 return V;
16883 }
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16892 if (SDValue Rotate =
16893 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16894 return Rotate;
16895
16896 // If we have VLX support, we can use VALIGN or EXPAND.
16897 if (Subtarget.hasVLX()) {
16898 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16899 Zeroable, Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16903 Zeroable, Subtarget, DAG))
16904 return V;
16905 }
16906
16907 // Try to use byte rotation instructions.
16908 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16909 Subtarget, DAG))
16910 return Rotate;
16911
16912 // Try to create an in-lane repeating shuffle mask and then shuffle the
16913 // results into the target lanes.
16914 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16915 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16916 return V;
16917
16918 if (V2.isUndef()) {
16919 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16920 // because that should be faster than the variable permute alternatives.
16921 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16922 return V;
16923
16924 // If the shuffle patterns aren't repeated but it's a single input, directly
16925 // generate a cross-lane VPERMD instruction.
16926 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16927 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16928 }
16929
16930 // Assume that a single SHUFPS is faster than an alternative sequence of
16931 // multiple instructions (even if the CPU has a domain penalty).
16932 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16933 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16934 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16935 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16936 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16937 CastV1, CastV2, DAG);
16938 return DAG.getBitcast(MVT::v8i32, ShufPS);
16939 }
16940
16941 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16942 // shuffle.
16943 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16944 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16945 return Result;
16946
16947 // Otherwise fall back on generic blend lowering.
16948 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16949 Zeroable, Subtarget, DAG);
16950}
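// Worked example (illustrative annotation, not part of the upstream source):
// a v8i32 shuffle whose mask repeats across both 128-bit lanes, e.g.
// <1,0,3,2, 5,4,7,6>, yields RepeatedMask = <1,0,3,2>, which
// getV4X86ShuffleImm8ForMask packs as imm8 = 1 | (0 << 2) | (3 << 4) | (2 << 6)
// = 0xB1, so the whole shuffle should become a single VPSHUFD ymm, ymm, 0xB1.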
16951
16952/// Handle lowering of 16-lane 16-bit integer shuffles.
16953///
16954/// This routine is only called when we have AVX2 and thus a reasonable
16955 /// instruction set for v16i16 shuffling.
16956 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16957 const APInt &Zeroable, SDValue V1, SDValue V2,
16958 const X86Subtarget &Subtarget,
16959 SelectionDAG &DAG) {
16960 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16962 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16963 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16964
16965 // Whenever we can lower this as a zext, that instruction is strictly faster
16966 // than any alternative. It also allows us to fold memory operands into the
16967 // shuffle in many cases.
16968 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16969 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16970 return ZExt;
16971
16972 // Check for being able to broadcast a single element.
16973 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16974 Subtarget, DAG))
16975 return Broadcast;
16976
16977 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16978 Zeroable, Subtarget, DAG))
16979 return Blend;
16980
16981 // Use dedicated unpack instructions for masks that match their pattern.
16982 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16983 return V;
16984
16985 // Use dedicated pack instructions for masks that match their pattern.
16986 if (SDValue V =
16987 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 // Try to lower using a truncation.
16991 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16992 Subtarget, DAG))
16993 return V;
16994
16995 // Try to use shift instructions.
16996 if (SDValue Shift =
16997 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16998 Subtarget, DAG, /*BitwiseOnly*/ false))
16999 return Shift;
17000
17001 // Try to use byte rotation instructions.
17002 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17003 Subtarget, DAG))
17004 return Rotate;
17005
17006 // Try to create an in-lane repeating shuffle mask and then shuffle the
17007 // results into the target lanes.
17008 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17009 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17010 return V;
17011
17012 if (V2.isUndef()) {
17013 // Try to use bit rotation instructions.
17014 if (SDValue Rotate =
17015 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17016 return Rotate;
17017
17018 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17019 // because that should be faster than the variable permute alternatives.
17020 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17021 return V;
17022
17023 // There are no generalized cross-lane shuffle operations available on i16
17024 // element types.
17025 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17026 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17027 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17028 return V;
17029
17030 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17031 DAG, Subtarget);
17032 }
17033
17034 SmallVector<int, 8> RepeatedMask;
17035 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17036 // As this is a single-input shuffle, the repeated mask should be
17037 // a strictly valid v8i16 mask that we can pass through to the v8i16
17038 // lowering to handle even the v16 case.
17039 return lowerV8I16GeneralSingleInputShuffle(
17040 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17041 }
17042 }
17043
17044 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17045 Zeroable, Subtarget, DAG))
17046 return PSHUFB;
17047
17048 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17049 if (Subtarget.hasBWI())
17050 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17051
17052 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17053 // shuffle.
17054 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17055 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17056 return Result;
17057
17058 // Try to permute the lanes and then use a per-lane permute.
17059 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17060 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17061 return V;
17062
17063 // Try to match an interleave of two v16i16s and lower them as unpck and
17064 // permutes using ymms.
17065 if (!Subtarget.hasAVX512())
17066 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17067 Mask, DAG))
17068 return V;
17069
17070 // Otherwise fall back on generic lowering.
17071 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17072 Subtarget, DAG);
17073}
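// Worked example (illustrative annotation, not part of the upstream source):
// interleaving the low words of each 128-bit lane of two v16i16 inputs, i.e.
// mask <0,16,1,17,2,18,3,19, 8,24,9,25,10,26,11,27>, is exactly the pattern
// lowerShuffleWithUNPCK looks for and should be emitted as a single VPUNPCKLWD.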
17074
17075/// Handle lowering of 32-lane 8-bit integer shuffles.
17076///
17077/// This routine is only called when we have AVX2 and thus a reasonable
17078 /// instruction set for v32i8 shuffling.
17079 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17085 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17086 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17092 Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Check for being able to broadcast a single element.
17096 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17097 Subtarget, DAG))
17098 return Broadcast;
17099
17100 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17101 Zeroable, Subtarget, DAG))
17102 return Blend;
17103
17104 // Use dedicated unpack instructions for masks that match their pattern.
17105 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17106 return V;
17107
17108 // Use dedicated pack instructions for masks that match their pattern.
17109 if (SDValue V =
17110 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17111 return V;
17112
17113 // Try to lower using a truncation.
17114 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17115 Subtarget, DAG))
17116 return V;
17117
17118 // Try to use shift instructions.
17119 if (SDValue Shift =
17120 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17121 DAG, /*BitwiseOnly*/ false))
17122 return Shift;
17123
17124 // Try to use byte rotation instructions.
17125 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17126 Subtarget, DAG))
17127 return Rotate;
17128
17129 // Try to use bit rotation instructions.
17130 if (V2.isUndef())
17131 if (SDValue Rotate =
17132 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17133 return Rotate;
17134
17135 // Try to create an in-lane repeating shuffle mask and then shuffle the
17136 // results into the target lanes.
17137 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17138 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17139 return V;
17140
17141 // There are no generalized cross-lane shuffle operations available on i8
17142 // element types.
17143 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17144 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17145 // because that should be faster than the variable permute alternatives.
17146 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17147 return V;
17148
17149 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17150 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17151 return V;
17152
17153 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17154 DAG, Subtarget);
17155 }
17156
17157 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17158 Zeroable, Subtarget, DAG))
17159 return PSHUFB;
17160
17161 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17162 if (Subtarget.hasVBMI())
17163 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17164
17165 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17166 // shuffle.
17167 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17168 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17169 return Result;
17170
17171 // Try to permute the lanes and then use a per-lane permute.
17172 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17173 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17174 return V;
17175
17176 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17177 // by zeroable elements in the remaining 24 elements. Turn this into two
17178 // vmovqb instructions shuffled together.
17179 if (Subtarget.hasVLX())
17180 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17181 Mask, Zeroable, DAG))
17182 return V;
17183
17184 // Try to match an interleave of two v32i8s and lower them as unpck and
17185 // permutes using ymms.
17186 if (!Subtarget.hasAVX512())
17187 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17188 Mask, DAG))
17189 return V;
17190
17191 // Otherwise fall back on generic lowering.
17192 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17193 Subtarget, DAG);
17194}
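// Note (illustrative annotation, not part of the upstream source): any
// single-input v32i8 mask that never crosses a 128-bit lane boundary can be
// handled by the VPSHUFB path above, since VPSHUFB selects bytes independently
// within each 128-bit lane and a control byte with bit 7 set produces a zero
// byte, which is how Zeroable elements are honored.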
17195
17196/// High-level routine to lower various 256-bit x86 vector shuffles.
17197///
17198/// This routine either breaks down the specific type of a 256-bit x86 vector
17199/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17200/// together based on the available instructions.
17201 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17202 SDValue V1, SDValue V2, const APInt &Zeroable,
17203 const X86Subtarget &Subtarget,
17204 SelectionDAG &DAG) {
17205 // If we have a single input to the zero element, insert that into V1 if we
17206 // can do so cheaply.
17207 int NumElts = VT.getVectorNumElements();
17208 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17209
17210 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17211 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17212 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17213 return Insertion;
17214
17215 // Handle special cases where the lower or upper half is UNDEF.
17216 if (SDValue V =
17217 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17218 return V;
17219
17220 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17221 // can check for those subtargets here and avoid much of the subtarget
17222 // querying in the per-vector-type lowering routines. With AVX1 we have
17223 // essentially *zero* ability to manipulate a 256-bit vector with integer
17224 // types. Since we'll use floating point types there eventually, just
17225 // immediately cast everything to a float and operate entirely in that domain.
17226 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17227 int ElementBits = VT.getScalarSizeInBits();
17228 if (ElementBits < 32) {
17229 // No floating point type available, if we can't use the bit operations
17230 // for masking/blending then decompose into 128-bit vectors.
17231 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17232 Subtarget, DAG))
17233 return V;
17234 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17235 return V;
17236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17237 }
17238
17239 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17240 VT.getVectorNumElements());
17241 V1 = DAG.getBitcast(FpVT, V1);
17242 V2 = DAG.getBitcast(FpVT, V2);
17243 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17244 }
17245
17246 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17247 V1 = DAG.getBitcast(MVT::v16i16, V1);
17248 V2 = DAG.getBitcast(MVT::v16i16, V2);
17249 return DAG.getBitcast(VT,
17250 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17251 }
17252
17253 switch (VT.SimpleTy) {
17254 case MVT::v4f64:
17255 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v4i64:
17257 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v8f32:
17259 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v8i32:
17261 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v16i16:
17263 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v32i8:
17265 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266
17267 default:
17268 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17269 }
17270}
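// Worked example (illustrative annotation, not part of the upstream source):
// on an AVX1-only target a v8i32 shuffle such as <0,0,2,2,4,4,6,6> is bitcast
// to v8f32 by the code above and should ultimately be emitted as a single
// VMOVSLDUP in the floating-point domain instead of being split into two
// 128-bit integer shuffles.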
17271
17272 /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
17273 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17274 const APInt &Zeroable, SDValue V1, SDValue V2,
17275 const X86Subtarget &Subtarget,
17276 SelectionDAG &DAG) {
17277 assert(VT.getScalarSizeInBits() == 64 &&
17278 "Unexpected element type size for 128bit shuffle.");
17279
17280 // Handling a 256-bit vector requires VLX, and the lowerV2X128VectorShuffle()
17281 // routine is most probably a better solution for that case.
17282 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17283
17284 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17285 SmallVector<int, 4> Widened128Mask;
17286 if (!canWidenShuffleElements(Mask, Widened128Mask))
17287 return SDValue();
17288 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17289
17290 // Try to use an insert into a zero vector.
17291 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17292 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17293 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17294 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17295 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17296 DAG.getVectorIdxConstant(0, DL));
17297 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17298 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17299 DAG.getVectorIdxConstant(0, DL));
17300 }
17301
17302 // Check for patterns which can be matched with a single insert of a 256-bit
17303 // subvector.
17304 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17305 if (OnlyUsesV1 ||
17306 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17307 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17308 SDValue SubVec =
17309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17310 DAG.getVectorIdxConstant(0, DL));
17311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17312 DAG.getVectorIdxConstant(4, DL));
17313 }
17314
17315 // See if this is an insertion of the lower 128-bits of V2 into V1.
17316 bool IsInsert = true;
17317 int V2Index = -1;
17318 for (int i = 0; i < 4; ++i) {
17319 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17320 if (Widened128Mask[i] < 0)
17321 continue;
17322
17323 // Make sure all V1 subvectors are in place.
17324 if (Widened128Mask[i] < 4) {
17325 if (Widened128Mask[i] != i) {
17326 IsInsert = false;
17327 break;
17328 }
17329 } else {
17330 // Make sure we only have a single V2 index and that it is the lowest 128 bits.
17331 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17332 IsInsert = false;
17333 break;
17334 }
17335 V2Index = i;
17336 }
17337 }
17338 if (IsInsert && V2Index >= 0) {
17339 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17340 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17341 DAG.getVectorIdxConstant(0, DL));
17342 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17343 }
17344
17345 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17346 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17347 // widening where possible we at least ensure the lanes stay sequential to
17348 // help later combines.
17349 SmallVector<int, 2> Widened256Mask;
17350 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17351 Widened128Mask.clear();
17352 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17353 }
17354
17355 // Try to lower to vshuf64x2/vshuf32x4.
17356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17357 int PermMask[4] = {-1, -1, -1, -1};
17358 // Ensure elements came from the same Op.
17359 for (int i = 0; i < 4; ++i) {
17360 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17361 if (Widened128Mask[i] < 0)
17362 continue;
17363
17364 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17365 unsigned OpIndex = i / 2;
17366 if (Ops[OpIndex].isUndef())
17367 Ops[OpIndex] = Op;
17368 else if (Ops[OpIndex] != Op)
17369 return SDValue();
17370
17371 PermMask[i] = Widened128Mask[i] % 4;
17372 }
17373
17374 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17375 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17376}
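// Worked example (illustrative annotation, not part of the upstream source):
// a v8i64 mask <0,1,4,5, 8,9,12,13> widens to Widened128Mask = <0,2,4,6>; the
// loop above selects Ops = {V1, V2} and PermMask = {0,2,0,2}, so the shuffle
// should be emitted as VSHUFI64X2 zmm, V1, V2, 0x88 (0 | 2<<2 | 0<<4 | 2<<6).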
17377
17378/// Handle lowering of 8-lane 64-bit floating point shuffles.
17379 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17380 const APInt &Zeroable, SDValue V1, SDValue V2,
17381 const X86Subtarget &Subtarget,
17382 SelectionDAG &DAG) {
17383 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17385 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17386
17387 if (V2.isUndef()) {
17388 // Use low duplicate instructions for masks that match their pattern.
17389 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17390 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17391
17392 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17393 // Non-half-crossing single input shuffles can be lowered with an
17394 // interleaved permutation.
17395 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17396 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17397 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17398 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17399 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17400 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17401 }
17402
17403 SmallVector<int, 4> RepeatedMask;
17404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17405 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17406 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17407 }
17408
17409 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17410 V2, Subtarget, DAG))
17411 return Shuf128;
17412
17413 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17414 return Unpck;
17415
17416 // Check if the blend happens to exactly fit that of SHUFPD.
17417 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17418 Zeroable, Subtarget, DAG))
17419 return Op;
17420
17421 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17422 Subtarget, DAG))
17423 return V;
17424
17425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17426 Zeroable, Subtarget, DAG))
17427 return Blend;
17428
17429 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17430}
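// Worked example (illustrative annotation, not part of the upstream source):
// a unary v8f64 mask that swaps each adjacent pair without crossing 128-bit
// lanes, e.g. <1,0,3,2,5,4,7,6>, takes the interleaved-permutation path above
// with VPERMILPMask = 0x55 and should become VPERMILPD zmm, zmm, 0x55.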
17431
17432/// Handle lowering of 16-lane 32-bit floating point shuffles.
17433 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17434 const APInt &Zeroable, SDValue V1, SDValue V2,
17435 const X86Subtarget &Subtarget,
17436 SelectionDAG &DAG) {
17437 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17439 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17440
17441 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17442 // options to efficiently lower the shuffle.
17443 SmallVector<int, 4> RepeatedMask;
17444 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17445 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17446
17447 // Use even/odd duplicate instructions for masks that match their pattern.
17448 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17449 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17450 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17451 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17452
17453 if (V2.isUndef())
17454 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17455 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17456
17457 // Use dedicated unpack instructions for masks that match their pattern.
17458 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17459 return V;
17460
17461 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17462 Zeroable, Subtarget, DAG))
17463 return Blend;
17464
17465 // Otherwise, fall back to a SHUFPS sequence.
17466 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17467 }
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17474 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17475 return DAG.getBitcast(MVT::v16f32, ZExt);
17476
17477 // Try to create an in-lane repeating shuffle mask and then shuffle the
17478 // results into the target lanes.
17479 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17480 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17481 return V;
17482
17483 // If we have a single input shuffle with different shuffle patterns in the
17484 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17485 if (V2.isUndef() &&
17486 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17487 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17488 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17489 }
17490
17491 // If we have AVX512F support, we can use VEXPAND.
17492 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17493 Zeroable, Subtarget, DAG))
17494 return V;
17495
17496 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17497}
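// Worked example (illustrative annotation, not part of the upstream source):
// a v16f32 mask that repeats <1,1,3,3> in every 128-bit lane, i.e.
// <1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15>, hits the repeated-mask block above
// and should be lowered to a single VMOVSHDUP zmm.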
17498
17499/// Handle lowering of 8-lane 64-bit integer shuffles.
17500 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17506 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17507
17508 // Try to use shift instructions if fast.
17509 if (Subtarget.preferLowerShuffleAsShift())
17510 if (SDValue Shift =
17511 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17512 Subtarget, DAG, /*BitwiseOnly*/ true))
17513 return Shift;
17514
17515 if (V2.isUndef()) {
17516 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
17517 // can use lower-latency instructions that operate on all four
17518 // 128-bit lanes.
17519 SmallVector<int, 2> Repeated128Mask;
17520 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17521 SmallVector<int, 4> PSHUFDMask;
17522 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17523 return DAG.getBitcast(
17524 MVT::v8i64,
17525 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17526 DAG.getBitcast(MVT::v16i32, V1),
17527 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17528 }
17529
17530 SmallVector<int, 4> Repeated256Mask;
17531 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17532 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17533 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17534 }
17535
17536 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17537 V2, Subtarget, DAG))
17538 return Shuf128;
17539
17540 // Try to use shift instructions.
17541 if (SDValue Shift =
17542 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17543 DAG, /*BitwiseOnly*/ false))
17544 return Shift;
17545
17546 // Try to use VALIGN.
17547 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17548 Zeroable, Subtarget, DAG))
17549 return Rotate;
17550
17551 // Try to use PALIGNR.
17552 if (Subtarget.hasBWI())
17553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17554 Subtarget, DAG))
17555 return Rotate;
17556
17557 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17558 return Unpck;
17559
17560 // If we have AVX512F support, we can use VEXPAND.
17561 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564
17565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17566 Zeroable, Subtarget, DAG))
17567 return Blend;
17568
17569 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17570}
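// Worked example (illustrative annotation, not part of the upstream source):
// a v8i64 mask that reads consecutive elements across the concatenation of V1
// and V2, e.g. <1,2,3,4,5,6,7,8>, is neither a lane-repeated shuffle nor a
// shift, but should be matched by the VALIGN path above as a single VALIGNQ
// with a rotation count of 1.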
17571
17572/// Handle lowering of 16-lane 32-bit integer shuffles.
17573 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17574 const APInt &Zeroable, SDValue V1, SDValue V2,
17575 const X86Subtarget &Subtarget,
17576 SelectionDAG &DAG) {
17577 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17579 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17580
17581 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17582
17583 // Whenever we can lower this as a zext, that instruction is strictly faster
17584 // than any alternative. It also allows us to fold memory operands into the
17585 // shuffle in many cases.
17586 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17587 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17588 return ZExt;
17589
17590 // Try to use shift instructions if fast.
17591 if (Subtarget.preferLowerShuffleAsShift()) {
17592 if (SDValue Shift =
17593 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17594 Subtarget, DAG, /*BitwiseOnly*/ true))
17595 return Shift;
17596 if (NumV2Elements == 0)
17597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17598 Subtarget, DAG))
17599 return Rotate;
17600 }
17601
17602 // If the shuffle mask is repeated in each 128-bit lane we can use more
17603 // efficient instructions that mirror the shuffles across the four 128-bit
17604 // lanes.
17605 SmallVector<int, 4> RepeatedMask;
17606 bool Is128BitLaneRepeatedShuffle =
17607 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17608 if (Is128BitLaneRepeatedShuffle) {
17609 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17610 if (V2.isUndef())
17611 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17612 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17613
17614 // Use dedicated unpack instructions for masks that match their pattern.
17615 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17616 return V;
17617 }
17618
17619 // Try to use shift instructions.
17620 if (SDValue Shift =
17621 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17622 Subtarget, DAG, /*BitwiseOnly*/ false))
17623 return Shift;
17624
17625 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17626 if (SDValue Rotate =
17627 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17628 return Rotate;
17629
17630 // Try to use VALIGN.
17631 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17632 Zeroable, Subtarget, DAG))
17633 return Rotate;
17634
17635 // Try to use byte rotation instructions.
17636 if (Subtarget.hasBWI())
17637 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17638 Subtarget, DAG))
17639 return Rotate;
17640
17641 // Assume that a single SHUFPS is faster than using a permv shuffle.
17642 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17643 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17644 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17645 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17646 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17647 CastV1, CastV2, DAG);
17648 return DAG.getBitcast(MVT::v16i32, ShufPS);
17649 }
17650
17651 // Try to create an in-lane repeating shuffle mask and then shuffle the
17652 // results into the target lanes.
17653 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17654 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17655 return V;
17656
17657 // If we have AVX512F support, we can use VEXPAND.
17658 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17659 Zeroable, Subtarget, DAG))
17660 return V;
17661
17662 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17663 Zeroable, Subtarget, DAG))
17664 return Blend;
17665
17666 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17667}
17668
17669/// Handle lowering of 32-lane 16-bit integer shuffles.
17670 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17671 const APInt &Zeroable, SDValue V1, SDValue V2,
17672 const X86Subtarget &Subtarget,
17673 SelectionDAG &DAG) {
17674 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17676 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17677 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17678
17679 // Whenever we can lower this as a zext, that instruction is strictly faster
17680 // than any alternative. It also allows us to fold memory operands into the
17681 // shuffle in many cases.
17682 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17683 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17684 return ZExt;
17685
17686 // Use dedicated unpack instructions for masks that match their pattern.
17687 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17688 return V;
17689
17690 // Use dedicated pack instructions for masks that match their pattern.
17691 if (SDValue V =
17692 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17693 return V;
17694
17695 // Try to use shift instructions.
17696 if (SDValue Shift =
17697 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17698 Subtarget, DAG, /*BitwiseOnly*/ false))
17699 return Shift;
17700
17701 // Try to use byte rotation instructions.
17702 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17703 Subtarget, DAG))
17704 return Rotate;
17705
17706 if (V2.isUndef()) {
17707 // Try to use bit rotation instructions.
17708 if (SDValue Rotate =
17709 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17710 return Rotate;
17711
17712 SmallVector<int, 8> RepeatedMask;
17713 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17714 // As this is a single-input shuffle, the repeated mask should be
17715 // a strictly valid v8i16 mask that we can pass through to the v8i16
17716 // lowering to handle even the v32 case.
17717 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17718 RepeatedMask, Subtarget, DAG);
17719 }
17720 }
17721
17722 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17723 Zeroable, Subtarget, DAG))
17724 return Blend;
17725
17726 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17727 Zeroable, Subtarget, DAG))
17728 return PSHUFB;
17729
17730 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17731 // shuffle.
17732 if (!V2.isUndef())
17733 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17734 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17735 return Result;
17736
17737 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17738}
17739
17740/// Handle lowering of 64-lane 8-bit integer shuffles.
17741 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17742 const APInt &Zeroable, SDValue V1, SDValue V2,
17743 const X86Subtarget &Subtarget,
17744 SelectionDAG &DAG) {
17745 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17747 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17748 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17749
17750 // Whenever we can lower this as a zext, that instruction is strictly faster
17751 // than any alternative. It also allows us to fold memory operands into the
17752 // shuffle in many cases.
17753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17754 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17755 return ZExt;
17756
17757 // Use dedicated unpack instructions for masks that match their pattern.
17758 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17759 return V;
17760
17761 // Use dedicated pack instructions for masks that match their pattern.
17762 if (SDValue V =
17763 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17764 return V;
17765
17766 // Try to use shift instructions.
17767 if (SDValue Shift =
17768 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17769 DAG, /*BitwiseOnly*/ false))
17770 return Shift;
17771
17772 // Try to use byte rotation instructions.
17773 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17774 Subtarget, DAG))
17775 return Rotate;
17776
17777 // Try to use bit rotation instructions.
17778 if (V2.isUndef())
17779 if (SDValue Rotate =
17780 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17781 return Rotate;
17782
17783 // Lower as AND if possible.
17784 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17785 Zeroable, Subtarget, DAG))
17786 return Masked;
17787
17788 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17789 Zeroable, Subtarget, DAG))
17790 return PSHUFB;
17791
17792 // Try to create an in-lane repeating shuffle mask and then shuffle the
17793 // results into the target lanes.
17794 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17795 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17796 return V;
17797
17798 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17799 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17800 return Result;
17801
17802 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17803 Zeroable, Subtarget, DAG))
17804 return Blend;
17805
17806 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17807 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17808 // PALIGNR will be cheaper than the second PSHUFB+OR.
17809 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17810 Mask, Subtarget, DAG))
17811 return V;
17812
17813 // If we can't directly blend but can use PSHUFB, that will be better as it
17814 // can both shuffle and set up the inefficient blend.
17815 bool V1InUse, V2InUse;
17816 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17817 DAG, V1InUse, V2InUse);
17818 }
17819
17820 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17821 // shuffle.
17822 if (!V2.isUndef())
17823 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17824 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17825 return Result;
17826
17827 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17828 if (Subtarget.hasVBMI())
17829 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17830
17831 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17832}
17833
17834/// High-level routine to lower various 512-bit x86 vector shuffles.
17835///
17836/// This routine either breaks down the specific type of a 512-bit x86 vector
17837/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17838/// together based on the available instructions.
17839 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17840 MVT VT, SDValue V1, SDValue V2,
17841 const APInt &Zeroable,
17842 const X86Subtarget &Subtarget,
17843 SelectionDAG &DAG) {
17844 assert(Subtarget.hasAVX512() &&
17845 "Cannot lower 512-bit vectors w/o basic ISA!");
17846
17847 // If we have a single input to the zero element, insert that into V1 if we
17848 // can do so cheaply.
17849 int NumElts = Mask.size();
17850 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17851
17852 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17853 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17854 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17855 return Insertion;
17856
17857 // Handle special cases where the lower or upper half is UNDEF.
17858 if (SDValue V =
17859 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17860 return V;
17861
17862 // Check for being able to broadcast a single element.
17863 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17864 Subtarget, DAG))
17865 return Broadcast;
17866
17867 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17868 // Try using bit ops for masking and blending before falling back to
17869 // splitting.
17870 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17871 Subtarget, DAG))
17872 return V;
17873 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17874 return V;
17875
17876 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17877 }
17878
17879 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17880 if (!Subtarget.hasBWI())
17881 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17882 /*SimpleOnly*/ false);
17883
17884 V1 = DAG.getBitcast(MVT::v32i16, V1);
17885 V2 = DAG.getBitcast(MVT::v32i16, V2);
17886 return DAG.getBitcast(VT,
17887 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17888 }
17889
17890 // Dispatch to each element type for lowering. If we don't have support for
17891 // specific element type shuffles at 512 bits, immediately split them and
17892 // lower them. Each lowering routine of a given type is allowed to assume that
17893 // the requisite ISA extensions for that element type are available.
17894 switch (VT.SimpleTy) {
17895 case MVT::v8f64:
17896 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17897 case MVT::v16f32:
17898 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v8i64:
17900 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v16i32:
17902 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v32i16:
17904 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v64i8:
17906 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907
17908 default:
17909 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17910 }
17911}
17912
17913 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17914 MVT VT, SDValue V1, SDValue V2,
17915 const X86Subtarget &Subtarget,
17916 SelectionDAG &DAG) {
17917 // Shuffle should be unary.
17918 if (!V2.isUndef())
17919 return SDValue();
17920
17921 int ShiftAmt = -1;
17922 int NumElts = Mask.size();
17923 for (int i = 0; i != NumElts; ++i) {
17924 int M = Mask[i];
17925 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17926 "Unexpected mask index.");
17927 if (M < 0)
17928 continue;
17929
17930 // The first non-undef element determines our shift amount.
17931 if (ShiftAmt < 0) {
17932 ShiftAmt = M - i;
17933 // Need to be shifting right.
17934 if (ShiftAmt <= 0)
17935 return SDValue();
17936 }
17937 // All non-undef elements must shift by the same amount.
17938 if (ShiftAmt != M - i)
17939 return SDValue();
17940 }
17941 assert(ShiftAmt >= 0 && "All undef?");
17942
17943 // Great, we found a right shift.
17944 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17945 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17946 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17948 DAG.getVectorIdxConstant(0, DL));
17949}
17950
17951// Determine if this shuffle can be implemented with a KSHIFT instruction.
17952// Returns the shift amount if possible or -1 if not. This is a simplified
17953// version of matchShuffleAsShift.
17954static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17955 int MaskOffset, const APInt &Zeroable) {
17956 int Size = Mask.size();
17957
17958 auto CheckZeros = [&](int Shift, bool Left) {
17959 for (int j = 0; j < Shift; ++j)
17960 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17961 return false;
17962
17963 return true;
17964 };
17965
17966 auto MatchShift = [&](int Shift, bool Left) {
17967 unsigned Pos = Left ? Shift : 0;
17968 unsigned Low = Left ? 0 : Shift;
17969 unsigned Len = Size - Shift;
17970 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17971 };
17972
17973 for (int Shift = 1; Shift != Size; ++Shift)
17974 for (bool Left : {true, false})
17975 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17976 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17977 return Shift;
17978 }
17979
17980 return -1;
17981}
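// Worked example (illustrative annotation, not part of the upstream source):
// for a v8i1 mask <2,3,4,5,6,7,u,u> whose top two elements are zeroable
// (zero or undef), MatchShift succeeds with Shift = 2 and Left = false, so the
// shuffle should be emitted as a KSHIFTR of the (possibly widened) mask
// register by 2.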
17982
17983
17984// Lower vXi1 vector shuffles.
17985 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17986 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17987 // vector, shuffle it, and then truncate it back.
17988 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17989 MVT VT, SDValue V1, SDValue V2,
17990 const APInt &Zeroable,
17991 const X86Subtarget &Subtarget,
17992 SelectionDAG &DAG) {
17993 assert(Subtarget.hasAVX512() &&
17994 "Cannot lower 512-bit vectors w/o basic ISA!");
17995
17996 int NumElts = Mask.size();
17997 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17998
17999 // Try to recognize shuffles that are just padding a subvector with zeros.
18000 int SubvecElts = 0;
18001 int Src = -1;
18002 for (int i = 0; i != NumElts; ++i) {
18003 if (Mask[i] >= 0) {
18004 // Grab the source from the first valid mask. All subsequent elements need
18005 // to use this same source.
18006 if (Src < 0)
18007 Src = Mask[i] / NumElts;
18008 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18009 break;
18010 }
18011
18012 ++SubvecElts;
18013 }
18014 assert(SubvecElts != NumElts && "Identity shuffle?");
18015
18016 // Clip to a power of 2.
18017 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18018
18019 // Make sure the number of zeroable bits in the top at least covers the bits
18020 // not covered by the subvector.
18021 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18022 assert(Src >= 0 && "Expected a source!");
18023 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18024 SDValue Extract =
18025 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18026 DAG.getVectorIdxConstant(0, DL));
18027 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18028 DAG.getConstant(0, DL, VT), Extract,
18029 DAG.getVectorIdxConstant(0, DL));
18030 }
18031
18032 // Try a simple shift right with undef elements. Later we'll try with zeros.
18033 if (SDValue Shift =
18034 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18035 return Shift;
18036
18037 // Try to match KSHIFTs.
18038 unsigned Offset = 0;
18039 for (SDValue V : {V1, V2}) {
18040 unsigned Opcode;
18041 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18042 if (ShiftAmt >= 0) {
18043 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18044 MVT WideVT = Res.getSimpleValueType();
18045 // Widened right shifts need two shifts to ensure we shift in zeroes.
18046 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18047 int WideElts = WideVT.getVectorNumElements();
18048 // Shift left to put the original vector in the MSBs of the new size.
18049 Res =
18050 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18051 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18052 // Increase the shift amount to account for the left shift.
18053 ShiftAmt += WideElts - NumElts;
18054 }
18055
18056 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18057 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18059 DAG.getVectorIdxConstant(0, DL));
18060 }
18061 Offset += NumElts; // Increment for next iteration.
18062 }
18063
18064 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18065 // ops instead.
18066 // TODO: What other unary shuffles would benefit from this?
18067 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18068 SDValue Op0 = V1.getOperand(0);
18069 SDValue Op1 = V1.getOperand(1);
18070 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18071 EVT OpVT = Op0.getValueType();
18072 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18073 return DAG.getSetCC(
18074 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18075 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18076 }
18077
18078 MVT ExtVT;
18079 switch (VT.SimpleTy) {
18080 default:
18081 llvm_unreachable("Expected a vector of i1 elements");
18082 case MVT::v2i1:
18083 ExtVT = MVT::v2i64;
18084 break;
18085 case MVT::v4i1:
18086 ExtVT = MVT::v4i32;
18087 break;
18088 case MVT::v8i1:
18089 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18090 // shuffle.
18091 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18092 break;
18093 case MVT::v16i1:
18094 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18095 // 256-bit operation available.
18096 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18097 break;
18098 case MVT::v32i1:
18099 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18100 // 256-bit operation available.
18101 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18102 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18103 break;
18104 case MVT::v64i1:
18105 // Fall back to scalarization. FIXME: We can do better if the shuffle
18106 // can be partitioned cleanly.
18107 if (!Subtarget.useBWIRegs())
18108 return SDValue();
18109 ExtVT = MVT::v64i8;
18110 break;
18111 }
18112
18113 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18114 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18115
18116 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18117 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
18118 int NumElems = VT.getVectorNumElements();
18119 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18120 (Subtarget.hasDQI() && (NumElems < 32)))
18121 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18122 Shuffle, ISD::SETGT);
18123
18124 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18125}
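// Note (illustrative annotation, not part of the upstream source): a v16i1
// shuffle that none of the special cases above handle is sign-extended to
// v16i32 (or v16i16 when 512-bit ops are being avoided), shuffled in that
// wider type, and then converted back to a mask (via a signed compare against
// zero on targets with the required DQI/BWI support, or a truncate otherwise).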
18126
18127/// Helper function that returns true if the shuffle mask should be
18128/// commuted to improve canonicalization.
18129 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18130 int NumElements = Mask.size();
18131
18132 int NumV1Elements = 0, NumV2Elements = 0;
18133 for (int M : Mask)
18134 if (M < 0)
18135 continue;
18136 else if (M < NumElements)
18137 ++NumV1Elements;
18138 else
18139 ++NumV2Elements;
18140
18141 // Commute the shuffle as needed such that more elements come from V1 than
18142 // V2. This allows us to match the shuffle pattern strictly on how many
18143 // elements come from V1 without handling the symmetric cases.
18144 if (NumV2Elements > NumV1Elements)
18145 return true;
18146
18147 assert(NumV1Elements > 0 && "No V1 indices");
18148
18149 if (NumV2Elements == 0)
18150 return false;
18151
18152 // When the number of V1 and V2 elements is the same, try to minimize the
18153 // number of uses of V2 in the low half of the vector. When that is tied,
18154 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18155 // indices for V2. When those are equal, try to ensure that the number of odd
18156 // indices for V1 is lower than the number of odd indices for V2.
18157 if (NumV1Elements == NumV2Elements) {
18158 int LowV1Elements = 0, LowV2Elements = 0;
18159 for (int M : Mask.slice(0, NumElements / 2))
18160 if (M >= NumElements)
18161 ++LowV2Elements;
18162 else if (M >= 0)
18163 ++LowV1Elements;
18164 if (LowV2Elements > LowV1Elements)
18165 return true;
18166 if (LowV2Elements == LowV1Elements) {
18167 int SumV1Indices = 0, SumV2Indices = 0;
18168 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18169 if (Mask[i] >= NumElements)
18170 SumV2Indices += i;
18171 else if (Mask[i] >= 0)
18172 SumV1Indices += i;
18173 if (SumV2Indices < SumV1Indices)
18174 return true;
18175 if (SumV2Indices == SumV1Indices) {
18176 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18177 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18178 if (Mask[i] >= NumElements)
18179 NumV2OddIndices += i % 2;
18180 else if (Mask[i] >= 0)
18181 NumV1OddIndices += i % 2;
18182 if (NumV2OddIndices < NumV1OddIndices)
18183 return true;
18184 }
18185 }
18186 }
18187
18188 return false;
18189}
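// Worked example (illustrative annotation, not part of the upstream source):
// for a v8i32 mask <8,9,10,11,0,1,2,3>, V1 and V2 each contribute four
// elements, but the low half uses only V2, so this helper returns true; after
// commuting, the equivalent shuffle is <0,1,2,3,8,9,10,11> with the operands
// swapped.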
18190
18191 static bool canCombineAsMaskOperation(SDValue V,
18192 const X86Subtarget &Subtarget) {
18193 if (!Subtarget.hasAVX512())
18194 return false;
18195
18196 if (!V.getValueType().isSimple())
18197 return false;
18198
18199 MVT VT = V.getSimpleValueType().getScalarType();
18200 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18201 return false;
18202
18203 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18204 // are preferable to blendw/blendvb/masked-mov.
18205 if ((VT == MVT::i16 || VT == MVT::i8) &&
18206 V.getSimpleValueType().getSizeInBits() < 512)
18207 return false;
18208
18209 auto HasMaskOperation = [&](SDValue V) {
18210 // TODO: Currently we only check a limited set of opcodes. We could probably
18211 // extend this to all binary operations by checking TLI.isBinOp().
18212 switch (V->getOpcode()) {
18213 default:
18214 return false;
18215 case ISD::ADD:
18216 case ISD::SUB:
18217 case ISD::AND:
18218 case ISD::XOR:
18219 case ISD::OR:
18220 case ISD::SMAX:
18221 case ISD::SMIN:
18222 case ISD::UMAX:
18223 case ISD::UMIN:
18224 case ISD::ABS:
18225 case ISD::SHL:
18226 case ISD::SRL:
18227 case ISD::SRA:
18228 case ISD::MUL:
18229 break;
18230 }
18231 if (!V->hasOneUse())
18232 return false;
18233
18234 return true;
18235 };
18236
18237 if (HasMaskOperation(V))
18238 return true;
18239
18240 return false;
18241}
18242
18243// Forward declaration.
18244 static SDValue canonicalizeShuffleMaskWithHorizOp(
18245 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18246 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18247 const X86Subtarget &Subtarget);
18248
18249 /// Top-level lowering for x86 vector shuffles.
18250///
18251/// This handles decomposition, canonicalization, and lowering of all x86
18252/// vector shuffles. Most of the specific lowering strategies are encapsulated
18253/// above in helper routines. The canonicalization attempts to widen shuffles
18254/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18255/// s.t. only one of the two inputs needs to be tested, etc.
18256 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18257 SelectionDAG &DAG) {
18258 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18259 ArrayRef<int> OrigMask = SVOp->getMask();
18260 SDValue V1 = Op.getOperand(0);
18261 SDValue V2 = Op.getOperand(1);
18262 MVT VT = Op.getSimpleValueType();
18263 int NumElements = VT.getVectorNumElements();
18264 SDLoc DL(Op);
18265 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18266
18267 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18268 "Can't lower MMX shuffles");
18269
18270 bool V1IsUndef = V1.isUndef();
18271 bool V2IsUndef = V2.isUndef();
18272 if (V1IsUndef && V2IsUndef)
18273 return DAG.getUNDEF(VT);
18274
18275 // When we create a shuffle node we put the UNDEF node as the second operand,
18276 // but in some cases the first operand may be transformed to UNDEF.
18277 // In this case we should just commute the node.
18278 if (V1IsUndef)
18279 return DAG.getCommutedVectorShuffle(*SVOp);
18280
18281 // Check for non-undef masks pointing at an undef vector and make the masks
18282 // undef as well. This makes it easier to match the shuffle based solely on
18283 // the mask.
18284 if (V2IsUndef &&
18285 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18286 SmallVector<int, 8> NewMask(OrigMask);
18287 for (int &M : NewMask)
18288 if (M >= NumElements)
18289 M = -1;
18290 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18291 }
18292
18293 // Check for illegal shuffle mask element index values.
18294 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18295 (void)MaskUpperLimit;
18296 assert(llvm::all_of(OrigMask,
18297 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18298 "Out of bounds shuffle index");
18299
18300 // We actually see shuffles that are entirely re-arrangements of a set of
18301 // zero inputs. This mostly happens while decomposing complex shuffles into
18302 // simple ones. Directly lower these as a buildvector of zeros.
18303 APInt KnownUndef, KnownZero;
18304 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18305
18306 APInt Zeroable = KnownUndef | KnownZero;
18307 if (Zeroable.isAllOnes())
18308 return getZeroVector(VT, Subtarget, DAG, DL);
18309
18310 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18311
18312 // Try to collapse shuffles into using a vector type with fewer elements but
18313 // wider element types. We cap this to not form integers or floating point
18314 // elements wider than 64 bits. It does not seem beneficial to form i128
18315 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18316 SmallVector<int, 16> WidenedMask;
18317 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18318 !canCombineAsMaskOperation(V1, Subtarget) &&
18319 !canCombineAsMaskOperation(V2, Subtarget) &&
18320 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18321 // Shuffle mask widening should not interfere with a broadcast opportunity
18322 // by obfuscating the operands with bitcasts.
18323 // TODO: Avoid lowering directly from this top-level function: make this
18324 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18325 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18326 Subtarget, DAG))
18327 return Broadcast;
18328
18329 MVT NewEltVT = VT.isFloatingPoint()
18330 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18331 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18332 int NewNumElts = NumElements / 2;
18333 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18334 // Make sure that the new vector type is legal. For example, v2f64 isn't
18335 // legal on SSE1.
18336 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18337 if (V2IsZero) {
18338 // Modify the new Mask to take all zeros from the all-zero vector.
18339 // Choose indices that are blend-friendly.
18340 bool UsedZeroVector = false;
18341 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18342 "V2's non-undef elements are used?!");
18343 for (int i = 0; i != NewNumElts; ++i)
18344 if (WidenedMask[i] == SM_SentinelZero) {
18345 WidenedMask[i] = i + NewNumElts;
18346 UsedZeroVector = true;
18347 }
18348 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18349 // some elements to be undef.
18350 if (UsedZeroVector)
18351 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18352 }
18353 V1 = DAG.getBitcast(NewVT, V1);
18354 V2 = DAG.getBitcast(NewVT, V2);
18355 return DAG.getBitcast(
18356 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18357 }
18358 }
18359
18360 SmallVector<SDValue> Ops = {V1, V2};
18361 SmallVector<int> Mask(OrigMask);
18362
18363 // Canonicalize the shuffle with any horizontal ops inputs.
18364 // NOTE: This may update Ops and Mask.
18365 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18366 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18367 return DAG.getBitcast(VT, HOp);
18368
18369 V1 = DAG.getBitcast(VT, Ops[0]);
18370 V2 = DAG.getBitcast(VT, Ops[1]);
18371 assert(NumElements == (int)Mask.size() &&
18372 "canonicalizeShuffleMaskWithHorizOp "
18373 "shouldn't alter the shuffle mask size");
18374
18375 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18376 // These will be materialized uniformly anyway, so make splat matching easier.
18377 // TODO: Allow all int constants?
18378 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18379 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18380 BitVector Undefs;
18381 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18382 if (Undefs.any() &&
18383 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18384 isa<ConstantFPSDNode>(Splat))) {
18385 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18386 }
18387 }
18388 }
18389 return V;
18390 };
18391 V1 = CanonicalizeConstant(V1);
18392 V2 = CanonicalizeConstant(V2);
18393
18394 // Commute the shuffle if it will improve canonicalization.
18395 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18396 ShuffleVectorSDNode::commuteMask(Mask);
18397 std::swap(V1, V2);
18398 }
18399
18400 // For each vector width, delegate to a specialized lowering routine.
18401 if (VT.is128BitVector())
18402 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18403
18404 if (VT.is256BitVector())
18405 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18406
18407 if (VT.is512BitVector())
18408 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18409
18410 if (Is1BitVector)
18411 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18412
18413 llvm_unreachable("Unimplemented!");
18414}
18415
18416// As legal vpcompress instructions depend on various AVX512 extensions, try to
18417// convert illegal vector sizes to legal ones to avoid expansion.
18418 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18419 SelectionDAG &DAG) {
18420 assert(Subtarget.hasAVX512() &&
18421 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18422
18423 SDLoc DL(Op);
18424 SDValue Vec = Op.getOperand(0);
18425 SDValue Mask = Op.getOperand(1);
18426 SDValue Passthru = Op.getOperand(2);
18427
18428 EVT VecVT = Vec.getValueType();
18429 EVT ElementVT = VecVT.getVectorElementType();
18430 unsigned NumElements = VecVT.getVectorNumElements();
18431 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18432 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18433
18434 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18435 // compressed as 512-bit vectors in AVX512F.
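// For example, a v4i32 VECTOR_COMPRESS is widened to a v16i32 one: the vector
// (and passthru, if any) are widened, the mask is widened to v16i1 with the
// new bits zeroed, the legal 512-bit compress is emitted, and the low v4i32
// subvector is extracted from the result.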
18436 if (NumVecBits != 128 && NumVecBits != 256)
18437 return SDValue();
18438
18439 if (NumElementBits == 32 || NumElementBits == 64) {
18440 unsigned NumLargeElements = 512 / NumElementBits;
18441 MVT LargeVecVT =
18442 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18443 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18444
18445 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18446 DAG, DL);
18447 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18448 Subtarget, DAG, DL);
18449 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18450 : widenSubVector(LargeVecVT, Passthru,
18451 /*ZeroNewElements=*/false,
18452 Subtarget, DAG, DL);
18453
18454 SDValue Compressed =
18455 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18456 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18457 DAG.getConstant(0, DL, MVT::i64));
18458 }
18459
18460 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18461 VecVT == MVT::v16i16) {
18462 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18463 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18464
18465 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18466 Passthru = Passthru.isUndef()
18467 ? DAG.getUNDEF(LargeVecVT)
18468 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18469
18470 SDValue Compressed =
18471 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18472 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18473 }
18474
18475 return SDValue();
18476}
18477
18478/// Try to lower a VSELECT instruction to a vector shuffle.
18479 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18480 const X86Subtarget &Subtarget,
18481 SelectionDAG &DAG) {
18482 SDValue Cond = Op.getOperand(0);
18483 SDValue LHS = Op.getOperand(1);
18484 SDValue RHS = Op.getOperand(2);
18485 MVT VT = Op.getSimpleValueType();
18486
18487 // Only non-legal VSELECTs reach this lowering; convert those into generic
18488 // shuffles and re-use the shuffle lowering path for blends.
18489 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18490 SmallVector<int, 32> Mask;
18491 if (createShuffleMaskFromVSELECT(Mask, Cond))
18492 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18493 }
18494
18495 return SDValue();
18496}
18497
18498SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18499 SDValue Cond = Op.getOperand(0);
18500 SDValue LHS = Op.getOperand(1);
18501 SDValue RHS = Op.getOperand(2);
18502
18503 SDLoc dl(Op);
18504 MVT VT = Op.getSimpleValueType();
18505 if (isSoftF16(VT, Subtarget)) {
18506 MVT NVT = VT.changeVectorElementTypeToInteger();
18507 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18508 DAG.getBitcast(NVT, LHS),
18509 DAG.getBitcast(NVT, RHS)));
18510 }
18511
18512 // A vselect where all conditions and data are constants can be optimized into
18513 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18514 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18515 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18516 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18517 return SDValue();
18518
18519 // Try to lower this to a blend-style vector shuffle. This can handle all
18520 // constant condition cases.
18521 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18522 return BlendOp;
18523
18524 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18525 // with patterns on the mask registers on AVX-512.
18526 MVT CondVT = Cond.getSimpleValueType();
18527 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18528 if (CondEltSize == 1)
18529 return Op;
18530
18531 // Variable blends are only legal from SSE4.1 onward.
18532 if (!Subtarget.hasSSE41())
18533 return SDValue();
18534
18535 unsigned EltSize = VT.getScalarSizeInBits();
18536 unsigned NumElts = VT.getVectorNumElements();
18537
18538 // Expand v32i16/v64i8 without BWI.
18539 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18540 return SDValue();
18541
18542 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18543 // into an i1 condition so that we can use the mask-based 512-bit blend
18544 // instructions.
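// For example, a v16i32 vselect with a v16i32 condition is rebuilt as a
// v16i1 mask from (setne Cond, 0) feeding a new select, which then matches
// the AVX-512 masked-move patterns.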
18545 if (VT.getSizeInBits() == 512) {
18546 // Build a mask by testing the condition against zero.
18547 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18548 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18549 DAG.getConstant(0, dl, CondVT),
18550 ISD::SETNE);
18551 // Now return a new VSELECT using the mask.
18552 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18553 }
18554
18555 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18556 if (CondEltSize != EltSize) {
18557 // If we don't have a sign splat, rely on the expansion.
18558 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18559 return SDValue();
18560
18561 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18562 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18563 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18564 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18565 }
18566
18567 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18568 // are free to split, then it is better to split before expanding the
18569 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18570 // TODO: This is very similar to narrowVectorSelect.
18571 // TODO: Add Load splitting to isFreeToSplitVector ?
18572 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18573 !Subtarget.hasXOP()) {
18574 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18575 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18576 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18577 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18578 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18579 if (FreeCond && (FreeLHS || FreeRHS))
18580 return splitVectorOp(Op, DAG, dl);
18581 }
18582
18583 // Only some types will be legal on some subtargets. If we can emit a legal
18584 // VSELECT-matching blend, return Op, but if we need to expand, return
18585 // a null value.
18586 switch (VT.SimpleTy) {
18587 default:
18588 // Most of the vector types have blends past SSE4.1.
18589 return Op;
18590
18591 case MVT::v32i8:
18592 // The byte blends for AVX vectors were introduced only in AVX2.
18593 if (Subtarget.hasAVX2())
18594 return Op;
18595
18596 return SDValue();
18597
18598 case MVT::v8i16:
18599 case MVT::v16i16:
18600 case MVT::v8f16:
18601 case MVT::v16f16: {
18602 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18603 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18604 Cond = DAG.getBitcast(CastVT, Cond);
18605 LHS = DAG.getBitcast(CastVT, LHS);
18606 RHS = DAG.getBitcast(CastVT, RHS);
18607 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18608 return DAG.getBitcast(VT, Select);
18609 }
18610 }
18611}
18612
18613 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18614 MVT VT = Op.getSimpleValueType();
18615 SDValue Vec = Op.getOperand(0);
18616 SDValue Idx = Op.getOperand(1);
18617 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18618 SDLoc dl(Op);
18619
18620 if (!Vec.getSimpleValueType().is128BitVector())
18621 return SDValue();
18622
18623 if (VT.getSizeInBits() == 8) {
18624 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18625 // we're going to zero extend the register or fold the store.
18626 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18627 !X86::mayFoldIntoStore(Op))
18628 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18629 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18631
18632 unsigned IdxVal = Idx->getAsZExtVal();
18633 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18634 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18636 }
18637
18638 if (VT == MVT::f32) {
18639 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18640 // the result back to FR32 register. It's only worth matching if the
18641 // result has a single use which is a store or a bitcast to i32. And in
18642 // the case of a store, it's not worth it if the index is a constant 0,
18643 // because a MOVSSmr can be used instead, which is smaller and faster.
18644 if (!Op.hasOneUse())
18645 return SDValue();
18646 SDNode *User = *Op.getNode()->user_begin();
18647 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18648 (User->getOpcode() != ISD::BITCAST ||
18649 User->getValueType(0) != MVT::i32))
18650 return SDValue();
18651 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18652 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18653 return DAG.getBitcast(MVT::f32, Extract);
18654 }
18655
18656 if (VT == MVT::i32 || VT == MVT::i64)
18657 return Op;
18658
18659 return SDValue();
18660}
18661
18662/// Extract one bit from mask vector, like v16i1 or v8i1.
18663/// AVX-512 feature.
18664 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18665 const X86Subtarget &Subtarget) {
18666 SDValue Vec = Op.getOperand(0);
18667 SDLoc dl(Vec);
18668 MVT VecVT = Vec.getSimpleValueType();
18669 SDValue Idx = Op.getOperand(1);
18670 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18671 MVT EltVT = Op.getSimpleValueType();
18672
18673 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18674 "Unexpected vector type in ExtractBitFromMaskVector");
18675
18676 // A variable index can't be handled in mask registers;
18677 // extend the vector to VR512/VR128.
18678 if (!IdxC) {
18679 unsigned NumElts = VecVT.getVectorNumElements();
18680 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18681 // than extending to 128/256-bit.
18682 if (NumElts == 1) {
18683 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18684 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18685 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18686 }
18687 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18688 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18689 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18690 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18691 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18692 }
18693
18694 unsigned IdxVal = IdxC->getZExtValue();
18695 if (IdxVal == 0) // the operation is legal
18696 return Op;
18697
18698 // Extend to natively supported kshift.
18699 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18700
18701 // Use kshiftr instruction to move to the lower element.
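// For example, extracting bit 3 of a v16i1 mask shifts the (possibly widened)
// mask right by 3 with KSHIFTR and then extracts element 0, which is legal.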
18702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18703 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18704
18705 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18706 DAG.getVectorIdxConstant(0, dl));
18707}
18708
18709 // Helper to find all the extracted elements from a vector.
18710 static APInt getExtractedDemandedElts(SDNode *N) {
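// For example, if a v16i8 node is only used by PEXTRB nodes with constant
// indices 1 and 3, only bits 1 and 3 of the returned mask are set; any other
// kind of user conservatively demands all elements.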
18711 MVT VT = N->getSimpleValueType(0);
18712 unsigned NumElts = VT.getVectorNumElements();
18713 APInt DemandedElts = APInt::getZero(NumElts);
18714 for (SDNode *User : N->users()) {
18715 switch (User->getOpcode()) {
18716 case X86ISD::PEXTRB:
18717 case X86ISD::PEXTRW:
18718 case ISD::EXTRACT_VECTOR_ELT:
18719 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18720 DemandedElts.setAllBits();
18721 return DemandedElts;
18722 }
18723 DemandedElts.setBit(User->getConstantOperandVal(1));
18724 break;
18725 case ISD::BITCAST: {
18726 if (!User->getValueType(0).isSimple() ||
18727 !User->getValueType(0).isVector()) {
18728 DemandedElts.setAllBits();
18729 return DemandedElts;
18730 }
18731 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18732 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18733 break;
18734 }
18735 default:
18736 DemandedElts.setAllBits();
18737 return DemandedElts;
18738 }
18739 }
18740 return DemandedElts;
18741}
18742
18743SDValue
18744X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18745 SelectionDAG &DAG) const {
18746 SDLoc dl(Op);
18747 SDValue Vec = Op.getOperand(0);
18748 MVT VecVT = Vec.getSimpleValueType();
18749 SDValue Idx = Op.getOperand(1);
18750 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18751
18752 if (VecVT.getVectorElementType() == MVT::i1)
18753 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18754
18755 if (!IdxC) {
18756 // It's more profitable to go through memory (1 cycle throughput)
18757 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18758 // The IACA tool was used to get the performance estimate
18759 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18760 //
18761 // example : extractelement <16 x i8> %a, i32 %i
18762 //
18763 // Block Throughput: 3.00 Cycles
18764 // Throughput Bottleneck: Port5
18765 //
18766 // | Num Of | Ports pressure in cycles | |
18767 // | Uops | 0 - DV | 5 | 6 | 7 | |
18768 // ---------------------------------------------
18769 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18770 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18771 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18772 // Total Num Of Uops: 4
18773 //
18774 //
18775 // Block Throughput: 1.00 Cycles
18776 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18777 //
18778 // | | Ports pressure in cycles | |
18779 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18780 // ---------------------------------------------------------
18781 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18782 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18783 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18784 // Total Num Of Uops: 4
18785
18786 return SDValue();
18787 }
18788
18789 unsigned IdxVal = IdxC->getZExtValue();
18790
18791 // If this is a 256-bit vector result, first extract the 128-bit vector and
18792 // then extract the element from the 128-bit vector.
18793 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18794 // Get the 128-bit vector.
18795 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18796 MVT EltVT = VecVT.getVectorElementType();
18797
18798 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18799 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18800
18801 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18802 // this can be done with a mask.
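// For example, when extracting element 5 of a v8i32: ElemsPerChunk == 4, the
// upper 128-bit chunk (elements 4-7) was extracted above, and 5 & 3 == 1
// selects element 1 within it.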
18803 IdxVal &= ElemsPerChunk - 1;
18804 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18805 DAG.getVectorIdxConstant(IdxVal, dl));
18806 }
18807
18808 assert(VecVT.is128BitVector() && "Unexpected vector length");
18809
18810 MVT VT = Op.getSimpleValueType();
18811
18812 if (VT == MVT::i16) {
18813 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18814 // we're going to zero extend the register or fold the store (SSE41 only).
18815 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18816 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18817 if (Subtarget.hasFP16())
18818 return Op;
18819
18820 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18821 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18822 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18823 }
18824
18825 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18826 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18827 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18828 }
18829
18830 if (Subtarget.hasSSE41())
18831 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18832 return Res;
18833
18834 // Only extract a single element from a v16i8 source - determine the common
18835 // DWORD/WORD that all extractions share, and extract the sub-byte.
18836 // TODO: Add QWORD MOVQ extraction?
18837 if (VT == MVT::i8) {
18838 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18839 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18840
18841 // Extract either the lowest i32 or any i16, and extract the sub-byte.
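// For example, for IdxVal == 5 with all users reading only bytes 4-5, the
// WORD path below extracts word 2 of the v8i16 bitcast, shifts right by
// (5 % 2) * 8 == 8 bits, and truncates to i8.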
18842 int DWordIdx = IdxVal / 4;
18843 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18845 DAG.getBitcast(MVT::v4i32, Vec),
18846 DAG.getVectorIdxConstant(DWordIdx, dl));
18847 int ShiftVal = (IdxVal % 4) * 8;
18848 if (ShiftVal != 0)
18849 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18850 DAG.getConstant(ShiftVal, dl, MVT::i8));
18851 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18852 }
18853
18854 int WordIdx = IdxVal / 2;
18855 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18856 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18857 DAG.getBitcast(MVT::v8i16, Vec),
18858 DAG.getVectorIdxConstant(WordIdx, dl));
18859 int ShiftVal = (IdxVal % 2) * 8;
18860 if (ShiftVal != 0)
18861 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18862 DAG.getConstant(ShiftVal, dl, MVT::i8));
18863 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18864 }
18865 }
18866
18867 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18868 if (IdxVal == 0)
18869 return Op;
18870
18871 // Shuffle the element to the lowest element, then movss or movsh.
18872 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18873 Mask[0] = static_cast<int>(IdxVal);
18874 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18876 DAG.getVectorIdxConstant(0, dl));
18877 }
18878
18879 if (VT.getSizeInBits() == 64) {
18880 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18881 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18882 // to match extract_elt for f64.
18883 if (IdxVal == 0)
18884 return Op;
18885
18886 // UNPCKHPD the element to the lowest double word, then movsd.
18887 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18888 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18889 int Mask[2] = { 1, -1 };
18890 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18892 DAG.getVectorIdxConstant(0, dl));
18893 }
18894
18895 return SDValue();
18896}
18897
18898/// Insert one bit to mask vector, like v16i1 or v8i1.
18899/// AVX-512 feature.
18900 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18901 const X86Subtarget &Subtarget) {
18902 SDLoc dl(Op);
18903 SDValue Vec = Op.getOperand(0);
18904 SDValue Elt = Op.getOperand(1);
18905 SDValue Idx = Op.getOperand(2);
18906 MVT VecVT = Vec.getSimpleValueType();
18907
18908 if (!isa<ConstantSDNode>(Idx)) {
18909 // Non-constant index. Extend the source and destination,
18910 // insert element and then truncate the result.
18911 unsigned NumElts = VecVT.getVectorNumElements();
18912 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18913 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18914 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18916 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18917 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18918 }
18919
18920 // Copy into a k-register, extract to v1i1 and insert_subvector.
18921 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18922 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18923}
18924
18925SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18926 SelectionDAG &DAG) const {
18927 MVT VT = Op.getSimpleValueType();
18928 MVT EltVT = VT.getVectorElementType();
18929 unsigned NumElts = VT.getVectorNumElements();
18930 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18931
18932 if (EltVT == MVT::i1)
18933 return InsertBitToMaskVector(Op, DAG, Subtarget);
18934
18935 SDLoc dl(Op);
18936 SDValue N0 = Op.getOperand(0);
18937 SDValue N1 = Op.getOperand(1);
18938 SDValue N2 = Op.getOperand(2);
18939 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18940
18941 if (EltVT == MVT::bf16) {
18942 MVT IVT = VT.changeVectorElementTypeToInteger();
18943 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18944 DAG.getBitcast(IVT, N0),
18945 DAG.getBitcast(MVT::i16, N1), N2);
18946 return DAG.getBitcast(VT, Res);
18947 }
18948
18949 if (!N2C) {
18950 // For variable insertion indices we're usually better off spilling to stack,
18951 // but AVX512 can use a variable compare+select by comparing against all
18952 // possible vector indices, and FP insertion has less gpr->simd traffic.
18953 if (!(Subtarget.hasBWI() ||
18954 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18955 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18956 return SDValue();
18957
18958 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18959 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18960 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18961 return SDValue();
18962
18963 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18964 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18965 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18966
18967 SmallVector<SDValue, 16> RawIndices;
18968 for (unsigned I = 0; I != NumElts; ++I)
18969 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18970 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18971
18972 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
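// For example, for v4i32 this splats the index, compares it against
// <0,1,2,3> with SETEQ, and selects the splatted element in the single
// matching lane while keeping N0 in all other lanes.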
18973 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18974 ISD::CondCode::SETEQ);
18975 }
18976
18977 if (N2C->getAPIntValue().uge(NumElts))
18978 return SDValue();
18979 uint64_t IdxVal = N2C->getZExtValue();
18980
18981 bool IsZeroElt = X86::isZeroNode(N1);
18982 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18983
18984 if (IsZeroElt || IsAllOnesElt) {
18985 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18986 // We don't deal with i8 0 since it appears to be handled elsewhere.
18987 if (IsAllOnesElt &&
18988 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18989 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18990 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18991 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18992 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18993 CstVectorElts[IdxVal] = OnesCst;
18994 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18995 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18996 }
18997 // See if we can do this more efficiently with a blend shuffle with a
18998 // rematerializable vector.
18999 if (Subtarget.hasSSE41() &&
19000 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19001 SmallVector<int, 8> BlendMask;
19002 for (unsigned i = 0; i != NumElts; ++i)
19003 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19004 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19005 : getOnesVector(VT, DAG, dl);
19006 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19007 }
19008 }
19009
19010 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19011 // into that, and then insert the subvector back into the result.
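// For example, inserting into element 5 of a v8i32 (when the broadcast+blend
// path below is not taken) extracts the upper 128-bit chunk, inserts at
// index 5 & 3 == 1 within it, and re-inserts the chunk.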
19012 if (VT.is256BitVector() || VT.is512BitVector()) {
19013 // With a 256-bit vector, we can insert into the zero element efficiently
19014 // using a blend if we have AVX or AVX2 and the right data type.
19015 if (VT.is256BitVector() && IdxVal == 0) {
19016 // TODO: It is worthwhile to cast integer to floating point and back
19017 // and incur a domain crossing penalty if that's what we'll end up
19018 // doing anyway after extracting to a 128-bit vector.
19019 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19020 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19021 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19022 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19023 DAG.getTargetConstant(1, dl, MVT::i8));
19024 }
19025 }
19026
19027 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19028 assert(isPowerOf2_32(NumEltsIn128) &&
19029 "Vectors will always have power-of-two number of elements.");
19030
19031 // If we are not inserting into the low 128-bit vector chunk,
19032 // then prefer the broadcast+blend sequence.
19033 // FIXME: relax the profitability check iff all N1 uses are insertions.
19034 if (IdxVal >= NumEltsIn128 &&
19035 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19036 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19037 X86::mayFoldLoad(N1, Subtarget)))) {
19038 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19039 SmallVector<int, 8> BlendMask;
19040 for (unsigned i = 0; i != NumElts; ++i)
19041 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19042 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19043 }
19044
19045 // Get the desired 128-bit vector chunk.
19046 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19047
19048 // Insert the element into the desired chunk.
19049 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19050 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19051
19052 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19053 DAG.getVectorIdxConstant(IdxIn128, dl));
19054
19055 // Insert the changed part back into the bigger vector
19056 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19057 }
19058 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19059
19060 // This will be just movw/movd/movq/movsh/movss/movsd.
19061 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19062 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19063 EltVT == MVT::f16 || EltVT == MVT::i64) {
19064 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19065 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19066 }
19067
19068 // We can't directly insert an i8 or i16 into a vector, so zero extend
19069 // it to i32 first.
19070 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19071 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19072 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19073 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19074 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19075 return DAG.getBitcast(VT, N1);
19076 }
19077 }
19078
19079 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
19080 // argument. SSE41 required for pinsrb.
19081 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19082 unsigned Opc;
19083 if (VT == MVT::v8i16) {
19084 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19085 Opc = X86ISD::PINSRW;
19086 } else {
19087 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19088 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19089 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19090 Opc = X86ISD::PINSRB;
19090 }
19091
19092 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19093 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19094 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19095 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19096 }
19097
19098 if (Subtarget.hasSSE41()) {
19099 if (EltVT == MVT::f32) {
19100 // Bits [7:6] of the constant are the source select. This will always be
19101 // zero here. The DAG Combiner may combine an extract_elt index into
19102 // these bits. For example (insert (extract, 3), 2) could be matched by
19103 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19104 // Bits [5:4] of the constant are the destination select. This is the
19105 // value of the incoming immediate.
19106 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19107 // combine either bitwise AND or insert of float 0.0 to set these bits.
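// For example, inserting into element 2 yields an immediate of
// (2 << 4) == 0x20: source select bits [7:6] == 0, destination select bits
// [5:4] == 2, zero mask bits [3:0] == 0.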
19108
19109 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19110 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19111 // If this is an insertion of 32-bits into the low 32-bits of
19112 // a vector, we prefer to generate a blend with immediate rather
19113 // than an insertps. Blends are simpler operations in hardware and so
19114 // will always have equal or better performance than insertps.
19115 // But if optimizing for size and there's a load folding opportunity,
19116 // generate insertps because blendps does not have a 32-bit memory
19117 // operand form.
19118 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19119 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19120 DAG.getTargetConstant(1, dl, MVT::i8));
19121 }
19122 // Create this as a scalar-to-vector.
19123 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19124 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19125 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19126 }
19127
19128 // PINSR* works with constant index.
19129 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19130 return Op;
19131 }
19132
19133 return SDValue();
19134}
19135
19136 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19137 SelectionDAG &DAG) {
19138 SDLoc dl(Op);
19139 MVT OpVT = Op.getSimpleValueType();
19140
19141 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19142 // further combines.
19143 if (X86::isZeroNode(Op.getOperand(0)))
19144 return getZeroVector(OpVT, Subtarget, DAG, dl);
19145
19146 // If this is a 256-bit vector result, first insert into a 128-bit
19147 // vector and then insert into the 256-bit vector.
19148 if (!OpVT.is128BitVector()) {
19149 // Insert into a 128-bit vector.
19150 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19151 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19152 OpVT.getVectorNumElements() / SizeFactor);
19153
19154 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19155
19156 // Insert the 128-bit vector.
19157 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19158 }
19159 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19160 "Expected an SSE type!");
19161
19162 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19163 // tblgen.
19164 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19165 return Op;
19166
19167 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19168 return DAG.getBitcast(
19169 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19170}
19171
19172// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19173// simple superregister reference or explicit instructions to insert
19174// the upper bits of a vector.
19175 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19176 SelectionDAG &DAG) {
19177 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19178
19179 return insert1BitVector(Op, DAG, Subtarget);
19180}
19181
19182 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19183 SelectionDAG &DAG) {
19184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19185 "Only vXi1 extract_subvectors need custom lowering");
19186
19187 SDLoc dl(Op);
19188 SDValue Vec = Op.getOperand(0);
19189 uint64_t IdxVal = Op.getConstantOperandVal(1);
19190
19191 if (IdxVal == 0) // the operation is legal
19192 return Op;
19193
19194 // Extend to natively supported kshift.
19195 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19196
19197 // Shift to the LSB.
19198 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19199 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19200
19201 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19202 DAG.getVectorIdxConstant(0, dl));
19203}
19204
19205// Returns the appropriate wrapper opcode for a global reference.
19206unsigned X86TargetLowering::getGlobalWrapperKind(
19207 const GlobalValue *GV, const unsigned char OpFlags) const {
19208 // References to absolute symbols are never PC-relative.
19209 if (GV && GV->isAbsoluteSymbolRef())
19210 return X86ISD::Wrapper;
19211
19212 // The following OpFlags under RIP-rel PIC use RIP.
19213 if (Subtarget.isPICStyleRIPRel() &&
19214 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19215 OpFlags == X86II::MO_DLLIMPORT))
19216 return X86ISD::WrapperRIP;
19217
19218 // GOTPCREL references must always use RIP.
19219 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19220 return X86ISD::WrapperRIP;
19221
19222 return X86ISD::Wrapper;
19223}
19224
19225// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19226// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19227// one of the above mentioned nodes. It has to be wrapped because otherwise
19228// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19229// be used to form addressing mode. These wrapped nodes will be selected
19230// into MOV32ri.
19231SDValue
19232X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19233 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19234
19235 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19236 // global base reg.
19237 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19238
19239 auto PtrVT = getPointerTy(DAG.getDataLayout());
19240 SDValue Result = DAG.getTargetConstantPool(
19241 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19242 SDLoc DL(CP);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245 // With PIC, the address is actually $g + Offset.
19246 if (OpFlag) {
19247 Result =
19248 DAG.getNode(ISD::ADD, DL, PtrVT,
19249 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19250 }
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19256 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19257
19258 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19259 // global base reg.
19260 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19261
19262 EVT PtrVT = Op.getValueType();
19263 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19264 SDLoc DL(JT);
19265 Result =
19266 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19267
19268 // With PIC, the address is actually $g + Offset.
19269 if (OpFlag)
19270 Result =
19271 DAG.getNode(ISD::ADD, DL, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19273
19274 return Result;
19275}
19276
19277SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19278 SelectionDAG &DAG) const {
19279 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19280}
19281
19282SDValue
19283X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19284 // Create the TargetBlockAddressAddress node.
19285 unsigned char OpFlags =
19286 Subtarget.classifyBlockAddressReference();
19287 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19288 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19289 SDLoc dl(Op);
19290 EVT PtrVT = Op.getValueType();
19291 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19292 Result =
19293 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19294
19295 // With PIC, the address is actually $g + Offset.
19296 if (isGlobalRelativeToPICBase(OpFlags)) {
19297 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19298 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19299 }
19300
19301 return Result;
19302}
19303
19304/// Creates target global address or external symbol nodes for calls or
19305/// other uses.
19306SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19307 bool ForCall,
19308 bool *IsImpCall) const {
19309 // Unpack the global address or external symbol.
19310 SDLoc dl(Op);
19311 const GlobalValue *GV = nullptr;
19312 int64_t Offset = 0;
19313 const char *ExternalSym = nullptr;
19314 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19315 GV = G->getGlobal();
19316 Offset = G->getOffset();
19317 } else {
19318 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19319 ExternalSym = ES->getSymbol();
19320 }
19321
19322 // Calculate some flags for address lowering.
19323 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19324 unsigned char OpFlags;
19325 if (ForCall)
19326 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19327 else
19328 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19329 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19330 bool NeedsLoad = isGlobalStubReference(OpFlags);
19331
19332 CodeModel::Model M = DAG.getTarget().getCodeModel();
19333 EVT PtrVT = Op.getValueType();
19334 SDValue Result;
19335
19336 if (GV) {
19337 // Create a target global address if this is a global. If possible, fold the
19338 // offset into the global address reference. Otherwise, ADD it on later.
19339 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19340 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19341 // relocation will compute to a negative value, which is invalid.
19342 int64_t GlobalOffset = 0;
19343 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19344 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19345 std::swap(GlobalOffset, Offset);
19346 }
19347 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19348 } else {
19349 // If this is not a global address, this must be an external symbol.
19350 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19351 }
19352
19353 // If this is a direct call, avoid the wrapper if we don't need to do any
19354 // loads or adds. This allows SDAG ISel to match direct calls.
19355 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19356 return Result;
19357
19358 // If Import Call Optimization is enabled and this is an imported function
19359 // then make a note of it and return the global address without wrapping.
19360 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19361 Mod.getModuleFlag("import-call-optimization")) {
19362 assert(ForCall && "Should only enable import call optimization if we are "
19363 "lowering a call");
19364 *IsImpCall = true;
19365 return Result;
19366 }
19367
19368 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19369
19370 // With PIC, the address is actually $g + Offset.
19371 if (HasPICReg) {
19372 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19373 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19374 }
19375
19376 // For globals that require a load from a stub to get the address, emit the
19377 // load.
19378 if (NeedsLoad)
19379 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19380 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19381
19382 // If there was a non-zero offset that we didn't fold, create an explicit
19383 // addition for it.
19384 if (Offset != 0)
19385 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19386 DAG.getSignedConstant(Offset, dl, PtrVT));
19387
19388 return Result;
19389}
19390
19391SDValue
19392X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19393 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19394}
19395
19396 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19397 const EVT PtrVT, unsigned ReturnReg,
19398 unsigned char OperandFlags,
19399 bool LoadGlobalBaseReg = false,
19400 bool LocalDynamic = false) {
19401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19402 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19403 SDLoc dl(GA);
19404 SDValue TGA;
19405 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19406 SDValue Chain = DAG.getEntryNode();
19407 SDValue Ret;
19408 if (LocalDynamic && UseTLSDESC) {
19409 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19410 // Reuse existing GetTLSADDR node if we can find it.
19411 if (TGA->hasOneUse()) {
19412 // TLSDESC uses TGA.
19413 SDNode *TLSDescOp = *TGA->user_begin();
19414 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19415 "Unexpected TLSDESC DAG");
19416 // CALLSEQ_END uses TGA via a chain and glue.
19417 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19418 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19419 "Unexpected TLSDESC DAG");
19420 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19421 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19422 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19423 "Unexpected TLSDESC DAG");
19424 Ret = SDValue(CopyFromRegOp, 0);
19425 }
19426 } else {
19427 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19428 GA->getOffset(), OperandFlags);
19429 }
19430
19431 if (!Ret) {
19432 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19433 : LocalDynamic ? X86ISD::TLSBASEADDR
19434 : X86ISD::TLSADDR;
19435
19436 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19437 if (LoadGlobalBaseReg) {
19438 SDValue InGlue;
19439 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19440 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19441 InGlue);
19442 InGlue = Chain.getValue(1);
19443 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19444 } else {
19445 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19446 }
19447 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19448
19449 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19450 MFI.setHasCalls(true);
19451
19452 SDValue Glue = Chain.getValue(1);
19453 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19454 }
19455
19456 if (!UseTLSDESC)
19457 return Ret;
19458
19459 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19460 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19461
19462 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19463 SDValue Offset =
19464 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19465 MachinePointerInfo(Ptr));
19466 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19467}
19468
19469// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19470static SDValue
19471 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19472 const EVT PtrVT) {
19473 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19474 /*LoadGlobalBaseReg=*/true);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19478static SDValue
19479 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19480 const EVT PtrVT) {
19481 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19482}
19483
19484// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19485static SDValue
19486 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19487 const EVT PtrVT) {
19488 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19489}
19490
19491 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19492 SelectionDAG &DAG, const EVT PtrVT,
19493 bool Is64Bit, bool Is64BitLP64) {
19494 SDLoc dl(GA);
19495
19496 // Get the start address of the TLS block for this module.
19497 X86MachineFunctionInfo *MFI =
19498 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19499 MFI->incNumLocalDynamicTLSAccesses();
19500
19501 SDValue Base;
19502 if (Is64Bit) {
19503 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19504 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19505 /*LoadGlobalBaseReg=*/false,
19506 /*LocalDynamic=*/true);
19507 } else {
19508 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19509 /*LoadGlobalBaseReg=*/true,
19510 /*LocalDynamic=*/true);
19511 }
19512
19513 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19514 // of Base.
19515
19516 // Build x@dtpoff.
19517 unsigned char OperandFlags = X86II::MO_DTPOFF;
19518 unsigned WrapperKind = X86ISD::Wrapper;
19519 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19520 GA->getValueType(0),
19521 GA->getOffset(), OperandFlags);
19522 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19523
19524 // Add x@dtpoff with the base.
19525 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19526}
19527
19528// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19529 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19530 const EVT PtrVT, TLSModel::Model model,
19531 bool is64Bit, bool isPIC) {
19532 SDLoc dl(GA);
19533
19534 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19535 Value *Ptr = Constant::getNullValue(
19536 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19537
19538 SDValue ThreadPointer =
19539 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19540 MachinePointerInfo(Ptr));
19541
19542 unsigned char OperandFlags = 0;
19543 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19544 // initialexec.
19545 unsigned WrapperKind = X86ISD::Wrapper;
19546 if (model == TLSModel::LocalExec) {
19547 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19548 } else if (model == TLSModel::InitialExec) {
19549 if (is64Bit) {
19550 OperandFlags = X86II::MO_GOTTPOFF;
19551 WrapperKind = X86ISD::WrapperRIP;
19552 } else {
19553 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19554 }
19555 } else {
19556 llvm_unreachable("Unexpected model");
19557 }
19558
19559 // emit "addl x@ntpoff,%eax" (local exec)
19560 // or "addl x@indntpoff,%eax" (initial exec)
19561 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19562 SDValue TGA =
19563 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19564 GA->getOffset(), OperandFlags);
19565 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19566
19567 if (model == TLSModel::InitialExec) {
19568 if (isPIC && !is64Bit) {
19569 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19570 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19571 Offset);
19572 }
19573
19574 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19575 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19576 }
19577
19578 // The address of the thread local variable is the add of the thread
19579 // pointer with the offset of the variable.
19580 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19581}
19582
19583SDValue
19584X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19585
19586 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19587
19588 if (DAG.getTarget().useEmulatedTLS())
19589 return LowerToTLSEmulatedModel(GA, DAG);
19590
19591 const GlobalValue *GV = GA->getGlobal();
19592 EVT PtrVT = Op.getValueType();
19593 bool PositionIndependent = isPositionIndependent();
19594
19595 if (Subtarget.isTargetELF()) {
19596 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19597 switch (model) {
19598 case TLSModel::GeneralDynamic:
19599 if (Subtarget.is64Bit()) {
19600 if (Subtarget.isTarget64BitLP64())
19601 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19602 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19603 }
19604 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19605 case TLSModel::LocalDynamic:
19606 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19607 Subtarget.isTarget64BitLP64());
19608 case TLSModel::InitialExec:
19609 case TLSModel::LocalExec:
19610 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19611 PositionIndependent);
19612 }
19613 llvm_unreachable("Unknown TLS model.");
19614 }
19615
19616 if (Subtarget.isTargetDarwin()) {
19617 // Darwin only has one model of TLS. Lower to that.
19618 unsigned char OpFlag = 0;
19619 unsigned WrapperKind = 0;
19620
19621 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19622 // global base reg.
19623 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19624 if (PIC32) {
19625 OpFlag = X86II::MO_TLVP_PIC_BASE;
19626 WrapperKind = X86ISD::Wrapper;
19627 } else {
19628 OpFlag = X86II::MO_TLVP;
19629 WrapperKind = X86ISD::WrapperRIP;
19630 }
19631 SDLoc DL(Op);
19632 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19633 GA->getValueType(0),
19634 GA->getOffset(), OpFlag);
19635 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19636
19637 // With PIC32, the address is actually $g + Offset.
19638 if (PIC32)
19639 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19640 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19641 Offset);
19642
19643 // Lowering the machine isd will make sure everything is in the right
19644 // location.
19645 SDValue Chain = DAG.getEntryNode();
19646 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19648 SDValue Args[] = { Chain, Offset };
19649 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19650 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19651
19652 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19653 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19654 MFI.setAdjustsStack(true);
19655
19656 // And our return value (tls address) is in the standard call return value
19657 // location.
19658 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19659 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19660 }
19661
19662 if (Subtarget.isOSWindows()) {
19663 // Just use the implicit TLS architecture
19664 // Need to generate something similar to:
19665 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19666 // ; from TEB
19667 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19668 // mov rcx, qword [rdx+rcx*8]
19669 // mov eax, .tls$:tlsvar
19670 // [rax+rcx] contains the address
19671 // Windows 64bit: gs:0x58
19672 // Windows 32bit: fs:__tls_array
19673
19674 SDLoc dl(GA);
19675 SDValue Chain = DAG.getEntryNode();
19676
19677 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19678 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19679 // use its literal value of 0x2C.
19680 Value *Ptr = Constant::getNullValue(
19681 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19682 : PointerType::get(*DAG.getContext(), X86AS::FS));
19683
19684 SDValue TlsArray = Subtarget.is64Bit()
19685 ? DAG.getIntPtrConstant(0x58, dl)
19686 : (Subtarget.isTargetWindowsGNU()
19687 ? DAG.getIntPtrConstant(0x2C, dl)
19688 : DAG.getExternalSymbol("_tls_array", PtrVT));
19689
19690 SDValue ThreadPointer =
19691 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19692
19693 SDValue res;
19694 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19695 res = ThreadPointer;
19696 } else {
19697 // Load the _tls_index variable
19698 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19699 if (Subtarget.is64Bit())
19700 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19701 MachinePointerInfo(), MVT::i32);
19702 else
19703 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19704
19705 const DataLayout &DL = DAG.getDataLayout();
19706 SDValue Scale =
19707 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19708 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19709
19710 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19711 }
19712
19713 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19714
19715 // Get the offset of start of .tls section
19716 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19717 GA->getValueType(0),
19718 X86II::MO_SECREL);
19719 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19720
19721 // The address of the thread local variable is the add of the thread
19722 // pointer with the offset of the variable.
19723 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19724 }
19725
19726 llvm_unreachable("TLS not implemented for this target.");
19727}
19728
19729 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19730 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19731 const TargetMachine &TM = getTargetMachine();
19732 TLSModel::Model Model = TM.getTLSModel(&GV);
19733 switch (Model) {
19734 case TLSModel::LocalExec:
19735 case TLSModel::InitialExec:
19736 // We can include the %fs segment register in addressing modes.
19737 return true;
19738 case TLSModel::GeneralDynamic:
19739 case TLSModel::LocalDynamic:
19740 // These models do not result in %fs relative addresses unless
19741 // TLS descriptors are used.
19742 //
19743 // Even in the case of TLS descriptors we currently have no way to model
19744 // the difference between %fs access and the computations needed for the
19745 // offset, and returning `true` for TLS-desc currently duplicates both,
19746 // which is detrimental :-/
19747 return false;
19748 }
19749 }
19750 return false;
19751}
19752
19753/// Lower SRA_PARTS and friends, which return two i32 values
19754/// and take a 2 x i32 value to shift plus a shift amount.
19755/// TODO: Can this be moved to general expansion code?
19756 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19757 SDValue Lo, Hi;
19758 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19759 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19760}
19761
19762// Try to use a packed vector operation to handle i64 on 32-bit targets when
19763// AVX512DQ is enabled.
19764 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19765 SelectionDAG &DAG,
19766 const X86Subtarget &Subtarget) {
19767 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19769 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19770 Op.getOpcode() == ISD::UINT_TO_FP) &&
19771 "Unexpected opcode!");
19772 bool IsStrict = Op->isStrictFPOpcode();
19773 unsigned OpNo = IsStrict ? 1 : 0;
19774 SDValue Src = Op.getOperand(OpNo);
19775 MVT SrcVT = Src.getSimpleValueType();
19776 MVT VT = Op.getSimpleValueType();
19777
19778 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19779 (VT != MVT::f32 && VT != MVT::f64))
19780 return SDValue();
19781
19782 // Pack the i64 into a vector, do the operation and extract.
19783
19784 // Using 256-bit to ensure result is 128-bits for f32 case.
19785 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19786 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19787 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19788
19789 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19790 if (IsStrict) {
19791 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19792 {Op.getOperand(0), InVec});
19793 SDValue Chain = CvtVec.getValue(1);
19794 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19795 DAG.getVectorIdxConstant(0, dl));
19796 return DAG.getMergeValues({Value, Chain}, dl);
19797 }
19798
19799 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19800
19801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19802 DAG.getVectorIdxConstant(0, dl));
19803}
19804
19805// Try to use a packed vector operation to handle i64 on 32-bit targets.
19806 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19807 const X86Subtarget &Subtarget) {
19808 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19810 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19811 Op.getOpcode() == ISD::UINT_TO_FP) &&
19812 "Unexpected opcode!");
19813 bool IsStrict = Op->isStrictFPOpcode();
19814 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19815 MVT SrcVT = Src.getSimpleValueType();
19816 MVT VT = Op.getSimpleValueType();
19817
19818 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19819 return SDValue();
19820
19821 // Pack the i64 into a vector, do the operation and extract.
19822
19823 assert(Subtarget.hasFP16() && "Expected FP16");
19824
19825 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19826 if (IsStrict) {
19827 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19828 {Op.getOperand(0), InVec});
19829 SDValue Chain = CvtVec.getValue(1);
19830 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19831 DAG.getVectorIdxConstant(0, dl));
19832 return DAG.getMergeValues({Value, Chain}, dl);
19833 }
19834
19835 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19836
19837 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19838 DAG.getVectorIdxConstant(0, dl));
19839}
19840
19841static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19842 const X86Subtarget &Subtarget) {
19843 switch (Opcode) {
19844 case ISD::SINT_TO_FP:
19845 // TODO: Handle wider types with AVX/AVX512.
19846 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19847 return false;
19848 // CVTDQ2PS or (V)CVTDQ2PD
19849 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19850
19851 case ISD::UINT_TO_FP:
19852 // TODO: Handle wider types and i64 elements.
19853 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19854 return false;
19855 // VCVTUDQ2PS or VCVTUDQ2PD
19856 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19857
19858 default:
19859 return false;
19860 }
19861}
19862
19863/// Given a scalar cast operation that is extracted from a vector, try to
19864/// vectorize the cast op followed by extraction. This will avoid an expensive
19865/// round-trip between XMM and GPR.
19866 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19867 SelectionDAG &DAG,
19868 const X86Subtarget &Subtarget) {
19869 // TODO: This could be enhanced to handle smaller integer types by peeking
19870 // through an extend.
19871 SDValue Extract = Cast.getOperand(0);
19872 MVT DestVT = Cast.getSimpleValueType();
19873 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19874 !isa<ConstantSDNode>(Extract.getOperand(1)))
19875 return SDValue();
19876
19877 // See if we have a 128-bit vector cast op for this type of cast.
19878 SDValue VecOp = Extract.getOperand(0);
19879 MVT FromVT = VecOp.getSimpleValueType();
19880 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19881 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19882 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19883 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19884 return SDValue();
19885
19886 // If we are extracting from a non-zero element, first shuffle the source
19887 // vector to allow extracting from element zero.
19888 if (!isNullConstant(Extract.getOperand(1))) {
19889 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19890 Mask[0] = Extract.getConstantOperandVal(1);
19891 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19892 }
19893 // If the source vector is wider than 128-bits, extract the low part. Do not
19894 // create an unnecessarily wide vector cast op.
19895 if (FromVT != Vec128VT)
19896 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19897
19898 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19899 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19900 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19902 DAG.getVectorIdxConstant(0, DL));
19903}
19904
19905/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19906/// try to vectorize the cast ops. This will avoid an expensive round-trip
19907/// between XMM and GPR.
19908static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19909 SelectionDAG &DAG,
19910 const X86Subtarget &Subtarget) {
19911 // TODO: Allow FP_TO_UINT.
19912 SDValue CastToInt = CastToFP.getOperand(0);
19913 MVT VT = CastToFP.getSimpleValueType();
19914 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19915 return SDValue();
19916
19917 MVT IntVT = CastToInt.getSimpleValueType();
19918 SDValue X = CastToInt.getOperand(0);
19919 MVT SrcVT = X.getSimpleValueType();
19920 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19921 return SDValue();
19922
19923 // See if we have 128-bit vector cast instructions for this type of cast.
19924 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19925 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19926 IntVT != MVT::i32)
19927 return SDValue();
19928
19929 unsigned SrcSize = SrcVT.getSizeInBits();
19930 unsigned IntSize = IntVT.getSizeInBits();
19931 unsigned VTSize = VT.getSizeInBits();
19932 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19933 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19934 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19935
19936 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19937 unsigned ToIntOpcode =
19938 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19939 unsigned ToFPOpcode =
19940 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19941
19942 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19943 //
19944 // We are not defining the high elements (for example, zero them) because
19945 // that could nullify any performance advantage that we hoped to gain from
19946 // this vector op hack. We do not expect any adverse effects (like denorm
19947 // penalties) with cast ops.
19948 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19949 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19950 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19951 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19953}
19954
19955static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19956 SelectionDAG &DAG,
19957 const X86Subtarget &Subtarget) {
19958 bool IsStrict = Op->isStrictFPOpcode();
19959 MVT VT = Op->getSimpleValueType(0);
19960 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19961
19962 if (Subtarget.hasDQI()) {
19963 assert(!Subtarget.hasVLX() && "Unexpected features");
19964
19965 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19966 Src.getSimpleValueType() == MVT::v4i64) &&
19967 "Unsupported custom type");
19968
19969 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19970 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19971 "Unexpected VT!");
19972 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19973
19974 // Need to concat with zero vector for strict fp to avoid spurious
19975 // exceptions.
19976 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19977 : DAG.getUNDEF(MVT::v8i64);
19978 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19979 DAG.getVectorIdxConstant(0, DL));
19980 SDValue Res, Chain;
19981 if (IsStrict) {
19982 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19983 {Op->getOperand(0), Src});
19984 Chain = Res.getValue(1);
19985 } else {
19986 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19987 }
19988
19989 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19990 DAG.getVectorIdxConstant(0, DL));
19991
19992 if (IsStrict)
19993 return DAG.getMergeValues({Res, Chain}, DL);
19994 return Res;
19995 }
19996
19997 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19998 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19999 if (VT != MVT::v4f32 || IsSigned)
20000 return SDValue();
20001
20002 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20003 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20004 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20005 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20006 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20007 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20008 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20009 SmallVector<SDValue, 4> SignCvts(4);
20010 SmallVector<SDValue, 4> Chains(4);
20011 for (int i = 0; i != 4; ++i) {
20012 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20013 DAG.getVectorIdxConstant(i, DL));
20014 if (IsStrict) {
20015 SignCvts[i] =
20016 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20017 {Op.getOperand(0), Elt});
20018 Chains[i] = SignCvts[i].getValue(1);
20019 } else {
20020 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20021 }
20022 }
20023 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20024
20025 SDValue Slow, Chain;
20026 if (IsStrict) {
20027 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20028 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20029 {Chain, SignCvt, SignCvt});
20030 Chain = Slow.getValue(1);
20031 } else {
20032 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20033 }
20034
20035 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20036 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20037
20038 if (IsStrict)
20039 return DAG.getMergeValues({Cvt, Chain}, DL);
20040
20041 return Cvt;
20042}
20043
20044static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20045 SelectionDAG &DAG) {
20046 bool IsStrict = Op->isStrictFPOpcode();
20047 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20048 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20049 MVT VT = Op.getSimpleValueType();
20050 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20051
20052 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20053 if (IsStrict)
20054 return DAG.getNode(
20055 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20056 {Chain,
20057 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20058 Rnd});
20059 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20060 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20061}
20062
20063static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20064 const X86Subtarget &Subtarget) {
20065 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20066 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20067 return true;
20068 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20069 return true;
20070 }
20071 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20072 return true;
20073 if (Subtarget.useAVX512Regs()) {
20074 if (VT == MVT::v16i32)
20075 return true;
20076 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20077 return true;
20078 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20079 return true;
20080 }
20081 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20082 (VT == MVT::v2i64 || VT == MVT::v4i64))
20083 return true;
20084 return false;
20085}
20086
20087SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20088 SelectionDAG &DAG) const {
20089 bool IsStrict = Op->isStrictFPOpcode();
20090 unsigned OpNo = IsStrict ? 1 : 0;
20091 SDValue Src = Op.getOperand(OpNo);
20092 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20093 MVT SrcVT = Src.getSimpleValueType();
20094 MVT VT = Op.getSimpleValueType();
20095 SDLoc dl(Op);
20096
20097 if (isSoftF16(VT, Subtarget))
20098 return promoteXINT_TO_FP(Op, dl, DAG);
20099 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20100 return Op;
20101
20102 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20103 return LowerWin64_INT128_TO_FP(Op, DAG);
20104
20105 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20106 return Extract;
20107
20108 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20109 return R;
20110
20111 if (SrcVT.isVector()) {
20112 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20113 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20114 // source for strict FP.
20115 if (IsStrict)
20116 return DAG.getNode(
20117 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20118 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20119 DAG.getUNDEF(SrcVT))});
20120 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20121 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20122 DAG.getUNDEF(SrcVT)));
20123 }
20124 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20125 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20126
20127 return SDValue();
20128 }
20129
20130 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20131 "Unknown SINT_TO_FP to lower!");
20132
20133 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20134
20135 // These are really Legal; return the operand so the caller accepts it as
20136 // Legal.
20137 if (SrcVT == MVT::i32 && UseSSEReg)
20138 return Op;
20139 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20140 return Op;
20141
20142 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20143 return V;
20144 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20145 return V;
20146
20147 // SSE doesn't have an i16 conversion so we need to promote.
20148 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20149 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20150 if (IsStrict)
20151 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20152 {Chain, Ext});
20153
20154 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20155 }
20156
20157 if (VT == MVT::f128 || !Subtarget.hasX87())
20158 return SDValue();
20159
20160 SDValue ValueToStore = Src;
20161 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20162 // Bitcasting to f64 here allows us to do a single 64-bit store from
20163 // an SSE register, avoiding the store forwarding penalty that would come
20164 // with two 32-bit stores.
20165 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20166
20167 unsigned Size = SrcVT.getStoreSize();
20168 Align Alignment(Size);
20169 MachineFunction &MF = DAG.getMachineFunction();
20170 auto PtrVT = getPointerTy(MF.getDataLayout());
20171 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20172 MachinePointerInfo MPI =
20173 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20174 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20175 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20176 std::pair<SDValue, SDValue> Tmp =
20177 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20178
20179 if (IsStrict)
20180 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20181
20182 return Tmp.first;
20183}
20184
20185std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20186 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20187 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20188 // Build the FILD
20189 SDVTList Tys;
20190 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20191 if (useSSE)
20192 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20193 else
20194 Tys = DAG.getVTList(DstVT, MVT::Other);
20195
20196 SDValue FILDOps[] = {Chain, Pointer};
20197 SDValue Result =
20198 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20199 Alignment, MachineMemOperand::MOLoad);
20200 Chain = Result.getValue(1);
20201
20202 if (useSSE) {
20203 MachineFunction &MF = DAG.getMachineFunction();
20204 unsigned SSFISize = DstVT.getStoreSize();
20205 int SSFI =
20206 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20207 auto PtrVT = getPointerTy(MF.getDataLayout());
20208 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20209 Tys = DAG.getVTList(MVT::Other);
20210 SDValue FSTOps[] = {Chain, Result, StackSlot};
20211 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20212 MachinePointerInfo::getFixedStack(MF, SSFI),
20213 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20214
20215 Chain =
20216 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20217 Result = DAG.getLoad(
20218 DstVT, DL, Chain, StackSlot,
20219 MachinePointerInfo::getFixedStack(MF, SSFI));
20220 Chain = Result.getValue(1);
20221 }
20222
20223 return { Result, Chain };
20224}
20225
20226/// Horizontal vector math instructions may be slower than normal math with
20227/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20228/// implementation, and likely shuffle complexity of the alternate sequence.
20229static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 bool IsOptimizingSize = DAG.shouldOptForSize();
20232 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20233 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20234}
20235
20236/// 64-bit unsigned integer to double expansion.
20237static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20238 SelectionDAG &DAG,
20239 const X86Subtarget &Subtarget) {
20240 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20241 // when converting 0 while rounding toward negative infinity. The caller will
20242 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20243 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20244 // This algorithm is not obvious. Here it is what we're trying to output:
20245 /*
20246 movq %rax, %xmm0
20247 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20248 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20249 #ifdef __SSE3__
20250 haddpd %xmm0, %xmm0
20251 #else
20252 pshufd $0x4e, %xmm0, %xmm1
20253 addpd %xmm1, %xmm0
20254 #endif
20255 */
20256
20257 LLVMContext *Context = DAG.getContext();
20258
20259 // Build some magic constants.
20260 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20261 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20262 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20263 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20264
20266 CV1.push_back(
20267 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20268 APInt(64, 0x4330000000000000ULL))));
20269 CV1.push_back(
20270 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20271 APInt(64, 0x4530000000000000ULL))));
20272 Constant *C1 = ConstantVector::get(CV1);
20273 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20274
20275 // Load the 64-bit value into an XMM register.
20276 SDValue XR1 =
20277 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20278 SDValue CLod0 = DAG.getLoad(
20279 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20280 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20281 SDValue Unpck1 =
20282 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20283
20284 SDValue CLod1 = DAG.getLoad(
20285 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20286 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20287 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20288 // TODO: Are there any fast-math-flags to propagate here?
20289 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20290 SDValue Result;
20291
20292 if (Subtarget.hasSSE3() &&
20293 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20294 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20295 } else {
20296 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20297 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20298 }
20299 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20300 DAG.getVectorIdxConstant(0, dl));
20301 return Result;
20302}
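// Illustrative sketch, not part of this file: scalar model of the
// constant-pool trick used by LowerUINT_TO_FP_i64 above, assuming IEEE-754
// doubles and C++20 <bit>; the helper name is hypothetical. The bit pattern
// 0x43300000'xxxxxxxx encodes 2^52 + lo32 and 0x45300000'xxxxxxxx encodes
// 2^84 + hi32 * 2^32, so subtracting the biases and summing the halves
// rebuilds the u64 with only one final rounding.
#include <bit>
#include <cstdint>
static double U64ToF64BiasTrick(uint64_t X) {
  double Lo = std::bit_cast<double>(0x4330000000000000ULL | (X & 0xffffffffULL));
  double Hi = std::bit_cast<double>(0x4530000000000000ULL | (X >> 32));
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52); // both subtractions are exact
}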
20303
20304/// 32-bit unsigned integer to float expansion.
20305static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20306 SelectionDAG &DAG,
20307 const X86Subtarget &Subtarget) {
20308 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20309 // FP constant to bias correct the final result.
20310 SDValue Bias = DAG.getConstantFP(
20311 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20312
20313 // Load the 32-bit value into an XMM register.
20314 SDValue Load =
20315 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20316
20317 // Zero out the upper parts of the register.
20318 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20319
20320 // Or the load with the bias.
20321 SDValue Or = DAG.getNode(
20322 ISD::OR, dl, MVT::v2i64,
20323 DAG.getBitcast(MVT::v2i64, Load),
20324 DAG.getBitcast(MVT::v2i64,
20325 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20326 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20327 DAG.getBitcast(MVT::v2f64, Or),
20328 DAG.getVectorIdxConstant(0, dl));
20329
20330 if (Op.getNode()->isStrictFPOpcode()) {
20331 // Subtract the bias.
20332 // TODO: Are there any fast-math-flags to propagate here?
20333 SDValue Chain = Op.getOperand(0);
20334 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20335 {Chain, Or, Bias});
20336
20337 if (Op.getValueType() == Sub.getValueType())
20338 return Sub;
20339
20340 // Handle final rounding.
20341 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20342 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20343
20344 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20345 }
20346
20347 // Subtract the bias.
20348 // TODO: Are there any fast-math-flags to propagate here?
20349 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20350
20351 // Handle final rounding.
20352 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20353}
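// Illustrative sketch, not part of this file: the scalar identity behind the
// Bias constant in LowerUINT_TO_FP_i32 above, assuming IEEE-754 doubles; the
// helper name is hypothetical. OR-ing a u32 into the mantissa of 2^52 gives
// exactly 2^52 + X, so one exact subtraction leaves (double)X.
#include <bit>
#include <cstdint>
static double U32ToF64BiasTrick(uint32_t X) {
  double Biased = std::bit_cast<double>(0x4330000000000000ULL | X);
  return Biased - 0x1.0p52; // exact, equals (double)X
}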
20354
20355static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20356 SelectionDAG &DAG,
20357 const X86Subtarget &Subtarget) {
20358 if (Op.getSimpleValueType() != MVT::v2f64)
20359 return SDValue();
20360
20361 bool IsStrict = Op->isStrictFPOpcode();
20362
20363 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20364 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20365
20366 if (Subtarget.hasAVX512()) {
20367 if (!Subtarget.hasVLX()) {
20368 // Let generic type legalization widen this.
20369 if (!IsStrict)
20370 return SDValue();
20371 // Otherwise pad the integer input with 0s and widen the operation.
20372 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20373 DAG.getConstant(0, DL, MVT::v2i32));
20374 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20375 {Op.getOperand(0), N0});
20376 SDValue Chain = Res.getValue(1);
20377 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20378 DAG.getVectorIdxConstant(0, DL));
20379 return DAG.getMergeValues({Res, Chain}, DL);
20380 }
20381
20382 // Legalize to v4i32 type.
20383 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20384 DAG.getUNDEF(MVT::v2i32));
20385 if (IsStrict)
20386 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20387 {Op.getOperand(0), N0});
20388 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20389 }
20390
20391 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20392 // This gives us the floating point equivalent of 2^52 + the i32 integer
20393 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20394 // point leaving just our i32 integers in double format.
20395 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20396 SDValue VBias = DAG.getConstantFP(
20397 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20398 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20399 DAG.getBitcast(MVT::v2i64, VBias));
20400 Or = DAG.getBitcast(MVT::v2f64, Or);
20401
20402 if (IsStrict)
20403 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20404 {Op.getOperand(0), Or, VBias});
20405 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20406}
20407
20408static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20409 SelectionDAG &DAG,
20410 const X86Subtarget &Subtarget) {
20411 bool IsStrict = Op->isStrictFPOpcode();
20412 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20413 MVT VecIntVT = V.getSimpleValueType();
20414 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20415 "Unsupported custom type");
20416
20417 if (Subtarget.hasAVX512()) {
20418 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20419 assert(!Subtarget.hasVLX() && "Unexpected features");
20420 MVT VT = Op->getSimpleValueType(0);
20421
20422 // v8i32->v8f64 is legal with AVX512 so just return it.
20423 if (VT == MVT::v8f64)
20424 return Op;
20425
20426 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20427 VT == MVT::v8f16) &&
20428 "Unexpected VT!");
20429 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20430 MVT WideIntVT = MVT::v16i32;
20431 if (VT == MVT::v4f64) {
20432 WideVT = MVT::v8f64;
20433 WideIntVT = MVT::v8i32;
20434 }
20435
20436 // Need to concat with zero vector for strict fp to avoid spurious
20437 // exceptions.
20438 SDValue Tmp =
20439 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20440 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20441 DAG.getVectorIdxConstant(0, DL));
20442 SDValue Res, Chain;
20443 if (IsStrict) {
20444 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20445 {Op->getOperand(0), V});
20446 Chain = Res.getValue(1);
20447 } else {
20448 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20449 }
20450
20451 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20452 DAG.getVectorIdxConstant(0, DL));
20453
20454 if (IsStrict)
20455 return DAG.getMergeValues({Res, Chain}, DL);
20456 return Res;
20457 }
20458
20459 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20460 Op->getSimpleValueType(0) == MVT::v4f64) {
20461 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20462 Constant *Bias = ConstantFP::get(
20463 *DAG.getContext(),
20464 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20465 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20466 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20467 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20468 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20469 SDValue VBias = DAG.getMemIntrinsicNode(
20470 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20471 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20472 MachineMemOperand::MOLoad);
20473
20474 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20475 DAG.getBitcast(MVT::v4i64, VBias));
20476 Or = DAG.getBitcast(MVT::v4f64, Or);
20477
20478 if (IsStrict)
20479 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20480 {Op.getOperand(0), Or, VBias});
20481 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20482 }
20483
20484 // The algorithm is the following:
20485 // #ifdef __SSE4_1__
20486 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20487 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20488 // (uint4) 0x53000000, 0xaa);
20489 // #else
20490 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20491 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20492 // #endif
20493 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20494 // return (float4) lo + fhi;
20495
20496 bool Is128 = VecIntVT == MVT::v4i32;
20497 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20498 // If we convert to something else than the supported type, e.g., to v4f64,
20499 // abort early.
20500 if (VecFloatVT != Op->getSimpleValueType(0))
20501 return SDValue();
20502
20503 // In the #idef/#else code, we have in common:
20504 // - The vector of constants:
20505 // -- 0x4b000000
20506 // -- 0x53000000
20507 // - A shift:
20508 // -- v >> 16
20509
20510 // Create the splat vector for 0x4b000000.
20511 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20512 // Create the splat vector for 0x53000000.
20513 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20514
20515 // Create the right shift.
20516 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20517 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20518
20519 SDValue Low, High;
20520 if (Subtarget.hasSSE41()) {
20521 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20522 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20523 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20524 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20525 // Low will be bitcasted right away, so do not bother bitcasting back to its
20526 // original type.
20527 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20528 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20529 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20530 // (uint4) 0x53000000, 0xaa);
20531 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20532 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20533 // High will be bitcasted right away, so do not bother bitcasting back to
20534 // its original type.
20535 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20536 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20537 } else {
20538 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20539 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20540 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20541 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20542
20543 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20544 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20545 }
20546
20547 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20548 SDValue VecCstFSub = DAG.getConstantFP(
20549 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20550
20551 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20552 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20553 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20554 // enabled. See PR24512.
20555 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20556 // TODO: Are there any fast-math-flags to propagate here?
20557 // (float4) lo;
20558 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20559 // return (float4) lo + fhi;
20560 if (IsStrict) {
20561 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20562 {Op.getOperand(0), HighBitcast, VecCstFSub});
20563 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20564 {FHigh.getValue(1), LowBitcast, FHigh});
20565 }
20566
20567 SDValue FHigh =
20568 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20569 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20570}
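// Illustrative sketch, not part of this file: per-lane model of the
// blend/shift algorithm in lowerUINT_TO_FP_vXi32 above, assuming IEEE-754
// floats; the helper name is hypothetical. 0x4b000000 encodes 2^23 and
// 0x53000000 encodes 2^39, so both 16-bit halves are recovered exactly and
// only the final add rounds, matching a correctly rounded u32 -> f32 convert.
#include <bit>
#include <cstdint>
static float U32ToF32TwoHalves(uint32_t V) {
  float Lo = std::bit_cast<float>(0x4b000000u | (V & 0xffffu)); // 2^23 + lo16
  float Hi = std::bit_cast<float>(0x53000000u | (V >> 16));     // 2^39 + hi16 * 2^16
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // exact; cancels both biases
  return Lo + FHi;                          // single rounding -> (float)V
}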
20571
20572static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20573 const X86Subtarget &Subtarget) {
20574 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20575 SDValue N0 = Op.getOperand(OpNo);
20576 MVT SrcVT = N0.getSimpleValueType();
20577
20578 switch (SrcVT.SimpleTy) {
20579 default:
20580 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20581 case MVT::v2i32:
20582 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20583 case MVT::v4i32:
20584 case MVT::v8i32:
20585 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20586 case MVT::v2i64:
20587 case MVT::v4i64:
20588 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20589 }
20590}
20591
20592SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20593 SelectionDAG &DAG) const {
20594 bool IsStrict = Op->isStrictFPOpcode();
20595 unsigned OpNo = IsStrict ? 1 : 0;
20596 SDValue Src = Op.getOperand(OpNo);
20597 SDLoc dl(Op);
20598 auto PtrVT = getPointerTy(DAG.getDataLayout());
20599 MVT SrcVT = Src.getSimpleValueType();
20600 MVT DstVT = Op->getSimpleValueType(0);
20601 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20602
20603 // Bail out when we don't have native conversion instructions.
20604 if (DstVT == MVT::f128)
20605 return SDValue();
20606
20607 if (isSoftF16(DstVT, Subtarget))
20608 return promoteXINT_TO_FP(Op, dl, DAG);
20609 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20610 return Op;
20611
20612 if (DstVT.isVector())
20613 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20614
20615 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20616 return LowerWin64_INT128_TO_FP(Op, DAG);
20617
20618 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20619 return Extract;
20620
20621 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20622 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20623 // Conversions from unsigned i32 to f32/f64 are legal,
20624 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20625 return Op;
20626 }
20627
20628 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20629 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20630 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20631 if (IsStrict)
20632 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20633 {Chain, Src});
20634 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20635 }
20636
20637 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20638 return V;
20639 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20640 return V;
20641
20642 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20643 // infinity. It produces -0.0, so disable under strictfp.
20644 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20645 !IsStrict)
20646 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20647 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20648 // negative infinity, so disable it under strictfp and use FILD instead.
20649 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20650 !IsStrict)
20651 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20652 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20653 (DstVT == MVT::f32 || DstVT == MVT::f64))
20654 return SDValue();
20655
20656 // Make a 64-bit buffer, and use it to build an FILD.
20657 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20658 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20659 Align SlotAlign(8);
20660 MachinePointerInfo MPI =
20661 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20662 if (SrcVT == MVT::i32) {
20663 SDValue OffsetSlot =
20664 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20665 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20666 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20667 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20668 std::pair<SDValue, SDValue> Tmp =
20669 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20670 if (IsStrict)
20671 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20672
20673 return Tmp.first;
20674 }
20675
20676 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20677 SDValue ValueToStore = Src;
20678 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20679 // Bitcasting to f64 here allows us to do a single 64-bit store from
20680 // an SSE register, avoiding the store forwarding penalty that would come
20681 // with two 32-bit stores.
20682 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20683 }
20684 SDValue Store =
20685 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20686 // For i64 source, we need to add the appropriate power of 2 if the input
20687 // was negative. We must be careful to do the computation in x87 extended
20688 // precision, not in SSE.
20689 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20690 SDValue Ops[] = {Store, StackSlot};
20691 SDValue Fild =
20692 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20693 SlotAlign, MachineMemOperand::MOLoad);
20694 Chain = Fild.getValue(1);
20695
20696 // Check whether the sign bit is set.
20697 SDValue SignSet = DAG.getSetCC(
20698 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20699 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20700
20701 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20702 APInt FF(64, 0x5F80000000000000ULL);
20703 SDValue FudgePtr =
20704 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20705 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20706
20707 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20708 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20709 SDValue Four = DAG.getIntPtrConstant(4, dl);
20710 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20711 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20712
20713 // Load the value out, extending it from f32 to f80.
20714 SDValue Fudge = DAG.getExtLoad(
20715 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20716 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20717 CPAlignment);
20718 Chain = Fudge.getValue(1);
20719 // Extend everything to 80 bits to force it to be done on x87.
20720 // TODO: Are there any fast-math-flags to propagate here?
20721 if (IsStrict) {
20722 unsigned Opc = ISD::STRICT_FADD;
20723 // Windows needs the precision control changed to 80bits around this add.
20724 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20725 Opc = X86ISD::STRICT_FP80_ADD;
20726
20727 SDValue Add =
20728 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20729 // STRICT_FP_ROUND can't handle equal types.
20730 if (DstVT == MVT::f80)
20731 return Add;
20732 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20733 {Add.getValue(1), Add,
20734 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20735 }
20736 unsigned Opc = ISD::FADD;
20737 // Windows needs the precision control changed to 80bits around this add.
20738 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20739 Opc = X86ISD::FP80_ADD;
20740
20741 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20742 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20743 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20744}
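// Illustrative sketch, not part of this file: scalar model of the FILD+fudge
// path in LowerUINT_TO_FP above; the helper name is hypothetical. The value
// is converted as a signed i64, then 2^64 (the 0x5F800000 constant-pool
// "fudge") is added back when the sign bit was set; x87 f80 holds any i64
// exactly, so only the final FP_ROUND can round.
#include <cstdint>
static long double U64ToF80FildFudge(uint64_t X) {
  long double R = (long double)(int64_t)X; // what FILD computes
  if ((int64_t)X < 0)
    R += 0x1.0p64L; // re-add 2^64 for inputs with the sign bit set
  return R;
}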
20745
20746// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20747// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20748// just return an SDValue().
20749// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20750// to i16, i32 or i64, and we lower it to a legal sequence and return the
20751// result.
20752SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20753 bool IsSigned,
20754 SDValue &Chain) const {
20755 bool IsStrict = Op->isStrictFPOpcode();
20756 SDLoc DL(Op);
20757
20758 EVT DstTy = Op.getValueType();
20759 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20760 EVT TheVT = Value.getValueType();
20761 auto PtrVT = getPointerTy(DAG.getDataLayout());
20762
20763 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20764 // f16 must be promoted before using the lowering in this routine.
20765 // fp128 does not use this lowering.
20766 return SDValue();
20767 }
20768
20769 // If using FIST to compute an unsigned i64, we'll need some fixup
20770 // to handle values above the maximum signed i64. A FIST is always
20771 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20772 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20773
20774 // FIXME: This does not generate an invalid exception if the input does not
20775 // fit in i32. PR44019
20776 if (!IsSigned && DstTy != MVT::i64) {
20777 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20778 // The low 32 bits of the fist result will have the correct uint32 result.
20779 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20780 DstTy = MVT::i64;
20781 }
20782
20783 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20784 DstTy.getSimpleVT() >= MVT::i16 &&
20785 "Unknown FP_TO_INT to lower!");
20786
20787 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20788 // stack slot.
20789 MachineFunction &MF = DAG.getMachineFunction();
20790 unsigned MemSize = DstTy.getStoreSize();
20791 int SSFI =
20792 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20793 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20794
20795 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20796
20797 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20798
20799 if (UnsignedFixup) {
20800 //
20801 // Conversion to unsigned i64 is implemented with a select,
20802 // depending on whether the source value fits in the range
20803 // of a signed i64. Let Thresh be the FP equivalent of
20804 // 0x8000000000000000ULL.
20805 //
20806 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20808 // FistSrc = (Value - FltOfs);
20809 // Fist-to-mem64 FistSrc
20810 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20811 // to XOR'ing the high 32 bits with Adjust.
20812 //
20813 // Being a power of 2, Thresh is exactly representable in all FP formats.
20814 // For X87 we'd like to use the smallest FP type for this constant, but
20815 // for DAG type consistency we have to match the FP operand type.
20816
20817 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20818 APFloat::opStatus Status = APFloat::opOK;
20819 bool LosesInfo = false;
20820 if (TheVT == MVT::f64)
20821 // The rounding mode is irrelevant as the conversion should be exact.
20822 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20823 &LosesInfo);
20824 else if (TheVT == MVT::f80)
20825 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20826 APFloat::rmNearestTiesToEven, &LosesInfo);
20827
20828 assert(Status == APFloat::opOK && !LosesInfo &&
20829 "FP conversion should have been exact");
20830
20831 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20832
20833 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20834 *DAG.getContext(), TheVT);
20835 SDValue Cmp;
20836 if (IsStrict) {
20837 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20838 /*IsSignaling*/ true);
20839 Chain = Cmp.getValue(1);
20840 } else {
20841 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20842 }
20843
20844 // Our preferred lowering of
20845 //
20846 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20847 //
20848 // is
20849 //
20850 // (Value >= Thresh) << 63
20851 //
20852 // but since we can get here after LegalOperations, DAGCombine might do the
20853 // wrong thing if we create a select. So, directly create the preferred
20854 // version.
20855 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20856 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20857 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20858
20859 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20860 DAG.getConstantFP(0.0, DL, TheVT));
20861
20862 if (IsStrict) {
20863 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20864 { Chain, Value, FltOfs });
20865 Chain = Value.getValue(1);
20866 } else
20867 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20868 }
20869
20870 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20871
20872 // FIXME This causes a redundant load/store if the SSE-class value is already
20873 // in memory, such as if it is on the callstack.
20874 if (isScalarFPTypeInSSEReg(TheVT)) {
20875 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20876 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20877 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20878 SDValue Ops[] = { Chain, StackSlot };
20879
20880 unsigned FLDSize = TheVT.getStoreSize();
20881 assert(FLDSize <= MemSize && "Stack slot not big enough");
20882 MachineMemOperand *MMO = MF.getMachineMemOperand(
20883 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20884 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20885 Chain = Value.getValue(1);
20886 }
20887
20888 // Build the FP_TO_INT*_IN_MEM
20889 MachineMemOperand *MMO = MF.getMachineMemOperand(
20890 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20891 SDValue Ops[] = { Chain, Value, StackSlot };
20892 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20893 DAG.getVTList(MVT::Other),
20894 Ops, DstTy, MMO);
20895
20896 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20897 Chain = Res.getValue(1);
20898
20899 // If we need an unsigned fixup, XOR the result with adjust.
20900 if (UnsignedFixup)
20901 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20902
20903 return Res;
20904}
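// Illustrative sketch, not part of this file: scalar model of the unsigned
// fixup in FP_TO_INTHelper above for f64 -> u64 (valid for in-range inputs);
// the helper name is hypothetical. Values >= 2^63 are shifted into signed
// range before the FIST-style conversion and the top bit is XORed back in.
#include <cstdint>
static uint64_t F64ToU64ViaSignedFist(double Value) {
  const double Thresh = 0x1.0p63; // 2^63, exactly representable
  uint64_t Adjust = 0;
  if (Value >= Thresh) {
    Value -= Thresh;                // FistSrc = Value - FltOfs
    Adjust = 0x8000000000000000ULL; // restores the subtracted 2^63
  }
  return (uint64_t)(int64_t)Value ^ Adjust;
}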
20905
20906static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20907 const X86Subtarget &Subtarget) {
20908 MVT VT = Op.getSimpleValueType();
20909 SDValue In = Op.getOperand(0);
20910 MVT InVT = In.getSimpleValueType();
20911 unsigned Opc = Op.getOpcode();
20912
20913 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20915 "Unexpected extension opcode");
20917 "Expected same number of elements");
20918 assert((VT.getVectorElementType() == MVT::i16 ||
20919 VT.getVectorElementType() == MVT::i32 ||
20920 VT.getVectorElementType() == MVT::i64) &&
20921 "Unexpected element type");
20922 assert((InVT.getVectorElementType() == MVT::i8 ||
20923 InVT.getVectorElementType() == MVT::i16 ||
20924 InVT.getVectorElementType() == MVT::i32) &&
20925 "Unexpected element type");
20926
20927 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20928
20929 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20930 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20931 return splitVectorIntUnary(Op, DAG, dl);
20932 }
20933
20934 if (Subtarget.hasInt256())
20935 return Op;
20936
20937 // Optimize vectors in AVX mode:
20938 //
20939 // v8i16 -> v8i32
20940 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20941 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20942 // Concat upper and lower parts.
20943 //
20944 // v4i32 -> v4i64
20945 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20946 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20947 // Concat upper and lower parts.
20948 //
20949 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20950 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20951
20952 // Short-circuit if we can determine that each 128-bit half is the same value.
20953 // Otherwise, this is difficult to match and optimize.
20954 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20955 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20956 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20957
20958 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20959 SDValue Undef = DAG.getUNDEF(InVT);
20960 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20961 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20962 OpHi = DAG.getBitcast(HalfVT, OpHi);
20963
20964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20965}
20966
20967// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20968static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20969 const SDLoc &dl, SelectionDAG &DAG) {
20970 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20971 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20972 DAG.getVectorIdxConstant(0, dl));
20973 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20974 DAG.getVectorIdxConstant(8, dl));
20975 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20976 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20977 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20978 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20979}
20980
20981static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20982 const X86Subtarget &Subtarget,
20983 SelectionDAG &DAG) {
20984 MVT VT = Op->getSimpleValueType(0);
20985 SDValue In = Op->getOperand(0);
20986 MVT InVT = In.getSimpleValueType();
20987 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20988 unsigned NumElts = VT.getVectorNumElements();
20989
20990 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20991 // avoids a constant pool load.
20992 if (VT.getVectorElementType() != MVT::i8) {
20993 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20994 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20995 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20996 }
20997
20998 // Extend VT if BWI is not supported.
20999 MVT ExtVT = VT;
21000 if (!Subtarget.hasBWI()) {
21001 // If v16i32 is to be avoided, we'll need to split and concatenate.
21002 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21003 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21004
21005 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21006 }
21007
21008 // Widen to 512-bits if VLX is not supported.
21009 MVT WideVT = ExtVT;
21010 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21011 NumElts *= 512 / ExtVT.getSizeInBits();
21012 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21013 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21014 DAG.getVectorIdxConstant(0, DL));
21015 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21016 }
21017
21018 SDValue One = DAG.getConstant(1, DL, WideVT);
21019 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21020
21021 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21022
21023 // Truncate if we had to extend above.
21024 if (VT != ExtVT) {
21025 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21026 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21027 }
21028
21029 // Extract back to 128/256-bit if we widened.
21030 if (WideVT != VT)
21031 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21032 DAG.getVectorIdxConstant(0, DL));
21033
21034 return SelectedVal;
21035}
21036
21037static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21038 SelectionDAG &DAG) {
21039 SDValue In = Op.getOperand(0);
21040 MVT SVT = In.getSimpleValueType();
21041 SDLoc DL(Op);
21042
21043 if (SVT.getVectorElementType() == MVT::i1)
21044 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21045
21046 assert(Subtarget.hasAVX() && "Expected AVX support");
21047 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21048}
21049
21050/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21051/// It makes use of the fact that vectors with enough leading sign/zero bits
21052/// prevent the PACKSS/PACKUS from saturating the results.
21053/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21054/// within each 128-bit lane.
21055static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21056 const SDLoc &DL, SelectionDAG &DAG,
21057 const X86Subtarget &Subtarget) {
21058 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21059 "Unexpected PACK opcode");
21060 assert(DstVT.isVector() && "VT not a vector?");
21061
21062 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21063 if (!Subtarget.hasSSE2())
21064 return SDValue();
21065
21066 EVT SrcVT = In.getValueType();
21067
21068 // No truncation required, we might get here due to recursive calls.
21069 if (SrcVT == DstVT)
21070 return In;
21071
21072 unsigned NumElems = SrcVT.getVectorNumElements();
21073 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21074 return SDValue();
21075
21076 unsigned DstSizeInBits = DstVT.getSizeInBits();
21077 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21078 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21079 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21080
21081 LLVMContext &Ctx = *DAG.getContext();
21082 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21083 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21084
21085 // Pack to the largest type possible:
21086 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21087 EVT InVT = MVT::i16, OutVT = MVT::i8;
21088 if (SrcVT.getScalarSizeInBits() > 16 &&
21089 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21090 InVT = MVT::i32;
21091 OutVT = MVT::i16;
21092 }
21093
21094 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21095 // On pre-AVX512, pack the src in both halves to help value tracking.
21096 if (SrcSizeInBits <= 128) {
21097 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21098 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21099 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21100 SDValue LHS = DAG.getBitcast(InVT, In);
21101 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21103 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21104 Res = DAG.getBitcast(PackedVT, Res);
21105 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21106 }
21107
21108 // Split lower/upper subvectors.
21109 SDValue Lo, Hi;
21110 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21111
21112 // If Hi is undef, then don't bother packing it and widen the result instead.
21113 if (Hi.isUndef()) {
21114 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21115 if (SDValue Res =
21116 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21117 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21118 }
21119
21120 unsigned SubSizeInBits = SrcSizeInBits / 2;
21121 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21122 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21123
21124 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21125 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21126 Lo = DAG.getBitcast(InVT, Lo);
21127 Hi = DAG.getBitcast(InVT, Hi);
21128 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21129 return DAG.getBitcast(DstVT, Res);
21130 }
21131
21132 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21133 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21134 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21135 Lo = DAG.getBitcast(InVT, Lo);
21136 Hi = DAG.getBitcast(InVT, Hi);
21137 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21138
21139 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21140 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21141 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21142 SmallVector<int, 64> Mask;
21143 int Scale = 64 / OutVT.getScalarSizeInBits();
21144 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21145 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21146
21147 if (DstVT.is256BitVector())
21148 return DAG.getBitcast(DstVT, Res);
21149
21150 // If 512bit -> 128bit truncate another stage.
21151 Res = DAG.getBitcast(PackedVT, Res);
21152 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21153 }
21154
21155 // Recursively pack lower/upper subvectors, concat result and pack again.
21156 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21157
21158 if (PackedVT.is128BitVector()) {
21159 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21160 // type legalization.
21161 SDValue Res =
21162 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21163 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21164 }
21165
21166 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21167 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21168 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21169 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21170 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21171}
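// Illustrative sketch, not part of this file: per-element model of why
// truncateVectorWithPACK needs extended sign/zero bits; the helper name is
// hypothetical. PACKSSWB saturates i16 -> i8, so the pack only behaves as a
// plain truncate when the input already fits in the narrow type -- the
// property matchTruncateWithPACK checks via ComputeNumSignBits/KnownBits.
#include <cstdint>
static int8_t PackSSWBElt(int16_t X) {
  if (X < -128) return -128; // signed saturation
  if (X > 127) return 127;
  return (int8_t)X;          // in range: exact truncation
}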
21172
21173/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21174/// e.g. trunc <8 x i32> X to <8 x i16> -->
21175/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21176/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21177static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21178 const X86Subtarget &Subtarget,
21179 SelectionDAG &DAG) {
21180 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21181 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21182}
21183
21184/// Truncate using inreg sign extension and X86ISD::PACKSS.
21185static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21186 const X86Subtarget &Subtarget,
21187 SelectionDAG &DAG) {
21188 EVT SrcVT = In.getValueType();
21189 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21190 DAG.getValueType(DstVT));
21191 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21192}
21193
21194/// Helper to determine if \p In truncated to \p DstVT has the necessary
21195/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21196/// possibly by converting a SRL node to SRA for sign extension.
21197static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21198 SDValue In, const SDLoc &DL,
21199 SelectionDAG &DAG,
21200 const X86Subtarget &Subtarget,
21201 const SDNodeFlags Flags = SDNodeFlags()) {
21202 // Requires SSE2.
21203 if (!Subtarget.hasSSE2())
21204 return SDValue();
21205
21206 EVT SrcVT = In.getValueType();
21207 EVT DstSVT = DstVT.getVectorElementType();
21208 EVT SrcSVT = SrcVT.getVectorElementType();
21209 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21210 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21211
21212 // Check we have a truncation suited for PACKSS/PACKUS.
21213 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21214 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21215 return SDValue();
21216
21217 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21218 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21219
21220 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21221 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21222 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21223 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21224 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21225 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21226 return SDValue();
21227
21228 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21229 // split this for packing.
21230 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21231 !isFreeToSplitVector(In, DAG) &&
21232 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21233 return SDValue();
21234
21235 // Don't truncate AVX512 targets as multiple PACK node stages.
21236 if (Subtarget.hasAVX512() && NumStages > 1)
21237 return SDValue();
21238
21239 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21240 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21241
21242 // Truncate with PACKUS if we are truncating a vector with leading zero
21243 // bits that extend all the way to the packed/truncated value.
21244 // e.g. Masks, zext_in_reg, etc.
21245 // Pre-SSE41 we can only use PACKUSWB.
21246 KnownBits Known = DAG.computeKnownBits(In);
21247 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21248 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21249 PackOpcode = X86ISD::PACKUS;
21250 return In;
21251 }
21252
21253 // Truncate with PACKSS if we are truncating a vector with sign-bits
21254 // that extend all the way to the packed/truncated value.
21255 // e.g. Comparison result, sext_in_reg, etc.
21256 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21257
21258 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21259 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21260 // see through BITCASTs later on and combines/simplifications can't then use
21261 // it.
21262 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21263 !Subtarget.hasAVX512())
21264 return SDValue();
21265
21266 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21267 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21268 MinSignBits < NumSignBits) {
21269 PackOpcode = X86ISD::PACKSS;
21270 return In;
21271 }
21272
21273 // If we have a srl that only generates signbits that we will discard in
21274 // the truncation then we can use PACKSS by converting the srl to a sra.
21275 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21276 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21277 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21278 if (*ShAmt == MinSignBits) {
21279 PackOpcode = X86ISD::PACKSS;
21280 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21281 }
21282 }
21283
21284 return SDValue();
21285}
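
// Note on the PACK-based truncation above (illustrative, not from the
// original source): PACKSSWB/PACKSSDW saturate each signed source element to
// the narrower type, and PACKUSWB/PACKUSDW saturate to the unsigned range.
// When every element is already known to fit (enough sign bits for PACKSS,
// enough leading zero bits for PACKUS), the saturation never fires, so a
// single 128-bit pack of two inputs acts as a plain 2:1 truncation, e.g.
//   PACKSSDW(a, b) == truncate(concat(a, b)) : v8i32 -> v8i16
// whenever each i32 element of a and b has at least 17 sign bits.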
21286
21287/// This function lowers a vector truncation of 'extended sign-bits' or
21288/// 'extended zero-bits' values:
21289/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, using X86ISD::PACKSS/PACKUS operations.
21290static SDValue LowerTruncateVecPackWithSignBits(
21291 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21292 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21293 MVT SrcVT = In.getSimpleValueType();
21294 MVT DstSVT = DstVT.getVectorElementType();
21295 MVT SrcSVT = SrcVT.getVectorElementType();
21296 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21297 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21298 return SDValue();
21299
21300 // If the upper half of the source is undef, then attempt to split and
21301 // only truncate the lower half.
21302 if (DstVT.getSizeInBits() >= 128) {
21303 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21304 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21305 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21306 Subtarget, DAG))
21307 return widenSubVector(Res, false, Subtarget, DAG, DL,
21308 DstVT.getSizeInBits());
21309 }
21310 }
21311
21312 unsigned PackOpcode;
21313 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21314 Subtarget, Flags))
21315 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21316
21317 return SDValue();
21318}
21319
21320/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21321/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21322static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21323 const X86Subtarget &Subtarget,
21324 SelectionDAG &DAG) {
21325 MVT SrcVT = In.getSimpleValueType();
21326 MVT DstSVT = DstVT.getVectorElementType();
21327 MVT SrcSVT = SrcVT.getVectorElementType();
21328 unsigned NumElems = DstVT.getVectorNumElements();
21329 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21330 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21331 NumElems >= 8))
21332 return SDValue();
21333
21334 // SSSE3's pshufb results in fewer instructions in the cases below.
21335 if (Subtarget.hasSSSE3() && NumElems == 8) {
21336 if (SrcSVT == MVT::i16)
21337 return SDValue();
21338 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21339 return SDValue();
21340 }
21341
21342 // If the upper half of the source is undef, then attempt to split and
21343 // only truncate the lower half.
21344 if (DstVT.getSizeInBits() >= 128) {
21345 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21346 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21347 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21348 return widenSubVector(Res, false, Subtarget, DAG, DL,
21349 DstVT.getSizeInBits());
21350 }
21351 }
21352
21353 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21354 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21355 // truncate 2 x v4i32 to v8i16.
21356 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21357 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21358
21359 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21360 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21361
21362 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21363 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21364 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21365 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21366 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21367 }
21368
21369 return SDValue();
21370}
21371
21372static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21373 SelectionDAG &DAG,
21374 const X86Subtarget &Subtarget) {
21375 MVT VT = Op.getSimpleValueType();
21376 SDValue In = Op.getOperand(0);
21377 MVT InVT = In.getSimpleValueType();
21378 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21379
21380 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21381 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21382 if (InVT.getScalarSizeInBits() <= 16) {
21383 if (Subtarget.hasBWI()) {
21384 // legal, will go to VPMOVB2M, VPMOVW2M
21385 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21386 // We need to shift to get the lsb into sign position.
21387 // Shift packed bytes not supported natively, bitcast to word
21388 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21389 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21390 DAG.getBitcast(ExtVT, In),
21391 DAG.getConstant(ShiftInx, DL, ExtVT));
21392 In = DAG.getBitcast(InVT, In);
21393 }
21394 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21395 In, ISD::SETGT);
21396 }
21397 // Use TESTD/Q, extend the vector to packed dword/qword.
21398 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21399 "Unexpected vector type.");
21400 unsigned NumElts = InVT.getVectorNumElements();
21401 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21402 // We need to change to a wider element type that we have support for.
21403 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21404 // For 16 element vectors we extend to v16i32 unless we are explicitly
21405 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21406 // we need to split into two 8 element vectors which we can extend to v8i32,
21407 // truncate and concat the results. There's an additional complication if
21408 // the original type is v16i8. In that case we can't split the v16i8
21409 // directly, so we need to shuffle high elements to low and use
21410 // sign_extend_vector_inreg.
21411 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21412 SDValue Lo, Hi;
21413 if (InVT == MVT::v16i8) {
21414 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21415 Hi = DAG.getVectorShuffle(
21416 InVT, DL, In, In,
21417 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21418 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21419 } else {
21420 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21421 Lo = extract128BitVector(In, 0, DAG, DL);
21422 Hi = extract128BitVector(In, 8, DAG, DL);
21423 }
21424 // We're split now, just emit two truncates and a concat. The two
21425 // truncates will trigger legalization to come back to this function.
21426 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21427 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21428 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21429 }
21430 // We either have 8 elements or we're allowed to use 512-bit vectors.
21431 // If we have VLX, we want to use the narrowest vector that can get the
21432 // job done so we use vXi32.
21433 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21434 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21435 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21436 InVT = ExtVT;
21437 ShiftInx = InVT.getScalarSizeInBits() - 1;
21438 }
21439
21440 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21441 // We need to shift to get the lsb into sign position.
21442 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21443 DAG.getConstant(ShiftInx, DL, InVT));
21444 }
21445 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21446 if (Subtarget.hasDQI())
21447 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21448 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21449}
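
// Note (illustrative, not from the original source): truncation to vXi1 only
// keeps bit 0 of each element, so the code above shifts that bit into the
// sign position and then builds the mask with a signed compare against zero
// (0 > x), which instruction selection turns into VPMOVB2M/VPMOVW2M/
// VPMOVD2M/VPMOVQ2M or VPTESTM-style patterns depending on the available
// subtarget features.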
21450
21451SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21452 SDLoc DL(Op);
21453 MVT VT = Op.getSimpleValueType();
21454 SDValue In = Op.getOperand(0);
21455 MVT InVT = In.getSimpleValueType();
21457 "Invalid TRUNCATE operation");
21458
21459 // If we're called by the type legalizer, handle a few cases.
21460 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21461 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21462 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21463 VT.is128BitVector() && Subtarget.hasAVX512()) {
21464 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21465 "Unexpected subtarget!");
21466 // The default behavior is to truncate one step, concatenate, and then
21467 // truncate the remainder. We'd rather produce two 64-bit results and
21468 // concatenate those.
21469 SDValue Lo, Hi;
21470 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21471
21472 EVT LoVT, HiVT;
21473 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21474
21475 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21476 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21478 }
21479
21480 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21481 if (!Subtarget.hasAVX512() ||
21482 (InVT.is512BitVector() && VT.is256BitVector()))
21483 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21484 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21485 return SignPack;
21486
21487 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21488 if (!Subtarget.hasAVX512())
21489 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21490
21491 // Otherwise let default legalization handle it.
21492 return SDValue();
21493 }
21494
21495 if (VT.getVectorElementType() == MVT::i1)
21496 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21497
21498 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21499 // concat from subvectors to use VPTRUNC etc.
21500 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21501 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21502 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21503 return SignPack;
21504
21505 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21506 if (Subtarget.hasAVX512()) {
21507 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21508 assert(VT == MVT::v32i8 && "Unexpected VT!");
21509 return splitVectorIntUnary(Op, DAG, DL);
21510 }
21511
21512 // word to byte only under BWI. Otherwise we have to promote to v16i32
21513 // and then truncate that. But we should only do that if we haven't been
21514 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21515 // handled by isel patterns.
21516 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21517 Subtarget.canExtendTo512DQ())
21518 return Op;
21519 }
21520
21521 // Handle truncation of V256 to V128 using shuffles.
21522 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21523
21524 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21525 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21526 if (Subtarget.hasInt256()) {
21527 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21528 In = DAG.getBitcast(MVT::v8i32, In);
21529 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21530 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21531 DAG.getVectorIdxConstant(0, DL));
21532 }
21533
21534 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21535 DAG.getVectorIdxConstant(0, DL));
21536 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21537 DAG.getVectorIdxConstant(2, DL));
21538 static const int ShufMask[] = {0, 2, 4, 6};
21539 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21540 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21541 }
21542
21543 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21544 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21545 if (Subtarget.hasInt256()) {
21546 // The PSHUFB mask:
21547 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21548 -1, -1, -1, -1, -1, -1, -1, -1,
21549 16, 17, 20, 21, 24, 25, 28, 29,
21550 -1, -1, -1, -1, -1, -1, -1, -1 };
21551 In = DAG.getBitcast(MVT::v32i8, In);
21552 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21553 In = DAG.getBitcast(MVT::v4i64, In);
21554
21555 static const int ShufMask2[] = {0, 2, -1, -1};
21556 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21557 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21558 DAG.getVectorIdxConstant(0, DL));
21559 return DAG.getBitcast(MVT::v8i16, In);
21560 }
21561
21562 return Subtarget.hasSSE41()
21563 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21564 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21565 }
21566
21567 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21568 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21569
21570 llvm_unreachable("All 256->128 cases should have been handled above!");
21571}
21572
21573// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21574// behaves on out of range inputs to generate optimized conversions.
21575static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21576 SelectionDAG &DAG,
21577 const X86Subtarget &Subtarget) {
21578 MVT SrcVT = Src.getSimpleValueType();
21579 unsigned DstBits = VT.getScalarSizeInBits();
21580 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21581
21582 // Calculate the converted result for values in the range 0 to
21583 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21584 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21585 SDValue Big =
21586 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21587 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21588 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21589
21590 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21591 // and only if the value was out of range. So we can use that
21592 // as our indicator for whether to use "Big" instead of "Small".
21593 //
21594 // Use "Small" if "IsOverflown" has all bits cleared
21595 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21596
21597 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21598 // use the slightly slower blendv select instead.
21599 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21600 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21601 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21602 }
21603
21604 SDValue IsOverflown =
21605 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21606 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21607 return DAG.getNode(ISD::OR, dl, VT, Small,
21608 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21609}
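
// Worked example of the signsplat trick above (illustrative, not from the
// original source), for a single f32 lane x = 2^31 + 1024 = 2147484672.0f:
//   Small = cvttps2dq(x)         = 0x80000000 (out of signed i32 range)
//   Big   = cvttps2dq(x - 2^31)  = 0x00000400 (1024)
//   IsOverflown = Small >>s 31   = 0xFFFFFFFF
//   Small | (Big & IsOverflown)  = 0x80000400 = 2147484672  (correct)
// For an in-range lane (x < 2^31), Small is non-negative, IsOverflown is 0,
// and the result is simply Small.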
21610
21611SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21612 bool IsStrict = Op->isStrictFPOpcode();
21613 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21614 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21615 bool HasVLX = Subtarget.hasVLX();
21616 MVT VT = Op->getSimpleValueType(0);
21617 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21618 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21619 MVT SrcVT = Src.getSimpleValueType();
21620 SDLoc dl(Op);
21621
21622 SDValue Res;
21623 if (isSoftF16(SrcVT, Subtarget)) {
21624 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21625 if (IsStrict)
21626 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21628 {NVT, MVT::Other}, {Chain, Src})});
21629 return DAG.getNode(Op.getOpcode(), dl, VT,
21630 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21631 } else if (isTypeLegal(SrcVT) &&
21632 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21633 return Op;
21634 }
21635
21636 if (VT.isVector()) {
21637 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21638 MVT ResVT = MVT::v4i32;
21639 MVT TruncVT = MVT::v4i1;
21640 unsigned Opc;
21641 if (IsStrict)
21642 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21643 else
21644 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21645
21646 if (!IsSigned && !HasVLX) {
21647 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21648 // Widen to 512-bits.
21649 ResVT = MVT::v8i32;
21650 TruncVT = MVT::v8i1;
21651 Opc = Op.getOpcode();
21652 // Need to concat with zero vector for strict fp to avoid spurious
21653 // exceptions.
21654 // TODO: Should we just do this for non-strict as well?
21655 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21656 : DAG.getUNDEF(MVT::v8f64);
21657 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21658 DAG.getVectorIdxConstant(0, dl));
21659 }
21660 if (IsStrict) {
21661 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21662 Chain = Res.getValue(1);
21663 } else {
21664 Res = DAG.getNode(Opc, dl, ResVT, Src);
21665 }
21666
21667 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21669 DAG.getVectorIdxConstant(0, dl));
21670 if (IsStrict)
21671 return DAG.getMergeValues({Res, Chain}, dl);
21672 return Res;
21673 }
21674
21675 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21676 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21677 VT == MVT::v32i16)
21678 return Op;
21679
21680 MVT ResVT = VT;
21681 MVT EleVT = VT.getVectorElementType();
21682 if (EleVT != MVT::i64)
21683 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21684
21685 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21686 SDValue Tmp =
21687 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21688 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21689 Ops[0] = Src;
21690 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21691 }
21692
21693 if (!HasVLX) {
21694 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21695 // Widen to 512-bits.
21696 unsigned IntSize = EleVT.getSizeInBits();
21697 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21698 ResVT = MVT::getVectorVT(EleVT, Num);
21699 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21700 Subtarget, DAG, dl);
21701 }
21702
21703 if (IsStrict) {
21704 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21705 : X86ISD::STRICT_CVTTP2UI,
21706 dl, {ResVT, MVT::Other}, {Chain, Src});
21707 Chain = Res.getValue(1);
21708 } else {
21709 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21710 ResVT, Src);
21711 }
21712
21713 // TODO: Need to add exception check code for strict FP.
21714 if (EleVT.getSizeInBits() < 16) {
21715 if (HasVLX)
21716 ResVT = MVT::getVectorVT(EleVT, 8);
21717 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21718 }
21719
21720 if (ResVT != VT)
21721 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21722 DAG.getVectorIdxConstant(0, dl));
21723
21724 if (IsStrict)
21725 return DAG.getMergeValues({Res, Chain}, dl);
21726 return Res;
21727 }
21728
21729 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21730 if (VT.getVectorElementType() == MVT::i16) {
21731 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21732 SrcVT.getVectorElementType() == MVT::f64) &&
21733 "Expected f32/f64 vector!");
21734 MVT NVT = VT.changeVectorElementType(MVT::i32);
21735 if (IsStrict) {
21736 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21737 : ISD::STRICT_FP_TO_UINT,
21738 dl, {NVT, MVT::Other}, {Chain, Src});
21739 Chain = Res.getValue(1);
21740 } else {
21741 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21742 NVT, Src);
21743 }
21744
21745 // TODO: Need to add exception check code for strict FP.
21746 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21747
21748 if (IsStrict)
21749 return DAG.getMergeValues({Res, Chain}, dl);
21750 return Res;
21751 }
21752
21753 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21754 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21755 assert(!IsSigned && "Expected unsigned conversion!");
21756 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21757 return Op;
21758 }
21759
21760 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21761 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21762 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21763 Subtarget.useAVX512Regs()) {
21764 assert(!IsSigned && "Expected unsigned conversion!");
21765 assert(!Subtarget.hasVLX() && "Unexpected features!");
21766 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21767 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21768 // Need to concat with zero vector for strict fp to avoid spurious
21769 // exceptions.
21770 // TODO: Should we just do this for non-strict as well?
21771 SDValue Tmp =
21772 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21773 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21774 DAG.getVectorIdxConstant(0, dl));
21775
21776 if (IsStrict) {
21777 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21778 {Chain, Src});
21779 Chain = Res.getValue(1);
21780 } else {
21781 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21782 }
21783
21784 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21785 DAG.getVectorIdxConstant(0, dl));
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Res, Chain}, dl);
21789 return Res;
21790 }
21791
21792 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21793 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21794 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21795 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21796 assert(!Subtarget.hasVLX() && "Unexpected features!");
21797 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21798 // Need to concat with zero vector for strict fp to avoid spurious
21799 // exceptions.
21800 // TODO: Should we just do this for non-strict as well?
21801 SDValue Tmp =
21802 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21803 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21804 DAG.getVectorIdxConstant(0, dl));
21805
21806 if (IsStrict) {
21807 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21808 {Chain, Src});
21809 Chain = Res.getValue(1);
21810 } else {
21811 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21812 }
21813
21814 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21815 DAG.getVectorIdxConstant(0, dl));
21816
21817 if (IsStrict)
21818 return DAG.getMergeValues({Res, Chain}, dl);
21819 return Res;
21820 }
21821
21822 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21823 if (!Subtarget.hasVLX()) {
21824 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21825 // legalizer and then widened again by vector op legalization.
21826 if (!IsStrict)
21827 return SDValue();
21828
21829 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21830 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21831 {Src, Zero, Zero, Zero});
21832 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21833 {Chain, Tmp});
21834 SDValue Chain = Tmp.getValue(1);
21835 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21836 DAG.getVectorIdxConstant(0, dl));
21837 return DAG.getMergeValues({Tmp, Chain}, dl);
21838 }
21839
21840 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21841 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21842 DAG.getUNDEF(MVT::v2f32));
21843 if (IsStrict) {
21844 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21845 : X86ISD::STRICT_CVTTP2UI;
21846 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21847 }
21848 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21849 return DAG.getNode(Opc, dl, VT, Tmp);
21850 }
21851
21852 // Generate optimized instructions for pre AVX512 unsigned conversions from
21853 // vXf32 to vXi32.
21854 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21855 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21856 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21857 assert(!IsSigned && "Expected unsigned conversion!");
21858 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21859 }
21860
21861 return SDValue();
21862 }
21863
21864 assert(!VT.isVector());
21865
21866 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21867
21868 if (!IsSigned && UseSSEReg) {
21869 // Conversions from f32/f64 with AVX512 should be legal.
21870 if (Subtarget.hasAVX512())
21871 return Op;
21872
21873 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21874 // behaves on out of range inputs to generate optimized conversions.
21875 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21876 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21877 unsigned DstBits = VT.getScalarSizeInBits();
21878 APInt UIntLimit = APInt::getSignMask(DstBits);
21879 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21880 DAG.getConstant(UIntLimit, dl, VT));
21881 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21882
21883 // Calculate the converted result for values in the range:
21884 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21885 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21886 SDValue Small =
21887 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21888 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21889 SDValue Big = DAG.getNode(
21890 X86ISD::CVTTS2SI, dl, VT,
21891 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21892 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21893
21894 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21895 // and only if the value was out of range. So we can use that
21896 // as our indicator for whether to use "Big" instead of "Small".
21897 //
21898 // Use "Small" if "IsOverflown" has all bits cleared
21899 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21900 SDValue IsOverflown = DAG.getNode(
21901 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21902 return DAG.getNode(ISD::OR, dl, VT, Small,
21903 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21904 }
21905
21906 // Use default expansion for i64.
21907 if (VT == MVT::i64)
21908 return SDValue();
21909
21910 assert(VT == MVT::i32 && "Unexpected VT!");
21911
21912 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21913 // FIXME: This does not generate an invalid exception if the input does not
21914 // fit in i32. PR44019
21915 if (Subtarget.is64Bit()) {
21916 if (IsStrict) {
21917 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21918 {Chain, Src});
21919 Chain = Res.getValue(1);
21920 } else
21921 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21922
21923 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21924 if (IsStrict)
21925 return DAG.getMergeValues({Res, Chain}, dl);
21926 return Res;
21927 }
21928
21929 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21930 // use fisttp which will be handled later.
21931 if (!Subtarget.hasSSE3())
21932 return SDValue();
21933 }
21934
21935 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21936 // FIXME: This does not generate an invalid exception if the input does not
21937 // fit in i16. PR44019
21938 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21939 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21940 if (IsStrict) {
21941 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21942 {Chain, Src});
21943 Chain = Res.getValue(1);
21944 } else
21945 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21946
21947 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21948 if (IsStrict)
21949 return DAG.getMergeValues({Res, Chain}, dl);
21950 return Res;
21951 }
21952
21953 // If this is a FP_TO_SINT using SSEReg we're done.
21954 if (UseSSEReg && IsSigned)
21955 return Op;
21956
21957 // fp128 needs to use a libcall.
21958 if (SrcVT == MVT::f128) {
21959 RTLIB::Libcall LC;
21960 if (IsSigned)
21961 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21962 else
21963 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21964
21965 MakeLibCallOptions CallOptions;
21966 std::pair<SDValue, SDValue> Tmp =
21967 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21968
21969 if (IsStrict)
21970 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21971
21972 return Tmp.first;
21973 }
21974
21975 // Fall back to X87.
21976 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21977 if (IsStrict)
21978 return DAG.getMergeValues({V, Chain}, dl);
21979 return V;
21980 }
21981
21982 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21983}
21984
21985SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21986 SelectionDAG &DAG) const {
21987 SDValue Src = Op.getOperand(0);
21988 EVT DstVT = Op.getSimpleValueType();
21989 MVT SrcVT = Src.getSimpleValueType();
21990
21991 if (SrcVT.isVector())
21992 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21993
21994 if (SrcVT == MVT::f16)
21995 return SDValue();
21996
21997 // If the source is in an SSE register, the node is Legal.
21998 if (isScalarFPTypeInSSEReg(SrcVT))
21999 return Op;
22000
22001 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22002}
22003
22004SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22005 SelectionDAG &DAG) const {
22006 EVT DstVT = N->getValueType(0);
22007 SDValue Src = N->getOperand(0);
22008 EVT SrcVT = Src.getValueType();
22009
22010 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22011 // f16 must be promoted before using the lowering in this routine.
22012 // fp128 does not use this lowering.
22013 return SDValue();
22014 }
22015
22016 SDLoc DL(N);
22017 SDValue Chain = DAG.getEntryNode();
22018
22019 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22020
22021 // If we're converting from SSE, the stack slot needs to hold both types.
22022 // Otherwise it only needs to hold the DstVT.
22023 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22024 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22026 MachinePointerInfo MPI =
22027 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22028
22029 if (UseSSE) {
22030 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22031 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22032 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22033 SDValue Ops[] = { Chain, StackPtr };
22034
22035 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22036 /*Align*/ std::nullopt,
22037 MachineMemOperand::MOLoad);
22038 Chain = Src.getValue(1);
22039 }
22040
22041 SDValue StoreOps[] = { Chain, Src, StackPtr };
22042 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22043 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22044 MachineMemOperand::MOStore);
22045
22046 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22047}
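
// Note (illustrative, not from the original source): X86ISD::FIST emits an
// x87 fistp-style store, which converts using the x87 control word's current
// rounding mode, exactly the semantics lrint/llrint require. For an SSE
// source the value is first spilled to the stack slot and reloaded through
// X86ISD::FLD, since there is no direct SSE-to-x87 register move.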
22048
22049SDValue
22050X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22051 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22052 // but making use of X86 specifics to produce better instruction sequences.
22053 SDNode *Node = Op.getNode();
22054 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22055 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22056 SDLoc dl(SDValue(Node, 0));
22057 SDValue Src = Node->getOperand(0);
22058
22059 // There are three types involved here: SrcVT is the source floating point
22060 // type, DstVT is the type of the result, and TmpVT is the result of the
22061 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22062 // DstVT).
22063 EVT SrcVT = Src.getValueType();
22064 EVT DstVT = Node->getValueType(0);
22065 EVT TmpVT = DstVT;
22066
22067 // This code is only for floats and doubles. Fall back to generic code for
22068 // anything else.
22069 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22070 return SDValue();
22071
22072 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22073 unsigned SatWidth = SatVT.getScalarSizeInBits();
22074 unsigned DstWidth = DstVT.getScalarSizeInBits();
22075 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22076 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22077 "Expected saturation width smaller than result width");
22078
22079 // Promote result of FP_TO_*INT to at least 32 bits.
22080 if (TmpWidth < 32) {
22081 TmpVT = MVT::i32;
22082 TmpWidth = 32;
22083 }
22084
22085 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22086 // us to use a native signed conversion instead.
22087 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22088 TmpVT = MVT::i64;
22089 TmpWidth = 64;
22090 }
22091
22092 // If the saturation width is smaller than the size of the temporary result,
22093 // we can always use signed conversion, which is native.
22094 if (SatWidth < TmpWidth)
22095 FpToIntOpcode = ISD::FP_TO_SINT;
22096
22097 // Determine minimum and maximum integer values and their corresponding
22098 // floating-point values.
22099 APInt MinInt, MaxInt;
22100 if (IsSigned) {
22101 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22102 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22103 } else {
22104 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22105 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22106 }
22107
22108 const fltSemantics &Sem = SrcVT.getFltSemantics();
22109 APFloat MinFloat(Sem);
22110 APFloat MaxFloat(Sem);
22111
22112 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22113 MinInt, IsSigned, APFloat::rmTowardZero);
22114 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22115 MaxInt, IsSigned, APFloat::rmTowardZero);
22116 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22117 && !(MaxStatus & APFloat::opStatus::opInexact);
22118
22119 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22120 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22121
22122 // If the integer bounds are exactly representable as floats, emit a
22123 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22124 if (AreExactFloatBounds) {
22125 if (DstVT != TmpVT) {
22126 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22127 SDValue MinClamped = DAG.getNode(
22128 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22129 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22130 SDValue BothClamped = DAG.getNode(
22131 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22132 // Convert clamped value to integer.
22133 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22134
22135 // NaN will become INDVAL, with the top bit set and the rest zero.
22136 // Truncation will discard the top bit, resulting in zero.
22137 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22138 }
22139
22140 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22141 SDValue MinClamped = DAG.getNode(
22142 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22143 // Clamp by MaxFloat from above. NaN cannot occur.
22144 SDValue BothClamped = DAG.getNode(
22145 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22146 // Convert clamped value to integer.
22147 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22148
22149 if (!IsSigned) {
22150 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22151 // which is zero.
22152 return FpToInt;
22153 }
22154
22155 // Otherwise, select zero if Src is NaN.
22156 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22157 return DAG.getSelectCC(
22158 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22159 }
22160
22161 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22162 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22163
22164 // Result of direct conversion, which may be selected away.
22165 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22166
22167 if (DstVT != TmpVT) {
22168 // NaN will become INDVAL, with the top bit set and the rest zero.
22169 // Truncation will discard the top bit, resulting in zero.
22170 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22171 }
22172
22173 SDValue Select = FpToInt;
22174 // For signed conversions where we saturate to the same size as the
22175 // result type of the fptoi instructions, INDVAL coincides with integer
22176 // minimum, so we don't need to explicitly check it.
22177 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22178 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22179 // MinInt if Src is NaN.
22180 Select = DAG.getSelectCC(
22181 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22182 }
22183
22184 // If Src OGT MaxFloat, select MaxInt.
22185 Select = DAG.getSelectCC(
22186 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22187
22188 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22189 // is already zero. The promoted case was already handled above.
22190 if (!IsSigned || DstVT != TmpVT) {
22191 return Select;
22192 }
22193
22194 // Otherwise, select 0 if Src is NaN.
22195 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22196 return DAG.getSelectCC(
22197 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22198}
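
// Example of the exact-bounds path above (illustrative, not from the
// original source): @llvm.fptosi.sat.i8.f32(float %x) has bounds -128.0 and
// +127.0, both exactly representable, so it lowers to roughly
//   FMAX(-128.0, x), then FMIN(127.0, ...)  ; NaN propagates through both
//   FP_TO_SINT to i32 (cvttss2si)           ; NaN becomes 0x80000000
//   truncate to i8                          ; low byte is 0, matching the
//                                           ; NaN -> 0 requirement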
22199
22200SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22201 bool IsStrict = Op->isStrictFPOpcode();
22202
22203 SDLoc DL(Op);
22204 MVT VT = Op.getSimpleValueType();
22205 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22206 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22207 MVT SVT = In.getSimpleValueType();
22208
22209 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22210 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22211 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22212 !Subtarget.getTargetTriple().isOSDarwin()))
22213 return SDValue();
22214
22215 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22216 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22217 return Op;
22218
22219 if (SVT == MVT::f16) {
22220 if (Subtarget.hasFP16())
22221 return Op;
22222
22223 if (VT != MVT::f32) {
22224 if (IsStrict)
22225 return DAG.getNode(
22226 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22227 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22228 {MVT::f32, MVT::Other}, {Chain, In})});
22229
22230 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22231 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22232 }
22233
22234 if (!Subtarget.hasF16C()) {
22235 if (!Subtarget.getTargetTriple().isOSDarwin())
22236 return SDValue();
22237
22238 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22239
22240 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22241 TargetLowering::CallLoweringInfo CLI(DAG);
22242 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22243
22244 In = DAG.getBitcast(MVT::i16, In);
22245 TargetLowering::ArgListTy Args;
22246 TargetLowering::ArgListEntry Entry(
22247 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22248 Entry.IsSExt = false;
22249 Entry.IsZExt = true;
22250 Args.push_back(Entry);
22251
22252 SDValue Callee = DAG.getExternalSymbol(
22253 getLibcallName(RTLIB::FPEXT_F16_F32),
22254 getPointerTy(DAG.getDataLayout()));
22255 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22256 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22257 std::move(Args));
22258
22259 SDValue Res;
22260 std::tie(Res,Chain) = LowerCallTo(CLI);
22261 if (IsStrict)
22262 Res = DAG.getMergeValues({Res, Chain}, DL);
22263
22264 return Res;
22265 }
22266
22267 In = DAG.getBitcast(MVT::i16, In);
22268 SDValue Res;
22269 if (IsStrict) {
22270 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22271 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22272 DAG.getVectorIdxConstant(0, DL));
22273 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22274 {Chain, In});
22275 Chain = Res.getValue(1);
22276 } else {
22277 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22278 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22279 DAG.getUNDEF(MVT::v4i32), In,
22280 DAG.getVectorIdxConstant(0, DL));
22281 In = DAG.getBitcast(MVT::v8i16, In);
22282 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22283 DAG.getTargetConstant(4, DL, MVT::i32));
22284 }
22285 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22286 DAG.getVectorIdxConstant(0, DL));
22287 if (IsStrict)
22288 return DAG.getMergeValues({Res, Chain}, DL);
22289 return Res;
22290 }
22291
22292 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22293 return Op;
22294
22295 if (SVT.getVectorElementType() == MVT::f16) {
22296 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22297 return Op;
22298 assert(Subtarget.hasF16C() && "Unexpected features!");
22299 if (SVT == MVT::v2f16)
22300 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22301 DAG.getUNDEF(MVT::v2f16));
22302 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22303 DAG.getUNDEF(MVT::v4f16));
22304 if (IsStrict)
22305 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22306 {Op->getOperand(0), Res});
22307 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22308 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22309 return Op;
22310 }
22311
22312 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22313
22314 SDValue Res =
22315 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22316 if (IsStrict)
22317 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22318 {Op->getOperand(0), Res});
22319 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22320}
22321
22322SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22323 bool IsStrict = Op->isStrictFPOpcode();
22324
22325 SDLoc DL(Op);
22326 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22327 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22328 MVT VT = Op.getSimpleValueType();
22329 MVT SVT = In.getSimpleValueType();
22330
22331 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22332 return SDValue();
22333
22334 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22335 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22336 if (!Subtarget.getTargetTriple().isOSDarwin())
22337 return SDValue();
22338
22339 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22340 TargetLowering::CallLoweringInfo CLI(DAG);
22341 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22342
22343 TargetLowering::ArgListTy Args;
22344 TargetLowering::ArgListEntry Entry(
22345 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22346 Entry.IsSExt = false;
22347 Entry.IsZExt = true;
22348 Args.push_back(Entry);
22349
22350 SDValue Callee = DAG.getExternalSymbol(
22351 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22352 : RTLIB::FPROUND_F32_F16),
22353 getPointerTy(DAG.getDataLayout()));
22354 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22355 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22356 std::move(Args));
22357
22358 SDValue Res;
22359 std::tie(Res, Chain) = LowerCallTo(CLI);
22360
22361 Res = DAG.getBitcast(MVT::f16, Res);
22362
22363 if (IsStrict)
22364 Res = DAG.getMergeValues({Res, Chain}, DL);
22365
22366 return Res;
22367 }
22368
22369 if (VT.getScalarType() == MVT::bf16) {
22370 if (SVT.getScalarType() == MVT::f32 &&
22371 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22372 Subtarget.hasAVXNECONVERT()))
22373 return Op;
22374 return SDValue();
22375 }
22376
22377 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22378 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22379 return SDValue();
22380
22381 if (VT.isVector())
22382 return Op;
22383
22384 SDValue Res;
22385 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22386 MVT::i32);
22387 if (IsStrict) {
22388 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22389 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22390 DAG.getVectorIdxConstant(0, DL));
22391 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22392 {Chain, Res, Rnd});
22393 Chain = Res.getValue(1);
22394 } else {
22395 // FIXME: Should we use zeros for upper elements for non-strict?
22396 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22397 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22398 }
22399
22400 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22401 DAG.getVectorIdxConstant(0, DL));
22402 Res = DAG.getBitcast(MVT::f16, Res);
22403
22404 if (IsStrict)
22405 return DAG.getMergeValues({Res, Chain}, DL);
22406
22407 return Res;
22408 }
22409
22410 return Op;
22411}
22412
22413static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22414 bool IsStrict = Op->isStrictFPOpcode();
22415 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22416 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22417 "Unexpected VT!");
22418
22419 SDLoc dl(Op);
22420 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22421 DAG.getConstant(0, dl, MVT::v8i16), Src,
22422 DAG.getVectorIdxConstant(0, dl));
22423
22424 SDValue Chain;
22425 if (IsStrict) {
22426 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22427 {Op.getOperand(0), Res});
22428 Chain = Res.getValue(1);
22429 } else {
22430 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22431 }
22432
22433 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22434 DAG.getVectorIdxConstant(0, dl));
22435
22436 if (IsStrict)
22437 return DAG.getMergeValues({Res, Chain}, dl);
22438
22439 return Res;
22440}
22441
22442static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22443 bool IsStrict = Op->isStrictFPOpcode();
22444 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22445 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22446 "Unexpected VT!");
22447
22448 SDLoc dl(Op);
22449 SDValue Res, Chain;
22450 if (IsStrict) {
22451 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22452 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22453 DAG.getVectorIdxConstant(0, dl));
22454 Res = DAG.getNode(
22455 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22456 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22457 Chain = Res.getValue(1);
22458 } else {
22459 // FIXME: Should we use zeros for upper elements for non-strict?
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22461 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22462 DAG.getTargetConstant(4, dl, MVT::i32));
22463 }
22464
22465 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22466 DAG.getVectorIdxConstant(0, dl));
22467
22468 if (IsStrict)
22469 return DAG.getMergeValues({Res, Chain}, dl);
22470
22471 return Res;
22472}
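
// Note (illustrative, not from the original source): the immediate 4 passed
// to (STRICT_)CVTPS2PH sets bit 2 of the VCVTPS2PH rounding-control imm8,
// which tells the instruction to round using the current MXCSR rounding mode
// rather than a mode encoded in imm8[1:0].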
22473
22474SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22475 SelectionDAG &DAG) const {
22476 SDLoc DL(Op);
22477
22478 MVT SVT = Op.getOperand(0).getSimpleValueType();
22479 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22480 Subtarget.hasAVXNECONVERT())) {
22481 SDValue Res;
22482 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22483 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22484 Res = DAG.getBitcast(MVT::v8i16, Res);
22485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22486 DAG.getVectorIdxConstant(0, DL));
22487 }
22488
22489 MakeLibCallOptions CallOptions;
22490 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22491 SDValue Res =
22492 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22493 return DAG.getBitcast(MVT::i16, Res);
22494}
22495
22496/// Depending on uarch and/or optimizing for size, we might prefer to use a
22497/// vector operation in place of the typical scalar operation.
22498static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22499 SelectionDAG &DAG,
22500 const X86Subtarget &Subtarget) {
22501 // If both operands have other uses, this is probably not profitable.
22502 SDValue LHS = Op.getOperand(0);
22503 SDValue RHS = Op.getOperand(1);
22504 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22505 return Op;
22506
22507 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22508 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22509 if (IsFP && !Subtarget.hasSSE3())
22510 return Op;
22511 if (!IsFP && !Subtarget.hasSSSE3())
22512 return Op;
22513
22514 // Extract from a common vector.
22515 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22517 LHS.getOperand(0) != RHS.getOperand(0) ||
22518 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22519 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22520 !shouldUseHorizontalOp(true, DAG, Subtarget))
22521 return Op;
22522
22523 // Allow commuted 'hadd' ops.
22524 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22525 unsigned HOpcode;
22526 switch (Op.getOpcode()) {
22527 // clang-format off
22528 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22529 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22530 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22531 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22532 default:
22533 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22534 // clang-format on
22535 }
22536 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22537 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22538 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22539 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22540 std::swap(LExtIndex, RExtIndex);
22541
22542 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22543 return Op;
22544
22545 SDValue X = LHS.getOperand(0);
22546 EVT VecVT = X.getValueType();
22547 unsigned BitWidth = VecVT.getSizeInBits();
22548 unsigned NumLanes = BitWidth / 128;
22549 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22550 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22551 "Not expecting illegal vector widths here");
22552
22553 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22554 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22555 if (BitWidth == 256 || BitWidth == 512) {
22556 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22557 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22558 LExtIndex %= NumEltsPerLane;
22559 }
22560
22561 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22563 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22564 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22565 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22567 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22568}
22569
22570/// Depending on uarch and/or optimizing for size, we might prefer to use a
22571/// vector operation in place of the typical scalar operation.
22572SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22573 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22574 "Only expecting float/double");
22575 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22576}
22577
22578/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22579/// This mode isn't supported in hardware on X86. But as long as we aren't
22580/// compiling with trapping math, we can emulate this with
22581/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22582static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22583 SDValue N0 = Op.getOperand(0);
22584 SDLoc dl(Op);
22585 MVT VT = Op.getSimpleValueType();
22586
22587 // N0 += copysign(nextafter(0.5, 0.0), N0)
22588 const fltSemantics &Sem = VT.getFltSemantics();
22589 bool Ignored;
22590 APFloat Point5Pred = APFloat(0.5f);
22591 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22592 Point5Pred.next(/*nextDown*/true);
22593
22594 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22595 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22596 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22597
22598 // Truncate the result to remove fraction.
22599 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22600}
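
// Worked example for the nextafter(0.5, 0.0) trick above (illustrative, not
// from the original source): for x = 0.49999997f (the largest float below
// 0.5), x + 0.5f rounds up to exactly 1.0f, so trunc would give 1.0 instead
// of the correct FROUND result 0.0. Adding 0.49999997f instead yields
// 0.99999994f, and trunc correctly produces 0.0. Ties such as 2.5 still
// round away from zero: 2.5 + 0.49999997 rounds to 3.0, trunc gives 3.0.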
22601
22602/// The only differences between FABS and FNEG are the mask and the logic op.
22603/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22604static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22605 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22606 "Wrong opcode for lowering FABS or FNEG.");
22607
22608 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22609
22610 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22611 // into an FNABS. We'll lower the FABS after that if it is still in use.
22612 if (IsFABS)
22613 for (SDNode *User : Op->users())
22614 if (User->getOpcode() == ISD::FNEG)
22615 return Op;
22616
22617 SDLoc dl(Op);
22618 MVT VT = Op.getSimpleValueType();
22619
22620 bool IsF128 = (VT == MVT::f128);
22621 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22623 "Unexpected type in LowerFABSorFNEG");
22624
22625 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22626 // decide if we should generate a 16-byte constant mask when we only need 4 or
22627 // 8 bytes for the scalar case.
22628
22629 // There are no scalar bitwise logical SSE/AVX instructions, so we
22630 // generate a 16-byte vector constant and logic op even for the scalar case.
22631 // Using a 16-byte mask allows folding the load of the mask with
22632 // the logic op, so it can save (~4 bytes) on code size.
22633 bool IsFakeVector = !VT.isVector() && !IsF128;
22634 MVT LogicVT = VT;
22635 if (IsFakeVector)
22636 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22637 : (VT == MVT::f32) ? MVT::v4f32
22638 : MVT::v8f16;
22639
22640 unsigned EltBits = VT.getScalarSizeInBits();
22641 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22642 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22643 APInt::getSignMask(EltBits);
22644 const fltSemantics &Sem = VT.getFltSemantics();
22645 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22646
22647 SDValue Op0 = Op.getOperand(0);
22648 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22649 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22650 IsFNABS ? X86ISD::FOR :
22651 X86ISD::FXOR;
22652 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22653
22654 if (VT.isVector() || IsF128)
22655 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22656
22657 // For the scalar case extend to a 128-bit vector, perform the logic op,
22658 // and extract the scalar result back out.
22659 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22660 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22662 DAG.getVectorIdxConstant(0, dl));
22663}
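
// Summary of the masks above (illustrative, not from the original source),
// for f32: FABS is an ANDPS with a splat of 0x7FFFFFFF, FNEG is an XORPS
// with a splat of 0x80000000, and the folded FNEG(FABS(x)) becomes an ORPS
// with the 0x80000000 splat.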
22664
22665static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22666 SDValue Mag = Op.getOperand(0);
22667 SDValue Sign = Op.getOperand(1);
22668 SDLoc dl(Op);
22669
22670 // If the sign operand is smaller, extend it first.
22671 MVT VT = Op.getSimpleValueType();
22672 if (Sign.getSimpleValueType().bitsLT(VT))
22673 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22674
22675 // And if it is bigger, shrink it first.
22676 if (Sign.getSimpleValueType().bitsGT(VT))
22677 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22678 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22679
22680 // At this point the operands and the result should have the same
22681 // type, and that won't be f80 since that is not custom lowered.
22682 bool IsF128 = (VT == MVT::f128);
22683 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22685 "Unexpected type in LowerFCOPYSIGN");
22686
22687 const fltSemantics &Sem = VT.getFltSemantics();
22688
22689 // Perform all scalar logic operations as 16-byte vectors because there are no
22690 // scalar FP logic instructions in SSE.
22691 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22692 // unnecessary splats, but we might miss load folding opportunities. Should
22693 // this decision be based on OptimizeForSize?
22694 bool IsFakeVector = !VT.isVector() && !IsF128;
22695 MVT LogicVT = VT;
22696 if (IsFakeVector)
22697 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22698 : (VT == MVT::f32) ? MVT::v4f32
22699 : MVT::v8f16;
22700
22701 // The mask constants are automatically splatted for vector types.
22702 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22703 SDValue SignMask = DAG.getConstantFP(
22704 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22705 SDValue MagMask = DAG.getConstantFP(
22706 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22707
22708 // First, clear all bits but the sign bit from the second operand (sign).
22709 if (IsFakeVector)
22710 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22711 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22712
22713 // Next, clear the sign bit from the first operand (magnitude).
22714 // TODO: If we had general constant folding for FP logic ops, this check
22715 // wouldn't be necessary.
22716 SDValue MagBits;
22717 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22718 APFloat APF = Op0CN->getValueAPF();
22719 APF.clearSign();
22720 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22721 } else {
22722 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22723 if (IsFakeVector)
22724 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22725 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22726 }
22727
22728 // OR the magnitude value with the sign bit.
22729 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22730 return !IsFakeVector ? Or
22731 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22732 DAG.getVectorIdxConstant(0, dl));
22733}
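// Illustrative sketch (not from this file): the AND/AND/OR sequence above on a
// scalar IEEE-754 double (assumes C++20 std::bit_cast). The sign operand is
// reduced to its sign bit, the magnitude operand has its sign bit cleared, and
// the two are OR'd back together.
#include <bit>
#include <cstdint>

inline double copysignViaMasks(double Mag, double Sign) {
  constexpr uint64_t SignMask = 0x8000000000000000ULL;
  uint64_t SignBit = std::bit_cast<uint64_t>(Sign) & SignMask; // FAND SignMask
  uint64_t MagBits = std::bit_cast<uint64_t>(Mag) & ~SignMask; // FAND MagMask
  return std::bit_cast<double>(MagBits | SignBit);             // FOR
}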
22734
22735static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22736 SDValue N0 = Op.getOperand(0);
22737 SDLoc dl(Op);
22738 MVT VT = Op.getSimpleValueType();
22739
22740 MVT OpVT = N0.getSimpleValueType();
22741 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22742 "Unexpected type for FGETSIGN");
22743
22744 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22745 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22746 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22747 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22748 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22749 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22750 return Res;
22751}
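// Illustrative sketch (not from this file): MOVMSK packs the per-element sign
// bits into a scalar, so FGETSIGN reduces to "sign bit of element 0". Scalar
// equivalent (assumes C++20 std::bit_cast):
#include <bit>
#include <cstdint>

inline int fgetsignScalar(double X) {
  return int(std::bit_cast<uint64_t>(X) >> 63) & 1; // AND(MOVMSK(...), 1)
}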
22752
22753/// Helper for attempting to create a X86ISD::BT node.
22754static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22755 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22756 // instruction. Since the shift amount is in-range-or-undefined, we know
22757 // that doing a bittest on the i32 value is ok. We extend to i32 because
22758 // the encoding for the i16 version is larger than the i32 version.
22759  // Also promote i16 to i32 for performance / code size reasons.
22760 if (Src.getValueType().getScalarSizeInBits() < 32)
22761 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22762
22763 // No legal type found, give up.
22764 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22765 return SDValue();
22766
22767 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22768 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22769 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22770 // known to be zero.
22771 if (Src.getValueType() == MVT::i64 &&
22772 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22773 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22774
22775 // If the operand types disagree, extend the shift amount to match. Since
22776 // BT ignores high bits (like shifts) we can use anyextend.
22777 if (Src.getValueType() != BitNo.getValueType()) {
22778 // Peek through a mask/modulo operation.
22779 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22780 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22781 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22782 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22783 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22784 BitNo.getOperand(0)),
22785 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22786 BitNo.getOperand(1)));
22787 else
22788 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22789 }
22790
22791 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22792}
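// Illustrative sketch (not from this file): what the register form of BT
// computes. The bit index is reduced modulo the operand width, which is why an
// any-extended or truncated index is acceptable whenever the dropped bits are
// known not to matter.
#include <cstdint>

inline bool bitTest32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo & 31)) & 1; // CF after "bt r32, r32"
}
inline bool bitTest64(uint64_t Src, uint64_t BitNo) {
  return (Src >> (BitNo & 63)) & 1; // CF after "bt r64, r64"
}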
22793
22794/// Helper for creating a X86ISD::SETCC node.
22795static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22796 SelectionDAG &DAG) {
22797 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22798 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22799}
22800
22801/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22802/// recognizable memcmp expansion.
22803static bool isOrXorXorTree(SDValue X, bool Root = true) {
22804 if (X.getOpcode() == ISD::OR)
22805 return isOrXorXorTree(X.getOperand(0), false) &&
22806 isOrXorXorTree(X.getOperand(1), false);
22807 if (Root)
22808 return false;
22809 return X.getOpcode() == ISD::XOR;
22810}
22811
22812/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22813/// expansion.
22814template <typename F>
22815static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22816 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22817 SDValue Op0 = X.getOperand(0);
22818 SDValue Op1 = X.getOperand(1);
22819 if (X.getOpcode() == ISD::OR) {
22820 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22822 if (VecVT != CmpVT)
22823 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22824 if (HasPT)
22825 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22826 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22827 }
22828 if (X.getOpcode() == ISD::XOR) {
22829 SDValue A = SToV(Op0);
22830 SDValue B = SToV(Op1);
22831 if (VecVT != CmpVT)
22832 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22833 if (HasPT)
22834 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22835 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22836 }
22837 llvm_unreachable("Impossible");
22838}
22839
22840/// Try to map a 128-bit or larger integer comparison to vector instructions
22841/// before type legalization splits it up into chunks.
22842static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22843 ISD::CondCode CC,
22844 const SDLoc &DL,
22845 SelectionDAG &DAG,
22846 const X86Subtarget &Subtarget) {
22847 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22848
22849 // We're looking for an oversized integer equality comparison.
22850 EVT OpVT = X.getValueType();
22851 unsigned OpSize = OpVT.getSizeInBits();
22852 if (!OpVT.isScalarInteger() || OpSize < 128)
22853 return SDValue();
22854
22855 // Ignore a comparison with zero because that gets special treatment in
22856 // EmitTest(). But make an exception for the special case of a pair of
22857 // logically-combined vector-sized operands compared to zero. This pattern may
22858 // be generated by the memcmp expansion pass with oversized integer compares
22859 // (see PR33325).
22860 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22861 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22862 return SDValue();
22863
22864 // Don't perform this combine if constructing the vector will be expensive.
22865  auto IsVectorBitCastCheap = [](SDValue X) {
22866    X = peekThroughBitcasts(X);
22867 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22868 X.getOpcode() == ISD::LOAD;
22869 };
22870 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22871 !IsOrXorXorTreeCCZero)
22872 return SDValue();
22873
22874 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22875 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22876 // Otherwise use PCMPEQ (plus AND) and mask testing.
22877  bool NoImplicitFloatOps =
22878      DAG.getMachineFunction().getFunction().hasFnAttribute(
22879 Attribute::NoImplicitFloat);
22880 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22881 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22882 (OpSize == 256 && Subtarget.hasAVX()) ||
22883 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22884 bool HasPT = Subtarget.hasSSE41();
22885
22886 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22887 // vector registers are essentially free. (Technically, widening registers
22888 // prevents load folding, but the tradeoff is worth it.)
22889 bool PreferKOT = Subtarget.preferMaskRegisters();
22890 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22891
22892 EVT VecVT = MVT::v16i8;
22893 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22894 if (OpSize == 256) {
22895 VecVT = MVT::v32i8;
22896 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22897 }
22898 EVT CastVT = VecVT;
22899 bool NeedsAVX512FCast = false;
22900 if (OpSize == 512 || NeedZExt) {
22901 if (Subtarget.hasBWI()) {
22902 VecVT = MVT::v64i8;
22903 CmpVT = MVT::v64i1;
22904 if (OpSize == 512)
22905 CastVT = VecVT;
22906 } else {
22907 VecVT = MVT::v16i32;
22908 CmpVT = MVT::v16i1;
22909 CastVT = OpSize == 512 ? VecVT
22910 : OpSize == 256 ? MVT::v8i32
22911 : MVT::v4i32;
22912 NeedsAVX512FCast = true;
22913 }
22914 }
22915
22916 auto ScalarToVector = [&](SDValue X) -> SDValue {
22917 bool TmpZext = false;
22918 EVT TmpCastVT = CastVT;
22919 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22920 SDValue OrigX = X.getOperand(0);
22921 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22922 if (OrigSize < OpSize) {
22923 if (OrigSize == 128) {
22924 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22925 X = OrigX;
22926 TmpZext = true;
22927 } else if (OrigSize == 256) {
22928 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22929 X = OrigX;
22930 TmpZext = true;
22931 }
22932 }
22933 }
22934 X = DAG.getBitcast(TmpCastVT, X);
22935 if (!NeedZExt && !TmpZext)
22936 return X;
22937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22938 DAG.getConstant(0, DL, VecVT), X,
22939 DAG.getVectorIdxConstant(0, DL));
22940 };
22941
22942 SDValue Cmp;
22943 if (IsOrXorXorTreeCCZero) {
22944 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22945 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22946 // Use 2 vector equality compares and 'and' the results before doing a
22947 // MOVMSK.
22948 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22949 } else {
22950 SDValue VecX = ScalarToVector(X);
22951 SDValue VecY = ScalarToVector(Y);
22952 if (VecVT != CmpVT) {
22953 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22954 } else if (HasPT) {
22955 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22956 } else {
22957 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22958 }
22959 }
22960 // AVX512 should emit a setcc that will lower to kortest.
22961 if (VecVT != CmpVT) {
22962 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22963 : CmpVT == MVT::v32i1 ? MVT::i32
22964 : MVT::i16;
22965 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22966 DAG.getConstant(0, DL, KRegVT), CC);
22967 }
22968 if (HasPT) {
22969 SDValue BCCmp =
22970 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22971      SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22972      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22973 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22974 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22975 }
22976 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22977 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22978 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22979 assert(Cmp.getValueType() == MVT::v16i8 &&
22980 "Non 128-bit vector on pre-SSE41 target");
22981 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22982 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22983 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22984 }
22985
22986 return SDValue();
22987}
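// Illustrative sketch (not from this file): the scalar identity behind the
// combine above. An oversized integer equality test reduces to "do all XOR'd
// chunks OR together to zero", which is exactly what one PTEST (or a
// PCMPEQB+PMOVMSKB pair) answers for a whole vector at once.
#include <cstdint>

inline bool equal128(const uint64_t A[2], const uint64_t B[2]) {
  // setcc i128 X, Y, eq  <=>  ((A0 ^ B0) | (A1 ^ B1)) == 0
  return ((A[0] ^ B[0]) | (A[1] ^ B[1])) == 0;
}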
22988
22989/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22990/// style scalarized (associative) reduction patterns. Partial reductions
22991/// are supported when the pointer SrcMask is non-null.
22992/// TODO - move this to SelectionDAG?
22993static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22994                                 SmallVectorImpl<SDValue> &SrcOps,
22995                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22996  SmallVector<SDValue, 8> Opnds;
22997 DenseMap<SDValue, APInt> SrcOpMap;
22998 EVT VT = MVT::Other;
22999
23000  // Recognize a special case where a vector is cast into a wide integer to
23001 // test all 0s.
23002 assert(Op.getOpcode() == unsigned(BinOp) &&
23003 "Unexpected bit reduction opcode");
23004 Opnds.push_back(Op.getOperand(0));
23005 Opnds.push_back(Op.getOperand(1));
23006
23007  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23008    SDValue I = Opnds[Slot];
23009 // BFS traverse all BinOp operands.
23010 if (I->getOpcode() == unsigned(BinOp)) {
23011 Opnds.push_back(I->getOperand(0));
23012 Opnds.push_back(I->getOperand(1));
23013 // Re-evaluate the number of nodes to be traversed.
23014 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23015 continue;
23016 }
23017
23018 // Quit if a non-EXTRACT_VECTOR_ELT
23019 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23020 return false;
23021
23022 // Quit if without a constant index.
23023 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23024 if (!Idx)
23025 return false;
23026
23027 SDValue Src = I->getOperand(0);
23028 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23029 if (M == SrcOpMap.end()) {
23030 VT = Src.getValueType();
23031 // Quit if not the same type.
23032 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23033 return false;
23034 unsigned NumElts = VT.getVectorNumElements();
23035 APInt EltCount = APInt::getZero(NumElts);
23036 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23037 SrcOps.push_back(Src);
23038 }
23039
23040 // Quit if element already used.
23041 unsigned CIdx = Idx->getZExtValue();
23042 if (M->second[CIdx])
23043 return false;
23044 M->second.setBit(CIdx);
23045 }
23046
23047 if (SrcMask) {
23048 // Collect the source partial masks.
23049 for (SDValue &SrcOp : SrcOps)
23050 SrcMask->push_back(SrcOpMap[SrcOp]);
23051 } else {
23052 // Quit if not all elements are used.
23053 for (const auto &I : SrcOpMap)
23054 if (!I.second.isAllOnes())
23055 return false;
23056 }
23057
23058 return true;
23059}
23060
23061// Helper function for comparing all bits of two vectors.
23062static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23063 ISD::CondCode CC, const APInt &OriginalMask,
23064 const X86Subtarget &Subtarget,
23065 SelectionDAG &DAG, X86::CondCode &X86CC) {
23066 EVT VT = LHS.getValueType();
23067 unsigned ScalarSize = VT.getScalarSizeInBits();
23068 if (OriginalMask.getBitWidth() != ScalarSize) {
23069 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23070 return SDValue();
23071 }
23072
23073  // Quit if not convertible to legal scalar or 128/256-bit vector.
23074  if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23075 return SDValue();
23076
23077 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23078 if (VT.isFloatingPoint())
23079 return SDValue();
23080
23081 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23082 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23083
23084 APInt Mask = OriginalMask;
23085
23086 auto MaskBits = [&](SDValue Src) {
23087 if (Mask.isAllOnes())
23088 return Src;
23089 EVT SrcVT = Src.getValueType();
23090 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23091 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23092 };
23093
23094 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23095 if (VT.getSizeInBits() < 128) {
23096 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23097 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23098 if (IntVT != MVT::i64)
23099 return SDValue();
23100 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23101 MVT::i32, MVT::i32);
23102 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23103 MVT::i32, MVT::i32);
23104 SDValue Lo =
23105 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23106 SDValue Hi =
23107 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23108 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23109 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23110 DAG.getConstant(0, DL, MVT::i32));
23111 }
23112 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23113 DAG.getBitcast(IntVT, MaskBits(LHS)),
23114 DAG.getBitcast(IntVT, MaskBits(RHS)));
23115 }
23116
23117 // Without PTEST, a masked v2i64 or-reduction is not faster than
23118 // scalarization.
23119 bool UseKORTEST = Subtarget.useAVX512Regs();
23120 bool UsePTEST = Subtarget.hasSSE41();
23121 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23122 return SDValue();
23123
23124 // Split down to 128/256/512-bit vector.
23125 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23126
23127 // If the input vector has vector elements wider than the target test size,
23128 // then cast to <X x i64> so it will safely split.
23129 if (ScalarSize > TestSize) {
23130 if (!Mask.isAllOnes())
23131 return SDValue();
23132 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23133 LHS = DAG.getBitcast(VT, LHS);
23134 RHS = DAG.getBitcast(VT, RHS);
23135 Mask = APInt::getAllOnes(64);
23136 }
23137
23138 if (VT.getSizeInBits() > TestSize) {
23139 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23140 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23141 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23142 while (VT.getSizeInBits() > TestSize) {
23143 auto Split = DAG.SplitVector(LHS, DL);
23144 VT = Split.first.getValueType();
23145 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23146 }
23147 RHS = DAG.getAllOnesConstant(DL, VT);
23148 } else if (!UsePTEST && !KnownRHS.isZero()) {
23149 // MOVMSK Special Case:
23150 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23151 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23152 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23153 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23154 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23155 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23156 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23157 V = DAG.getSExtOrTrunc(V, DL, VT);
23158 while (VT.getSizeInBits() > TestSize) {
23159 auto Split = DAG.SplitVector(V, DL);
23160 VT = Split.first.getValueType();
23161 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23162 }
23163 V = DAG.getNOT(DL, V, VT);
23164 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23165 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23166 DAG.getConstant(0, DL, MVT::i32));
23167 } else {
23168 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23169 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23170 while (VT.getSizeInBits() > TestSize) {
23171 auto Split = DAG.SplitVector(V, DL);
23172 VT = Split.first.getValueType();
23173 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23174 }
23175 LHS = V;
23176 RHS = DAG.getConstant(0, DL, VT);
23177 }
23178 }
23179
23180 if (UseKORTEST && VT.is512BitVector()) {
23181 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23182 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23183 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23184 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23185 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23186 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23187 }
23188
23189 if (UsePTEST) {
23190 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23191 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23192 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23193 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23194 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23195 }
23196
23197 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23198 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23199 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23200 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23201 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23202 V = DAG.getNOT(DL, V, MaskVT);
23203 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23204 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23205 DAG.getConstant(0, DL, MVT::i32));
23206}
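// Illustrative sketch (not from this file): the pre-SSE41 tail of the helper
// above, in scalar form. PCMPEQB writes 0xFF per equal byte and PMOVMSKB
// gathers one bit per byte, so "all 16 bytes equal" becomes "mask == 0xFFFF".
#include <cstdint>

inline bool allBytesEqual16(const uint8_t X[16], const uint8_t Y[16]) {
  unsigned Mask = 0;
  for (int I = 0; I != 16; ++I)
    Mask |= unsigned(X[I] == Y[I]) << I; // pcmpeqb + pmovmskb
  return Mask == 0xFFFF;
}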
23207
23208// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
23209// to CMP(MOVMSK(PCMPEQB(X,Y))).
23211 ISD::CondCode CC, const SDLoc &DL,
23212 const X86Subtarget &Subtarget,
23213 SelectionDAG &DAG,
23214 X86::CondCode &X86CC) {
23215 SDValue Op = OrigLHS;
23216
23217 bool CmpNull;
23218 APInt Mask;
23219 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23220 CmpNull = isNullConstant(OrigRHS);
23221 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23222 return SDValue();
23223
23224 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23225 return SDValue();
23226
23227 // Check whether we're masking/truncating an OR-reduction result, in which
23228 // case track the masked bits.
23229 // TODO: Add CmpAllOnes support.
23230 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23231 if (CmpNull) {
23232 switch (Op.getOpcode()) {
23233 case ISD::TRUNCATE: {
23234 SDValue Src = Op.getOperand(0);
23235 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23236 Op.getScalarValueSizeInBits());
23237 Op = Src;
23238 break;
23239 }
23240 case ISD::AND: {
23241 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23242 Mask = Cst->getAPIntValue();
23243 Op = Op.getOperand(0);
23244 }
23245 break;
23246 }
23247 }
23248 }
23249 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23250 CC = ISD::SETEQ;
23251 CmpNull = true;
23252 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23253 } else {
23254 return SDValue();
23255 }
23256
23257 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23258
23259 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23260  // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23261  SmallVector<SDValue, 8> VecIns;
23262 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23263 EVT VT = VecIns[0].getValueType();
23264 assert(llvm::all_of(VecIns,
23265 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23266 "Reduction source vector mismatch");
23267
23268    // Quit if not splittable to scalar/128/256/512-bit vector.
23269    if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23270 return SDValue();
23271
23272 // If more than one full vector is evaluated, AND/OR them first before
23273 // PTEST.
23274 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23275 Slot += 2, e += 1) {
23276 // Each iteration will AND/OR 2 nodes and append the result until there is
23277 // only 1 node left, i.e. the final value of all vectors.
23278 SDValue LHS = VecIns[Slot];
23279 SDValue RHS = VecIns[Slot + 1];
23280 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23281 }
23282
23283 return LowerVectorAllEqual(DL, VecIns.back(),
23284 CmpNull ? DAG.getConstant(0, DL, VT)
23285 : DAG.getAllOnesConstant(DL, VT),
23286 CC, Mask, Subtarget, DAG, X86CC);
23287 }
23288
23289 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23290 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23291 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23292 ISD::NodeType BinOp;
23293 if (SDValue Match =
23294 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23295 EVT MatchVT = Match.getValueType();
23296 return LowerVectorAllEqual(DL, Match,
23297 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23298 : DAG.getAllOnesConstant(DL, MatchVT),
23299 CC, Mask, Subtarget, DAG, X86CC);
23300 }
23301 }
23302
23303 if (Mask.isAllOnes()) {
23304 assert(!Op.getValueType().isVector() &&
23305           "Illegal vector type for reduction pattern");
23306    SDValue Src = peekThroughBitcasts(Op);
23307 if (Src.getValueType().isFixedLengthVector() &&
23308 Src.getValueType().getScalarType() == MVT::i1) {
23309 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23310 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23311 if (Src.getOpcode() == ISD::SETCC) {
23312 SDValue LHS = Src.getOperand(0);
23313 SDValue RHS = Src.getOperand(1);
23314 EVT LHSVT = LHS.getValueType();
23315 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23316 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23318 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23319 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23320 X86CC);
23321 }
23322 }
23323 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23324 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23325 // Peek through truncation, mask the LSB and compare against zero/LSB.
23326 if (Src.getOpcode() == ISD::TRUNCATE) {
23327 SDValue Inner = Src.getOperand(0);
23328 EVT InnerVT = Inner.getValueType();
23330 unsigned BW = InnerVT.getScalarSizeInBits();
23331 APInt SrcMask = APInt(BW, 1);
23332 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23333 return LowerVectorAllEqual(DL, Inner,
23334 DAG.getConstant(Cmp, DL, InnerVT), CC,
23335 SrcMask, Subtarget, DAG, X86CC);
23336 }
23337 }
23338 }
23339 }
23340
23341 return SDValue();
23342}
23343
23344/// return true if \c Op has a use that doesn't just read flags.
23345static bool hasNonFlagsUse(SDValue Op) {
23346 for (SDUse &Use : Op->uses()) {
23347 SDNode *User = Use.getUser();
23348 unsigned UOpNo = Use.getOperandNo();
23349 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23350 // Look past truncate.
23351 UOpNo = User->use_begin()->getOperandNo();
23352 User = User->use_begin()->getUser();
23353 }
23354
23355 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23356 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23357 return true;
23358 }
23359 return false;
23360}
23361
23362// Transform to an x86-specific ALU node with flags if there is a chance of
23363// using an RMW op or only the flags are used. Otherwise, leave
23364// the node alone and emit a 'cmp' or 'test' instruction.
23365static bool isProfitableToUseFlagOp(SDValue Op) {
23366 for (SDNode *U : Op->users())
23367 if (U->getOpcode() != ISD::CopyToReg &&
23368 U->getOpcode() != ISD::SETCC &&
23369 U->getOpcode() != ISD::STORE)
23370 return false;
23371
23372 return true;
23373}
23374
23375/// Emit nodes that will be selected as "test Op0,Op0", or something
23376/// equivalent.
23377static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23378 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23379 // CF and OF aren't always set the way we want. Determine which
23380 // of these we need.
23381 bool NeedCF = false;
23382 bool NeedOF = false;
23383 switch (X86CC) {
23384 default: break;
23385 case X86::COND_A: case X86::COND_AE:
23386 case X86::COND_B: case X86::COND_BE:
23387 NeedCF = true;
23388 break;
23389 case X86::COND_G: case X86::COND_GE:
23390 case X86::COND_L: case X86::COND_LE:
23391 case X86::COND_O: case X86::COND_NO: {
23392 // Check if we really need to set the
23393 // Overflow flag. If NoSignedWrap is present
23394 // that is not actually needed.
23395 switch (Op->getOpcode()) {
23396 case ISD::ADD:
23397 case ISD::SUB:
23398 case ISD::MUL:
23399 case ISD::SHL:
23400 if (Op.getNode()->getFlags().hasNoSignedWrap())
23401 break;
23402 [[fallthrough]];
23403 default:
23404 NeedOF = true;
23405 break;
23406 }
23407 break;
23408 }
23409 }
23410 // See if we can use the EFLAGS value from the operand instead of
23411 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23412 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23413 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23414 // Emit a CMP with 0, which is the TEST pattern.
23415 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23416 DAG.getConstant(0, dl, Op.getValueType()));
23417 }
23418 unsigned Opcode = 0;
23419 unsigned NumOperands = 0;
23420
23421 SDValue ArithOp = Op;
23422
23423 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23424 // which may be the result of a CAST. We use the variable 'Op', which is the
23425 // non-casted variable when we check for possible users.
23426 switch (ArithOp.getOpcode()) {
23427 case ISD::AND:
23428 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23429 // because a TEST instruction will be better.
23430 if (!hasNonFlagsUse(Op))
23431 break;
23432
23433 [[fallthrough]];
23434 case ISD::ADD:
23435 case ISD::SUB:
23436 case ISD::OR:
23437  case ISD::XOR:
23438    if (!isProfitableToUseFlagOp(Op))
23439 break;
23440
23441 // Otherwise use a regular EFLAGS-setting instruction.
23442 switch (ArithOp.getOpcode()) {
23443 // clang-format off
23444 default: llvm_unreachable("unexpected operator!");
23445 case ISD::ADD: Opcode = X86ISD::ADD; break;
23446 case ISD::SUB: Opcode = X86ISD::SUB; break;
23447 case ISD::XOR: Opcode = X86ISD::XOR; break;
23448 case ISD::AND: Opcode = X86ISD::AND; break;
23449 case ISD::OR: Opcode = X86ISD::OR; break;
23450 // clang-format on
23451 }
23452
23453 NumOperands = 2;
23454 break;
23455 case X86ISD::ADD:
23456 case X86ISD::SUB:
23457 case X86ISD::OR:
23458 case X86ISD::XOR:
23459 case X86ISD::AND:
23460 return SDValue(Op.getNode(), 1);
23461 case ISD::SSUBO:
23462 case ISD::USUBO: {
23463 // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23464 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23465 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23466 Op->getOperand(1)).getValue(1);
23467 }
23468 default:
23469 break;
23470 }
23471
23472 if (Opcode == 0) {
23473 // Emit a CMP with 0, which is the TEST pattern.
23474 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23475 DAG.getConstant(0, dl, Op.getValueType()));
23476 }
23477 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23478 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23479
23480 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23481 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23482 return SDValue(New.getNode(), 1);
23483}
23484
23485/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23486/// equivalent.
23487static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23488 const SDLoc &dl, SelectionDAG &DAG,
23489 const X86Subtarget &Subtarget) {
23490 if (isNullConstant(Op1))
23491 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23492
23493 EVT CmpVT = Op0.getValueType();
23494
23495 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23496 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23497
23498 // Only promote the compare up to I32 if it is a 16 bit operation
23499 // with an immediate. 16 bit immediates are to be avoided unless the target
23500 // isn't slowed down by length changing prefixes, we're optimizing for
23501 // codesize or the comparison is with a folded load.
23502 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23503 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23505 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23506 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23507 // Don't do this if the immediate can fit in 8-bits.
23508 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23509 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23510      unsigned ExtendOp =
23511          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23512 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23513 // For equality comparisons try to use SIGN_EXTEND if the input was
23514 // truncate from something with enough sign bits.
23515 if (Op0.getOpcode() == ISD::TRUNCATE) {
23516 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23517 ExtendOp = ISD::SIGN_EXTEND;
23518 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23519 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23520 ExtendOp = ISD::SIGN_EXTEND;
23521 }
23522 }
23523
23524 CmpVT = MVT::i32;
23525 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23526 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23527 }
23528 }
23529
23530 // Try to shrink i64 compares if the input has enough zero bits.
23531 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23532 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23533 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23534 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23535 CmpVT = MVT::i32;
23536 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23537 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23538 }
23539
23540 // Try to shrink all i64 compares if the inputs are representable as signed
23541 // i32.
23542 if (CmpVT == MVT::i64 &&
23543 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23544 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23545 CmpVT = MVT::i32;
23546 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23547 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23548 }
23549
23550 // 0-x == y --> x+y == 0
23551 // 0-x != y --> x+y != 0
23552 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23553 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23554 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23556 return Add.getValue(1);
23557 }
23558
23559 // x == 0-y --> x+y == 0
23560 // x != 0-y --> x+y != 0
23561 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23562 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23563 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23564 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23565 return Add.getValue(1);
23566 }
23567
23568 // If we already have an XOR of the ops, use that to check for equality.
23569 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23570 unsigned X86Opc = X86ISD::SUB;
23571 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23572 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23573 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23574 X86Opc = X86ISD::XOR;
23575
23576 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23577 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23578 return CmpOp.getValue(1);
23579}
23580
23585
23586bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23587 SDNode *N, SDValue, SDValue IntPow2) const {
23588 if (N->getOpcode() == ISD::FDIV)
23589 return true;
23590
23591 EVT FPVT = N->getValueType(0);
23592 EVT IntVT = IntPow2.getValueType();
23593
23594 // This indicates a non-free bitcast.
23595 // TODO: This is probably overly conservative as we will need to scale the
23596 // integer vector anyways for the int->fp cast.
23597 if (FPVT.isVector() &&
23598 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23599 return false;
23600
23601 return true;
23602}
23603
23604/// Check if replacement of SQRT with RSQRT should be disabled.
23605bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23606 EVT VT = Op.getValueType();
23607
23608 // We don't need to replace SQRT with RSQRT for half type.
23609 if (VT.getScalarType() == MVT::f16)
23610 return true;
23611
23612 // We never want to use both SQRT and RSQRT instructions for the same input.
23613 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23614 return false;
23615
23616 if (VT.isVector())
23617 return Subtarget.hasFastVectorFSQRT();
23618 return Subtarget.hasFastScalarFSQRT();
23619}
23620
23621/// The minimum architected relative accuracy is 2^-12. We need one
23622/// Newton-Raphson step to have a good float result (24 bits of precision).
23623SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23624 SelectionDAG &DAG, int Enabled,
23625 int &RefinementSteps,
23626 bool &UseOneConstNR,
23627 bool Reciprocal) const {
23628 SDLoc DL(Op);
23629 EVT VT = Op.getValueType();
23630
23631 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23632 // It is likely not profitable to do this for f64 because a double-precision
23633 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23634 // instructions: convert to single, rsqrtss, convert back to double, refine
23635 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23636 // along with FMA, this could be a throughput win.
23637 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23638 // after legalize types.
23639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23641 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23642 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23643 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23644 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23645 RefinementSteps = 1;
23646
23647 UseOneConstNR = false;
23648 // There is no FSQRT for 512-bits, but there is RSQRT14.
23649 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23650 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23651 if (RefinementSteps == 0 && !Reciprocal)
23652 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23653 return Estimate;
23654 }
23655
23656 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23657 Subtarget.hasFP16()) {
23658 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23659 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23660 RefinementSteps = 0;
23661
23662 if (VT == MVT::f16) {
23664 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23665 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23666 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23667 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23668 }
23669
23670 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23671 }
23672 return SDValue();
23673}
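// Illustrative sketch (not from this file): the single Newton-Raphson step the
// comment above refers to. RSQRT(P)S gives roughly 12 bits; one iteration of
// x1 = x0 * (1.5 - 0.5 * a * x0 * x0) brings the estimate to roughly
// single-precision accuracy.
inline float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est);
}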
23674
23675/// The minimum architected relative accuracy is 2^-12. We need one
23676/// Newton-Raphson step to have a good float result (24 bits of precision).
23677SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23678 int Enabled,
23679 int &RefinementSteps) const {
23680 SDLoc DL(Op);
23681 EVT VT = Op.getValueType();
23682
23683 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23684 // It is likely not profitable to do this for f64 because a double-precision
23685 // reciprocal estimate with refinement on x86 prior to FMA requires
23686 // 15 instructions: convert to single, rcpss, convert back to double, refine
23687 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23688 // along with FMA, this could be a throughput win.
23689
23690 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23692 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23693 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23694 // Enable estimate codegen with 1 refinement step for vector division.
23695 // Scalar division estimates are disabled because they break too much
23696 // real-world code. These defaults are intended to match GCC behavior.
23697 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23698 return SDValue();
23699
23700 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23701 RefinementSteps = 1;
23702
23703 // There is no FSQRT for 512-bits, but there is RCP14.
23704 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23705 return DAG.getNode(Opcode, DL, VT, Op);
23706 }
23707
23708 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23709 Subtarget.hasFP16()) {
23710 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23711 RefinementSteps = 0;
23712
23713 if (VT == MVT::f16) {
23715 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23716 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23717 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23718 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23719 }
23720
23721 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23722 }
23723 return SDValue();
23724}
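// Illustrative sketch (not from this file): the matching refinement for a
// RCP(P)S estimate of 1/a. Each step x1 = x0 * (2 - a * x0) roughly doubles
// the number of correct bits.
inline float refineRecip(float A, float Est) {
  return Est * (2.0f - A * Est);
}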
23725
23726/// If we have at least two divisions that use the same divisor, convert to
23727/// multiplication by a reciprocal. This may need to be adjusted for a given
23728/// CPU if a division's cost is not at least twice the cost of a multiplication.
23729/// This is because we still need one division to calculate the reciprocal and
23730/// then we need two multiplies by that reciprocal as replacements for the
23731/// original divisions.
23732unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23733 return 2;
23734}
23735
23736SDValue
23737X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23738 SelectionDAG &DAG,
23739 SmallVectorImpl<SDNode *> &Created) const {
23740 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23741 if (isIntDivCheap(N->getValueType(0), Attr))
23742 return SDValue(N,0); // Lower SDIV as SDIV
23743
23744 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23745 "Unexpected divisor!");
23746
23747 // Only perform this transform if CMOV is supported otherwise the select
23748 // below will become a branch.
23749 if (!Subtarget.canUseCMOV())
23750 return SDValue();
23751
23752 // fold (sdiv X, pow2)
23753 EVT VT = N->getValueType(0);
23754 // FIXME: Support i8.
23755 if (VT != MVT::i16 && VT != MVT::i32 &&
23756 !(Subtarget.is64Bit() && VT == MVT::i64))
23757 return SDValue();
23758
23759 // If the divisor is 2 or -2, the default expansion is better.
23760 if (Divisor == 2 ||
23761 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23762 return SDValue();
23763
23764 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23765}
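// Illustrative sketch (not from this file): the classic signed divide-by-2^K
// expansion this hook ultimately feeds. The lowering selects the bias with a
// CMOV; the branchless mask below is an equivalent scalar rendering (assumes
// an arithmetic right shift and 1 <= K <= 30).
#include <cstdint>

inline int32_t sdivPow2(int32_t X, unsigned K) {
  int32_t IsNeg = X >> 31;                                 // 0 or -1
  int32_t Biased = X + (IsNeg & ((int32_t(1) << K) - 1));  // add 2^K-1 if X < 0
  return Biased >> K;                                      // round toward zero
}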
23766
23767/// Result of 'and' is compared against zero. Change to a BT node if possible.
23768/// Returns the BT node and the condition code needed to use it.
23769static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23770 SelectionDAG &DAG, X86::CondCode &X86CC) {
23771 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23772 SDValue Op0 = And.getOperand(0);
23773 SDValue Op1 = And.getOperand(1);
23774 if (Op0.getOpcode() == ISD::TRUNCATE)
23775 Op0 = Op0.getOperand(0);
23776 if (Op1.getOpcode() == ISD::TRUNCATE)
23777 Op1 = Op1.getOperand(0);
23778
23779 SDValue Src, BitNo;
23780 if (Op1.getOpcode() == ISD::SHL)
23781 std::swap(Op0, Op1);
23782 if (Op0.getOpcode() == ISD::SHL) {
23783 if (isOneConstant(Op0.getOperand(0))) {
23784 // If we looked past a truncate, check that it's only truncating away
23785 // known zeros.
23786 unsigned BitWidth = Op0.getValueSizeInBits();
23787 unsigned AndBitWidth = And.getValueSizeInBits();
23788 if (BitWidth > AndBitWidth) {
23789 KnownBits Known = DAG.computeKnownBits(Op0);
23790 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23791 return SDValue();
23792 }
23793 Src = Op1;
23794 BitNo = Op0.getOperand(1);
23795 }
23796 } else if (Op1.getOpcode() == ISD::Constant) {
23797 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23798 uint64_t AndRHSVal = AndRHS->getZExtValue();
23799 SDValue AndLHS = Op0;
23800
23801 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23802 Src = AndLHS.getOperand(0);
23803 BitNo = AndLHS.getOperand(1);
23804 } else {
23805 // Use BT if the immediate can't be encoded in a TEST instruction or we
23806      // are optimizing for size and the immediate won't fit in a byte.
23807 bool OptForSize = DAG.shouldOptForSize();
23808 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23809 isPowerOf2_64(AndRHSVal)) {
23810 Src = AndLHS;
23811 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23812 Src.getValueType());
23813 }
23814 }
23815 }
23816
23817 // No patterns found, give up.
23818 if (!Src.getNode())
23819 return SDValue();
23820
23821 // Remove any bit flip.
23822 if (isBitwiseNot(Src)) {
23823 Src = Src.getOperand(0);
23824 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23825 }
23826
23827 // Attempt to create the X86ISD::BT node.
23828 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23829 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23830 return BT;
23831 }
23832
23833 return SDValue();
23834}
23835
23836// Check if pre-AVX condcode can be performed by a single FCMP op.
23837static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23838 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23839}
23840
23841/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23842/// CMPs.
23843static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23844 SDValue &Op1, bool &IsAlwaysSignaling) {
23845 unsigned SSECC;
23846 bool Swap = false;
23847
23848 // SSE Condition code mapping:
23849 // 0 - EQ
23850 // 1 - LT
23851 // 2 - LE
23852 // 3 - UNORD
23853 // 4 - NEQ
23854 // 5 - NLT
23855 // 6 - NLE
23856 // 7 - ORD
23857 switch (SetCCOpcode) {
23858 // clang-format off
23859 default: llvm_unreachable("Unexpected SETCC condition");
23860 case ISD::SETOEQ:
23861 case ISD::SETEQ: SSECC = 0; break;
23862 case ISD::SETOGT:
23863 case ISD::SETGT: Swap = true; [[fallthrough]];
23864 case ISD::SETLT:
23865 case ISD::SETOLT: SSECC = 1; break;
23866 case ISD::SETOGE:
23867 case ISD::SETGE: Swap = true; [[fallthrough]];
23868 case ISD::SETLE:
23869 case ISD::SETOLE: SSECC = 2; break;
23870 case ISD::SETUO: SSECC = 3; break;
23871 case ISD::SETUNE:
23872 case ISD::SETNE: SSECC = 4; break;
23873 case ISD::SETULE: Swap = true; [[fallthrough]];
23874 case ISD::SETUGE: SSECC = 5; break;
23875 case ISD::SETULT: Swap = true; [[fallthrough]];
23876 case ISD::SETUGT: SSECC = 6; break;
23877 case ISD::SETO: SSECC = 7; break;
23878 case ISD::SETUEQ: SSECC = 8; break;
23879 case ISD::SETONE: SSECC = 12; break;
23880 // clang-format on
23881 }
23882 if (Swap)
23883 std::swap(Op0, Op1);
23884
23885 switch (SetCCOpcode) {
23886 default:
23887 IsAlwaysSignaling = true;
23888 break;
23889 case ISD::SETEQ:
23890 case ISD::SETOEQ:
23891 case ISD::SETUEQ:
23892 case ISD::SETNE:
23893 case ISD::SETONE:
23894 case ISD::SETUNE:
23895 case ISD::SETO:
23896 case ISD::SETUO:
23897 IsAlwaysSignaling = false;
23898 break;
23899 }
23900
23901 return SSECC;
23902}
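// Illustrative sketch (not from this file): how the swap flag above is used.
// SSE only encodes EQ/LT/LE/UNORD and their negations, so a "greater than"
// predicate is lowered by swapping the operands and using the LT predicate
// (immediate 1); the comparison result is unchanged.
inline bool ogtViaSwappedLT(double A, double B) {
  return B < A; // cmpltsd with swapped operands computes A > B
}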
23903
23904/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23905/// concatenate the result back.
23906static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23907 SelectionDAG &DAG, const SDLoc &dl) {
23908 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23909 "Unsupported VTs!");
23910 SDValue CC = DAG.getCondCode(Cond);
23911
23912 // Extract the LHS Lo/Hi vectors
23913 SDValue LHS1, LHS2;
23914 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23915
23916 // Extract the RHS Lo/Hi vectors
23917 SDValue RHS1, RHS2;
23918 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23919
23920 // Issue the operation on the smaller types and concatenate the result back
23921 EVT LoVT, HiVT;
23922 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23923 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23924 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23925 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23926}
23927
23928static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23929 SelectionDAG &DAG) {
23930 SDValue Op0 = Op.getOperand(0);
23931 SDValue Op1 = Op.getOperand(1);
23932 SDValue CC = Op.getOperand(2);
23933 MVT VT = Op.getSimpleValueType();
23934 assert(VT.getVectorElementType() == MVT::i1 &&
23935 "Cannot set masked compare for this operation");
23936
23937 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23938
23939 // Prefer SETGT over SETLT.
23940 if (SetCCOpcode == ISD::SETLT) {
23941 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23942 std::swap(Op0, Op1);
23943 }
23944
23945 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23946}
23947
23948/// Given a buildvector constant, return a new vector constant with each element
23949/// incremented or decremented. If incrementing or decrementing would result in
23950/// unsigned overflow or underflow or this is not a simple vector constant,
23951/// return an empty value.
23952static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23953 bool NSW) {
23954 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23955 if (!BV || !V.getValueType().isSimple())
23956 return SDValue();
23957
23958 MVT VT = V.getSimpleValueType();
23959 MVT EltVT = VT.getVectorElementType();
23960  unsigned NumElts = VT.getVectorNumElements();
23961  SmallVector<SDValue, 8> NewVecC;
23962 SDLoc DL(V);
23963 for (unsigned i = 0; i < NumElts; ++i) {
23964 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23965 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23966 return SDValue();
23967
23968 // Avoid overflow/underflow.
23969 const APInt &EltC = Elt->getAPIntValue();
23970 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23971 return SDValue();
23972 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23973 (!IsInc && EltC.isMinSignedValue())))
23974 return SDValue();
23975
23976 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23977 }
23978
23979 return DAG.getBuildVector(VT, DL, NewVecC);
23980}
23981
23982/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23983/// Op0 u<= Op1:
23984/// t = psubus Op0, Op1
23985/// pcmpeq t, <0..0>
23986static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23987 ISD::CondCode Cond, const SDLoc &dl,
23988 const X86Subtarget &Subtarget,
23989 SelectionDAG &DAG) {
23990 if (!Subtarget.hasSSE2())
23991 return SDValue();
23992
23993 MVT VET = VT.getVectorElementType();
23994 if (VET != MVT::i8 && VET != MVT::i16)
23995 return SDValue();
23996
23997 switch (Cond) {
23998 default:
23999 return SDValue();
24000 case ISD::SETULT: {
24001 // If the comparison is against a constant we can turn this into a
24002 // setule. With psubus, setule does not require a swap. This is
24003 // beneficial because the constant in the register is no longer
24004 // destructed as the destination so it can be hoisted out of a loop.
24005 // Only do this pre-AVX since vpcmp* is no longer destructive.
24006 if (Subtarget.hasAVX())
24007 return SDValue();
24008 SDValue ULEOp1 =
24009 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24010 if (!ULEOp1)
24011 return SDValue();
24012 Op1 = ULEOp1;
24013 break;
24014 }
24015 case ISD::SETUGT: {
24016 // If the comparison is against a constant, we can turn this into a setuge.
24017 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24018 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24019 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24020 SDValue UGEOp1 =
24021 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24022 if (!UGEOp1)
24023 return SDValue();
24024 Op1 = Op0;
24025 Op0 = UGEOp1;
24026 break;
24027 }
24028 // Psubus is better than flip-sign because it requires no inversion.
24029 case ISD::SETUGE:
24030 std::swap(Op0, Op1);
24031 break;
24032 case ISD::SETULE:
24033 break;
24034 }
24035
24036 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24037 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24038 DAG.getConstant(0, dl, VT));
24039}
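// Illustrative sketch (not from this file): why PSUBUS implements an unsigned
// "<=". The saturating difference is zero exactly when Op0 u<= Op1, so the
// PCMPEQ-against-zero above reproduces the comparison without any sign-bit
// flipping.
#include <cstdint>

inline bool uleViaSubus(uint16_t A, uint16_t B) {
  uint16_t Sat = uint16_t(A > B ? A - B : 0); // one psubusw lane
  return Sat == 0;                            // pcmpeqw lane against zero
}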
24040
24041static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24042 SelectionDAG &DAG) {
24043 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24044 Op.getOpcode() == ISD::STRICT_FSETCCS;
24045 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24046 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24047 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24048  MVT VT = Op->getSimpleValueType(0);
24049  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24050 MVT OpVT = Op0.getSimpleValueType();
24051 SDLoc dl(Op);
24052
24053 if (OpVT.isFloatingPoint()) {
24054 MVT EltVT = OpVT.getVectorElementType();
24055 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24056 EltVT == MVT::f64);
24057
24058 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24059 if (isSoftF16(EltVT, Subtarget)) {
24060 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24061 return SDValue();
24062
24063 // Break 256-bit FP vector compare into smaller ones.
24064 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24065 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24066
24067 // Break 512-bit FP vector compare into smaller ones.
24068 if (OpVT.is512BitVector())
24069 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24070
24071 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24072 if (IsStrict) {
24073 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24074 {Chain, Op0});
24075 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24076 {Chain, Op1});
24077 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24078 {Chain, Op0, Op1, CC});
24079 }
24080 MVT DVT = VT.getVectorElementType() == MVT::i16
24081 ? VT.changeVectorElementType(MVT::i32)
24082 : VT;
24083 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24085 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24086 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24087 }
24088
24089 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24090
24091 // If we have a strict compare with a vXi1 result and the input is 128/256
24092 // bits we can't use a masked compare unless we have VLX. If we use a wider
24093 // compare like we do for non-strict, we might trigger spurious exceptions
24094 // from the upper elements. Instead emit a AVX compare and convert to mask.
24095 unsigned Opc;
24096 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24097      (!IsStrict || Subtarget.hasVLX() ||
24098       Op0.getSimpleValueType().is512BitVector())) {
24099#ifndef NDEBUG
24100 unsigned Num = VT.getVectorNumElements();
24101 assert(Num <= 16 ||
24102 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24103#endif
24104 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24105 } else {
24106 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24107 // The SSE/AVX packed FP comparison nodes are defined with a
24108 // floating-point vector result that matches the operand type. This allows
24109 // them to work with an SSE1 target (integer vector types are not legal).
24110 VT = Op0.getSimpleValueType();
24111 }
24112
24113 SDValue Cmp;
24114 bool IsAlwaysSignaling;
24115 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24116 if (!Subtarget.hasAVX()) {
24117 // TODO: We could use following steps to handle a quiet compare with
24118 // signaling encodings.
24119 // 1. Get ordered masks from a quiet ISD::SETO
24120 // 2. Use the masks to mask potential unordered elements in operand A, B
24121 // 3. Get the compare results of masked A, B
24122 // 4. Calculating final result using the mask and result from 3
24123 // But currently, we just fall back to scalar operations.
24124 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24125 return SDValue();
24126
24127 // Insert an extra signaling instruction to raise exception.
24128 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24129 SDValue SignalCmp = DAG.getNode(
24130 Opc, dl, {VT, MVT::Other},
24131 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24132 // FIXME: It seems we need to update the flags of all new strict nodes.
24133 // Otherwise, mayRaiseFPException in MI will return false due to
24134 // NoFPExcept = false by default. However, I didn't find it in other
24135 // patches.
24136 SignalCmp->setFlags(Op->getFlags());
24137 Chain = SignalCmp.getValue(1);
24138 }
24139
24140 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24141 // emit two comparisons and a logic op to tie them together.
24142 if (!cheapX86FSETCC_SSE(Cond)) {
24143 // LLVM predicate is SETUEQ or SETONE.
24144 unsigned CC0, CC1;
24145 unsigned CombineOpc;
24146 if (Cond == ISD::SETUEQ) {
24147 CC0 = 3; // UNORD
24148 CC1 = 0; // EQ
24149 CombineOpc = X86ISD::FOR;
24150    } else {
24151      assert(Cond == ISD::SETONE);
24152 CC0 = 7; // ORD
24153 CC1 = 4; // NEQ
24154 CombineOpc = X86ISD::FAND;
24155 }
24156
24157 SDValue Cmp0, Cmp1;
24158 if (IsStrict) {
24159 Cmp0 = DAG.getNode(
24160 Opc, dl, {VT, MVT::Other},
24161 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24162 Cmp1 = DAG.getNode(
24163 Opc, dl, {VT, MVT::Other},
24164 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24165 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24166 Cmp1.getValue(1));
24167 } else {
24168 Cmp0 = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24170 Cmp1 = DAG.getNode(
24171 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24172 }
24173 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24174 } else {
24175 if (IsStrict) {
24176 Cmp = DAG.getNode(
24177 Opc, dl, {VT, MVT::Other},
24178 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24179 Chain = Cmp.getValue(1);
24180 } else
24181 Cmp = DAG.getNode(
24182 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24183 }
24184 } else {
24185 // Handle all other FP comparisons here.
24186 if (IsStrict) {
24187 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24188 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24189 Cmp = DAG.getNode(
24190 Opc, dl, {VT, MVT::Other},
24191 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24192 Chain = Cmp.getValue(1);
24193 } else
24194 Cmp = DAG.getNode(
24195 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24196 }
24197
24198 if (VT.getFixedSizeInBits() >
24199 Op.getSimpleValueType().getFixedSizeInBits()) {
24200 // We emitted a compare with an XMM/YMM result. Finish converting to a
24201    // mask register using a vptestm.
24202    EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24203 Cmp = DAG.getBitcast(CastVT, Cmp);
24204 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24205 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24206 } else {
24207 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24208 // the result type of SETCC. The bitcast is expected to be optimized
24209 // away during combining/isel.
24210 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24211 }
24212
24213 if (IsStrict)
24214 return DAG.getMergeValues({Cmp, Chain}, dl);
24215
24216 return Cmp;
24217 }
24218
24219 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24220
24221 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24222 assert(VTOp0 == Op1.getSimpleValueType() &&
24223         "Expected operands with same type!");
24224  assert(VTOp0.getVectorNumElements() == VT.getVectorNumElements() &&
24224 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24225 "Invalid number of packed elements for source and destination!");
24226
24227 // The non-AVX512 code below works under the assumption that source and
24228 // destination types are the same.
24229 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24230 "Value types for source and destination must be the same!");
24231
24232 // The result is boolean, but operands are int/float
24233 if (VT.getVectorElementType() == MVT::i1) {
24234 // In AVX-512 architecture setcc returns mask with i1 elements,
24235 // But there is no compare instruction for i8 and i16 elements in KNL.
24236 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24237 "Unexpected operand type");
24238 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24239 }
24240
24241 // Lower using XOP integer comparisons.
24242 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24243 // Translate compare code to XOP PCOM compare mode.
24244 unsigned CmpMode = 0;
24245 switch (Cond) {
24246 // clang-format off
24247 default: llvm_unreachable("Unexpected SETCC condition");
24248 case ISD::SETULT:
24249 case ISD::SETLT: CmpMode = 0x00; break;
24250 case ISD::SETULE:
24251 case ISD::SETLE: CmpMode = 0x01; break;
24252 case ISD::SETUGT:
24253 case ISD::SETGT: CmpMode = 0x02; break;
24254 case ISD::SETUGE:
24255 case ISD::SETGE: CmpMode = 0x03; break;
24256 case ISD::SETEQ: CmpMode = 0x04; break;
24257 case ISD::SETNE: CmpMode = 0x05; break;
24258 // clang-format on
24259 }
24260
24261 // Are we comparing unsigned or signed integers?
24262 unsigned Opc =
24263 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24264
24265 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24266 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24267 }
24268
24269 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24270 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24271 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24272 SDValue BC0 = peekThroughBitcasts(Op0);
24273 if (BC0.getOpcode() == ISD::AND &&
24274 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24275 /*AllowUndefs=*/false)) {
24276 Cond = ISD::SETEQ;
24277 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24278 }
24279 }
24280
24281 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
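// e.g. for v16i8 and C == 0x10: shl by 3 moves bit 4 into the sign bit and
// sra by 7 splats it, giving all-ones exactly when (X & 0x10) == 0x10.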
24282 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24283 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24284 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24285 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24286 unsigned BitWidth = VT.getScalarSizeInBits();
24287 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24288
24289 SDValue Result = Op0.getOperand(0);
24290 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24291 DAG.getConstant(ShiftAmt, dl, VT));
24292 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24293 DAG.getConstant(BitWidth - 1, dl, VT));
24294 return Result;
24295 }
24296 }
24297
24298 // Break 256-bit integer vector compare into smaller ones.
24299 if (VT.is256BitVector() && !Subtarget.hasInt256())
24300 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24301
24302 // Break 512-bit integer vector compare into smaller ones.
24303 // TODO: Try harder to use VPCMPx + VPMOV2x?
24304 if (VT.is512BitVector())
24305 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24306
24307 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24308 // not-of-PCMPEQ:
24309 // X != INT_MIN --> X >s INT_MIN
24310 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24311 // +X != 0 --> +X >s 0
24312 APInt ConstValue;
24313 if (Cond == ISD::SETNE &&
24314 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24315 if (ConstValue.isMinSignedValue())
24316 Cond = ISD::SETGT;
24317 else if (ConstValue.isMaxSignedValue())
24318 Cond = ISD::SETLT;
24319 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24320 Cond = ISD::SETGT;
24321 }
24322
24323 // If both operands are known non-negative, then an unsigned compare is the
24324 // same as a signed compare and there's no need to flip signbits.
24325 // TODO: We could check for more general simplifications here since we're
24326 // computing known bits.
24327 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24328 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24329
24330 // Special case: Use min/max operations for unsigned compares.
24331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24332 if (ISD::isUnsignedIntSetCC(Cond) &&
24333 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24334 TLI.isOperationLegal(ISD::UMIN, VT)) {
24335 // If we have a constant operand, increment/decrement it and change the
24336 // condition to avoid an invert.
24337 if (Cond == ISD::SETUGT) {
24338 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24339 if (SDValue UGTOp1 =
24340 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24341 Op1 = UGTOp1;
24342 Cond = ISD::SETUGE;
24343 }
24344 }
24345 if (Cond == ISD::SETULT) {
24346 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24347 if (SDValue ULTOp1 =
24348 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24349 Op1 = ULTOp1;
24350 Cond = ISD::SETULE;
24351 }
24352 }
24353 bool Invert = false;
24354 unsigned Opc;
24355 switch (Cond) {
24356 // clang-format off
24357 default: llvm_unreachable("Unexpected condition code");
24358 case ISD::SETUGT: Invert = true; [[fallthrough]];
24359 case ISD::SETULE: Opc = ISD::UMIN; break;
24360 case ISD::SETULT: Invert = true; [[fallthrough]];
24361 case ISD::SETUGE: Opc = ISD::UMAX; break;
24362 // clang-format on
24363 }
24364
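// x <=u y iff umin(x, y) == x and x >=u y iff umax(x, y) == x; the Invert
// flag above recovers SETUGT/SETULT from the corresponding non-strict form.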
24365 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24366 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24367
24368 // If the logical-not of the result is required, perform that now.
24369 if (Invert)
24370 Result = DAG.getNOT(dl, Result, VT);
24371
24372 return Result;
24373 }
24374
24375 // Try to use SUBUS and PCMPEQ.
24376 if (FlipSigns)
24377 if (SDValue V =
24378 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24379 return V;
24380
24381 // We are handling one of the integer comparisons here. Since SSE only has
24382 // GT and EQ comparisons for integer, swapping operands and multiple
24383 // operations may be required for some comparisons.
24384 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24385 : X86ISD::PCMPGT;
24386 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24387 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24388 bool Invert = Cond == ISD::SETNE ||
24389 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24390
24391 if (Swap)
24392 std::swap(Op0, Op1);
24393
24394 // Check that the operation in question is available (most are plain SSE2,
24395 // but PCMPGTQ and PCMPEQQ have different requirements).
24396 if (VT == MVT::v2i64) {
24397 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24398 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24399
24400 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24401 // the odd elements over the even elements.
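// (0 >s x) for an i64 depends only on x's sign bit, which lives in the upper
// i32 half, so a v4i32 compare already gives the right answer in the odd lanes.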
24402 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24403 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24404 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24405
24406 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24407 static const int MaskHi[] = { 1, 1, 3, 3 };
24408 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24409
24410 return DAG.getBitcast(VT, Result);
24411 }
24412
24413 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24414 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24415 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24416
24417 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24418 static const int MaskHi[] = { 1, 1, 3, 3 };
24419 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24420
24421 return DAG.getBitcast(VT, Result);
24422 }
24423
24424 // If the i64 elements are sign-extended enough to be representable as i32
24425 // then we can compare the lower i32 bits and splat.
24426 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24427 DAG.ComputeNumSignBits(Op1) > 32) {
24428 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24429 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24430
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 static const int MaskLo[] = {0, 0, 2, 2};
24433 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24434
24435 return DAG.getBitcast(VT, Result);
24436 }
24437
24438 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24439 // bits of the inputs before performing those operations. The lower
24440 // compare is always unsigned.
24441 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24442 : 0x0000000080000000ULL,
24443 dl, MVT::v2i64);
24444
24445 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24446 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24447
24448 // Cast everything to the right type.
24449 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24450 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24451
24452 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
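// The MaskHi/MaskLo shuffles splat each 64-bit lane's high or low 32-bit
// result across both halves of that lane, so the final AND/OR produces a
// complete 64-bit all-ones/all-zeros mask per element.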
24453 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24454 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24455
24456 // Create masks for only the low parts/high parts of the 64 bit integers.
24457 static const int MaskHi[] = { 1, 1, 3, 3 };
24458 static const int MaskLo[] = { 0, 0, 2, 2 };
24459 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24460 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24461 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24462
24463 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24464 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24465
24466 if (Invert)
24467 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24468
24469 return DAG.getBitcast(VT, Result);
24470 }
24471
24472 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24473 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24474 // pcmpeqd + pshufd + pand.
24475 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24476
24477 // First cast everything to the right type.
24478 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24479 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24480
24481 // Do the compare.
24482 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24483
24484 // Make sure the lower and upper halves are both all-ones.
24485 static const int Mask[] = { 1, 0, 3, 2 };
24486 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24487 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24488
24489 if (Invert)
24490 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24491
24492 return DAG.getBitcast(VT, Result);
24493 }
24494 }
24495
24496 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24497 // bits of the inputs before performing those operations.
24498 if (FlipSigns) {
24499 MVT EltVT = VT.getVectorElementType();
24500 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24501 VT);
24502 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24503 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24504 }
24505
24506 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24507
24508 // If the logical-not of the result is required, perform that now.
24509 if (Invert)
24510 Result = DAG.getNOT(dl, Result, VT);
24511
24512 return Result;
24513}
24514
24515// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24516 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24517 const SDLoc &dl, SelectionDAG &DAG,
24518 const X86Subtarget &Subtarget,
24519 SDValue &X86CC) {
24520 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24521
24522 // Must be a bitcast from vXi1.
24523 if (Op0.getOpcode() != ISD::BITCAST)
24524 return SDValue();
24525
24526 Op0 = Op0.getOperand(0);
24527 MVT VT = Op0.getSimpleValueType();
24528 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24529 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24530 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24531 return SDValue();
24532
24533 X86::CondCode X86Cond;
24534 if (isNullConstant(Op1)) {
24535 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24536 } else if (isAllOnesConstant(Op1)) {
24537 // C flag is set for all ones.
24538 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24539 } else
24540 return SDValue();
24541
24542 // If the input is an AND, we can combine its operands into the KTEST.
24543 bool KTestable = false;
24544 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24545 KTestable = true;
24546 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24547 KTestable = true;
24548 if (!isNullConstant(Op1))
24549 KTestable = false;
24550 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24551 SDValue LHS = Op0.getOperand(0);
24552 SDValue RHS = Op0.getOperand(1);
24553 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24554 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24555 }
24556
24557 // If the input is an OR, we can combine its operands into the KORTEST.
24558 SDValue LHS = Op0;
24559 SDValue RHS = Op0;
24560 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24561 LHS = Op0.getOperand(0);
24562 RHS = Op0.getOperand(1);
24563 }
24564
24565 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24566 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24567}
24568
24569/// Emit flags for the given setcc condition and operands. Also returns the
24570/// corresponding X86 condition code constant in X86CC.
24571SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24572 ISD::CondCode CC, const SDLoc &dl,
24573 SelectionDAG &DAG,
24574 SDValue &X86CC) const {
24575 // Equality Combines.
24576 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24577 X86::CondCode X86CondCode;
24578
24579 // Optimize to BT if possible.
24580 // Lower (X & (1 << N)) == 0 to BT(X, N).
24581 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24582 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24583 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24584 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24585 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24586 return BT;
24587 }
24588 }
24589
24590 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24591 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24592 X86CondCode)) {
24593 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24594 return CmpZ;
24595 }
24596
24597 // Try to lower using KORTEST or KTEST.
24598 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24599 return Test;
24600
24601 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24602 // of these.
24603 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24604 // If the input is a setcc, then reuse the input setcc or use a new one
24605 // with the inverted condition.
24606 if (Op0.getOpcode() == X86ISD::SETCC) {
24607 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24608
24609 X86CC = Op0.getOperand(0);
24610 if (Invert) {
24611 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24612 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24613 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24614 }
24615
24616 return Op0.getOperand(1);
24617 }
24618 }
24619
24620 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24621 // overflow.
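// NEG sets OF only when its operand is INT_MIN, so (0 - X) overflows exactly
// when X == INT_MIN.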
24622 if (isMinSignedConstant(Op1)) {
24623 EVT VT = Op0.getValueType();
24624 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24625 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24626 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24627 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24628 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24629 DAG.getConstant(0, dl, VT), Op0);
24630 return SDValue(Neg.getNode(), 1);
24631 }
24632 }
24633
24634 // Try to use the carry flag from the add in place of a separate CMP for:
24635 // (seteq (add X, -1), -1). Similar for setne.
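// X + (-1) produces a carry for every X except 0, so (X + -1) == -1 (i.e.
// X == 0) maps to COND_AE (carry clear) and != -1 maps to COND_B (carry set).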
24636 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24637 Op0.getOperand(1) == Op1) {
24638 if (isProfitableToUseFlagOp(Op0)) {
24639 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24640
24641 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24642 Op0.getOperand(1));
24643 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24644 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24645 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24646 return SDValue(New.getNode(), 1);
24647 }
24648 }
24649 }
24650
24651 X86::CondCode CondCode =
24652 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24653 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24654
24655 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24656 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24657 return EFLAGS;
24658}
24659
24660SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24661
24662 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24663 Op.getOpcode() == ISD::STRICT_FSETCCS;
24664 MVT VT = Op->getSimpleValueType(0);
24665
24666 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24667
24668 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24669 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24670 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24671 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24672 SDLoc dl(Op);
24673 ISD::CondCode CC =
24674 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24675
24676 if (isSoftF16(Op0.getValueType(), Subtarget))
24677 return SDValue();
24678
24679 // Handle f128 first, since one possible outcome is a normal integer
24680 // comparison which gets handled by emitFlagsForSetcc.
24681 if (Op0.getValueType() == MVT::f128) {
24682 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24683 Op.getOpcode() == ISD::STRICT_FSETCCS);
24684
24685 // If softenSetCCOperands returned a scalar, use it.
24686 if (!Op1.getNode()) {
24687 assert(Op0.getValueType() == Op.getValueType() &&
24688 "Unexpected setcc expansion!");
24689 if (IsStrict)
24690 return DAG.getMergeValues({Op0, Chain}, dl);
24691 return Op0;
24692 }
24693 }
24694
24695 if (Op0.getSimpleValueType().isInteger()) {
24696 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24697 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24698 // this may translate to fewer uops depending on uarch implementation. The
24699 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24700 // canonicalize to that CondCode.
24701 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24702 // encoding size - so it must either already be a i8 or i32 immediate, or it
24703 // shrinks down to that. We don't do this for any i64's to avoid additional
24704 // constant materializations.
24705 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
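// e.g. (setgt X, 7) becomes (setge X, 8): COND_GE only reads SF/OF, whereas
// COND_G also reads ZF.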
24706 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24707 const APInt &Op1Val = Op1C->getAPIntValue();
24708 if (!Op1Val.isZero()) {
24709 // Ensure the constant+1 doesn't overflow.
24710 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24711 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24712 APInt Op1ValPlusOne = Op1Val + 1;
24713 if (Op1ValPlusOne.isSignedIntN(32) &&
24714 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24715 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24716 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24717 : ISD::CondCode::SETUGE;
24718 }
24719 }
24720 }
24721 }
24722
24723 SDValue X86CC;
24724 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24725 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24726 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24727 }
24728
24729 if (Subtarget.hasAVX10_2()) {
24730 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24731 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24732 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24733 if (Op0.getSimpleValueType() != MVT::f80) {
24734 SDValue Res = getSETCC(
24735 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24736 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737 }
24738 }
24739 }
24740 // Handle floating point.
24741 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24742 if (CondCode == X86::COND_INVALID)
24743 return SDValue();
24744
24745 SDValue EFLAGS;
24746 if (IsStrict) {
24747 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24748 EFLAGS =
24749 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24750 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24751 Chain = EFLAGS.getValue(1);
24752 } else {
24753 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24754 }
24755
24756 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24757 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24758 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24759}
24760
24761SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24762 SDValue LHS = Op.getOperand(0);
24763 SDValue RHS = Op.getOperand(1);
24764 SDValue Carry = Op.getOperand(2);
24765 SDValue Cond = Op.getOperand(3);
24766 SDLoc DL(Op);
24767
24768 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24769 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24770
24771 // Recreate the carry if needed.
24772 EVT CarryVT = Carry.getValueType();
24773 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24774 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24775
24776 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24777 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24778 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24779}
24780
24781// This function returns three things: the arithmetic computation itself
24782// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24783// flag and the condition code define the case in which the arithmetic
24784// computation overflows.
24785static std::pair<SDValue, SDValue>
24786 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24787 assert(Op.getResNo() == 0 && "Unexpected result number!");
24788 SDValue Value, Overflow;
24789 SDValue LHS = Op.getOperand(0);
24790 SDValue RHS = Op.getOperand(1);
24791 unsigned BaseOp = 0;
24792 SDLoc DL(Op);
24793 switch (Op.getOpcode()) {
24794 default: llvm_unreachable("Unknown ovf instruction!");
24795 case ISD::SADDO:
24796 BaseOp = X86ISD::ADD;
24797 Cond = X86::COND_O;
24798 break;
24799 case ISD::UADDO:
24800 BaseOp = X86ISD::ADD;
24801 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24802 break;
24803 case ISD::SSUBO:
24804 BaseOp = X86ISD::SUB;
24805 Cond = X86::COND_O;
24806 break;
24807 case ISD::USUBO:
24808 BaseOp = X86ISD::SUB;
24809 Cond = X86::COND_B;
24810 break;
24811 case ISD::SMULO:
24812 BaseOp = X86ISD::SMUL;
24813 Cond = X86::COND_O;
24814 break;
24815 case ISD::UMULO:
24816 BaseOp = X86ISD::UMUL;
24817 Cond = X86::COND_O;
24818 break;
24819 }
24820
24821 if (BaseOp) {
24822 // Also sets EFLAGS.
24823 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24824 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24825 Overflow = Value.getValue(1);
24826 }
24827
24828 return std::make_pair(Value, Overflow);
24829}
24830
24831 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24832 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24833 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24834 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24835 // has only one use.
24836 SDLoc DL(Op);
24837 X86::CondCode Cond;
24838 SDValue Value, Overflow;
24839 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24840
24841 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24842 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24843 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24844}
24845
24846/// Return true if opcode is a X86 logical comparison.
24847 static bool isX86LogicalCmp(SDValue Op) {
24848 unsigned Opc = Op.getOpcode();
24849 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24850 Opc == X86ISD::FCMP)
24851 return true;
24852 if (Op.getResNo() == 1 &&
24853 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24854 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24855 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24856 return true;
24857
24858 return false;
24859}
24860
24861 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24862 if (V.getOpcode() != ISD::TRUNCATE)
24863 return false;
24864
24865 SDValue VOp0 = V.getOperand(0);
24866 unsigned InBits = VOp0.getValueSizeInBits();
24867 unsigned Bits = V.getValueSizeInBits();
24868 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24869}
24870
24871// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24872 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24873 unsigned X86CC, const SDLoc &DL,
24874 SelectionDAG &DAG,
24875 const X86Subtarget &Subtarget) {
24876 EVT CmpVT = CmpVal.getValueType();
24877 EVT VT = LHS.getValueType();
24878 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24879 return SDValue();
24880
24881 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24882 isOneConstant(CmpVal.getOperand(1))) {
24883 auto SplatLSB = [&](EVT SplatVT) {
24884 // We need a mask of all zeros or all ones with the same size as the other
24885 // operands.
24886 SDValue Neg = CmpVal;
24887 if (CmpVT.bitsGT(SplatVT))
24888 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24889 else if (CmpVT.bitsLT(SplatVT))
24890 Neg = DAG.getNode(
24891 ISD::AND, DL, SplatVT,
24892 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24893 DAG.getConstant(1, DL, SplatVT));
24894 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24895 };
24896
24897 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24898 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24899 return SplatLSB(VT);
24900
24901 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
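// The mask is 0 when the LSB of X is clear (selecting C1) and all-ones when
// it is set, in which case C1 ^ (C1 ^ C2) == C2.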
24902 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24903 isa<ConstantSDNode>(RHS)) {
24904 SDValue Mask = SplatLSB(VT);
24905 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24906 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24907 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24908 }
24909
24910 SDValue Src1, Src2;
24911 auto isIdentityPatternZero = [&]() {
24912 switch (RHS.getOpcode()) {
24913 default:
24914 break;
24915 case ISD::OR:
24916 case ISD::XOR:
24917 case ISD::ADD:
24918 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24919 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24920 Src2 = LHS;
24921 return true;
24922 }
24923 break;
24924 case ISD::SHL:
24925 case ISD::SRA:
24926 case ISD::SRL:
24927 case ISD::SUB:
24928 if (RHS.getOperand(0) == LHS) {
24929 Src1 = RHS.getOperand(1);
24930 Src2 = LHS;
24931 return true;
24932 }
24933 break;
24934 }
24935 return false;
24936 };
24937
24938 auto isIdentityPatternOnes = [&]() {
24939 switch (LHS.getOpcode()) {
24940 default:
24941 break;
24942 case ISD::AND:
24943 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24944 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24945 Src2 = RHS;
24946 return true;
24947 }
24948 break;
24949 }
24950 return false;
24951 };
24952
24953 // Convert 'identity' patterns (iff X is 0 or 1):
24954 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24960 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24961 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24962 SDValue Mask = SplatLSB(Src1.getValueType());
24963 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24964 Src1); // Mask & z
24965 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24966 }
24967 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24968 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24969 SDValue Mask = SplatLSB(VT);
24970 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24971 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24972 }
24973 }
24974
24975 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24976 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24977 SDValue Y = isAllOnesConstant(LHS) ? RHS : LHS;
24978 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24979
24980 // 'X - 1' sets the carry flag if X == 0.
24981 // '0 - X' sets the carry flag if X != 0.
24982 // Convert the carry flag to a -1/0 mask with sbb:
24983 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24984 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24985 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24986 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24987 SDValue Sub;
24988 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24989 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24990 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24991 } else {
24992 SDValue One = DAG.getConstant(1, DL, CmpVT);
24993 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24994 }
24995 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24996 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24997 Sub.getValue(1));
24998 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24999 }
25000
25001 return SDValue();
25002}
25003
25004SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25005 bool AddTest = true;
25006 SDValue Cond = Op.getOperand(0);
25007 SDValue Op1 = Op.getOperand(1);
25008 SDValue Op2 = Op.getOperand(2);
25009 SDLoc DL(Op);
25010 MVT VT = Op1.getSimpleValueType();
25011 SDValue CC;
25012
25013 if (isSoftF16(VT, Subtarget)) {
25014 MVT NVT = VT.changeTypeToInteger();
25015 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25016 DAG.getBitcast(NVT, Op1),
25017 DAG.getBitcast(NVT, Op2)));
25018 }
25019
25020 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25021 // are available or VBLENDV if AVX is available.
25022 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25023 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25024 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25025 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25026 bool IsAlwaysSignaling;
25027 unsigned SSECC =
25028 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25029 CondOp0, CondOp1, IsAlwaysSignaling);
25030
25031 if (Subtarget.hasAVX512()) {
25032 SDValue Cmp =
25033 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25034 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25035 assert(!VT.isVector() && "Not a scalar type?");
25036 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25037 }
25038
25039 if (SSECC < 8 || Subtarget.hasAVX()) {
25040 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25041 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25042
25043 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25044 // instead of 3 logic instructions for size savings and potentially speed.
25045 // Unfortunately, there is no scalar form of VBLENDV.
25046 //
25047 // If either operand is a +0.0 constant, don't try this. We can expect to
25048 // optimize away at least one of the logic instructions later in that
25049 // case, so that sequence would be faster than a variable blend.
25050 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25051 !isNullFPConstant(Op2)) {
25052 // Convert to vectors, do a VSELECT, and convert back to scalar.
25053 // All of the conversions should be optimized away.
25054 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25055 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25056 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25057 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25058
25059 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25060 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25061
25062 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25063
25064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25065 DAG.getVectorIdxConstant(0, DL));
25066 }
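// Cmp is all-ones when the predicate holds, so blend manually as
// (Cmp & Op1) | (~Cmp & Op2).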
25067 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25068 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25069 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25070 }
25071 }
25072
25073 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25074 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25075 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25076 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25077 }
25078
25079 if (Cond.getOpcode() == ISD::SETCC &&
25080 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25081 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25082 Cond = NewCond;
25083 // If the condition was updated, it's possible that the operands of the
25084 // select were also updated (for example, EmitTest has a RAUW). Refresh
25085 // the local references to the select operands in case they got stale.
25086 Op1 = Op.getOperand(1);
25087 Op2 = Op.getOperand(2);
25088 }
25089 }
25090
25091 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25092 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25093 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25094 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25095 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25096 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25097 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25098 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25099 if (Cond.getOpcode() == X86ISD::SETCC &&
25100 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25101 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25102 SDValue Cmp = Cond.getOperand(1);
25103 SDValue CmpOp0 = Cmp.getOperand(0);
25104 unsigned CondCode = Cond.getConstantOperandVal(0);
25105
25106 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25107 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25108 // handling to keep the CMP with 0. This should be removed by
25109 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25110 // cttz_zero_undef.
25111 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25112 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25113 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25114 };
25115 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25116 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25117 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25118 // Keep Cmp.
25119 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25120 DL, DAG, Subtarget)) {
25121 return R;
25122 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25123 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25124 ((CondCode == X86::COND_S) || // smin(x, 0)
25125 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25126 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25127 //
25128 // If the comparison is testing for a positive value, we have to invert
25129 // the sign bit mask, so only do that transform if the target has a
25130 // bitwise 'and not' instruction (the invert is free).
25131 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25132 unsigned ShCt = VT.getSizeInBits() - 1;
25133 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25134 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25135 if (CondCode == X86::COND_G)
25136 Shift = DAG.getNOT(DL, Shift, VT);
25137 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25138 }
25139 }
25140
25141 // Look past (and (setcc_carry (cmp ...)), 1).
25142 if (Cond.getOpcode() == ISD::AND &&
25143 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25144 isOneConstant(Cond.getOperand(1)))
25145 Cond = Cond.getOperand(0);
25146
25147 // Attempt to fold "raw cond" cases by treating them as:
25148 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25149 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25150 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25151 Subtarget))
25152 return R;
25153
25154 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25155 // setting operand in place of the X86ISD::SETCC.
25156 unsigned CondOpcode = Cond.getOpcode();
25157 if (CondOpcode == X86ISD::SETCC ||
25158 CondOpcode == X86ISD::SETCC_CARRY) {
25159 CC = Cond.getOperand(0);
25160
25161 SDValue Cmp = Cond.getOperand(1);
25162 bool IllegalFPCMov = false;
25163 if (VT.isFloatingPoint() && !VT.isVector() &&
25164 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25165 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25166
25167 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25168 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25169 Cond = Cmp;
25170 AddTest = false;
25171 }
25172 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25173 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25174 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25175 SDValue Value;
25176 X86::CondCode X86Cond;
25177 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25178
25179 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25180 AddTest = false;
25181 }
25182
25183 if (AddTest) {
25184 // Look past the truncate if the high bits are known zero.
25185 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25186 Cond = Cond.getOperand(0);
25187
25188 // We know the result of AND is compared against zero. Try to match
25189 // it to BT.
25190 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25191 X86::CondCode X86CondCode;
25192 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25193 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25194 Cond = BT;
25195 AddTest = false;
25196 }
25197 }
25198 }
25199
25200 if (AddTest) {
25201 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25202 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25203 }
25204
25205 // a < b ? -1 : 0 -> RES = ~setcc_carry
25206 // a < b ? 0 : -1 -> RES = setcc_carry
25207 // a >= b ? -1 : 0 -> RES = setcc_carry
25208 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25209 if (Cond.getOpcode() == X86ISD::SUB) {
25210 unsigned CondCode = CC->getAsZExtVal();
25211
25212 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25213 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25214 (isNullConstant(Op1) || isNullConstant(Op2))) {
25215 SDValue Res =
25216 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25217 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25218 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25219 return DAG.getNOT(DL, Res, Res.getValueType());
25220 return Res;
25221 }
25222 }
25223
25224 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25225 // widen the cmov and push the truncate through. This avoids introducing a new
25226 // branch during isel and doesn't add any extensions.
25227 if (Op.getValueType() == MVT::i8 &&
25228 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25229 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25230 if (T1.getValueType() == T2.getValueType() &&
25231 // Exclude CopyFromReg to avoid partial register stalls.
25232 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25233 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25234 CC, Cond);
25235 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25236 }
25237 }
25238
25239 // Or finally, promote i8 cmovs if we have CMOV,
25240 // or i16 cmovs if it won't prevent folding a load.
25241 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25242 // legal, but EmitLoweredSelect() can not deal with these extensions
25243 // being inserted between two CMOV's. (in i16 case too TBN)
25244 // https://bugs.llvm.org/show_bug.cgi?id=40974
25245 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25246 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25247 !X86::mayFoldLoad(Op2, Subtarget))) {
25248 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25249 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25250 SDValue Ops[] = { Op2, Op1, CC, Cond };
25251 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25252 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25253 }
25254
25255 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25256 // condition is true.
25257 SDValue Ops[] = { Op2, Op1, CC, Cond };
25258 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25259}
25260
25261 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25262 const X86Subtarget &Subtarget,
25263 SelectionDAG &DAG) {
25264 MVT VT = Op->getSimpleValueType(0);
25265 SDValue In = Op->getOperand(0);
25266 MVT InVT = In.getSimpleValueType();
25267 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25268 MVT VTElt = VT.getVectorElementType();
25269 unsigned NumElts = VT.getVectorNumElements();
25270
25271 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25272 MVT ExtVT = VT;
25273 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25274 // If v16i32 is to be avoided, we'll need to split and concatenate.
25275 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25276 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25277
25278 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25279 }
25280
25281 // Widen to 512-bits if VLX is not supported.
25282 MVT WideVT = ExtVT;
25283 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25284 NumElts *= 512 / ExtVT.getSizeInBits();
25285 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25286 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25287 DAG.getVectorIdxConstant(0, dl));
25288 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25289 }
25290
25291 SDValue V;
25292 MVT WideEltVT = WideVT.getVectorElementType();
25293 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25294 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25295 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25296 } else {
25297 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25298 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25299 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25300 }
25301
25302 // Truncate if we had to extend i16/i8 above.
25303 if (VT != ExtVT) {
25304 WideVT = MVT::getVectorVT(VTElt, NumElts);
25305 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25306 }
25307
25308 // Extract back to 128/256-bit if we widened.
25309 if (WideVT != VT)
25310 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25311 DAG.getVectorIdxConstant(0, dl));
25312
25313 return V;
25314}
25315
25316 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 SDValue In = Op->getOperand(0);
25319 MVT InVT = In.getSimpleValueType();
25320 SDLoc DL(Op);
25321
25322 if (InVT.getVectorElementType() == MVT::i1)
25323 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25324
25325 assert(Subtarget.hasAVX() && "Expected AVX support");
25326 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25327}
25328
25329// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25330// For sign extend this needs to handle all vector sizes and SSE4.1 and
25331// non-SSE4.1 targets. For zero extend this should only handle inputs of
25332// MVT::v64i8 when BWI is not supported, but AVX512 is.
25333 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25334 const X86Subtarget &Subtarget,
25335 SelectionDAG &DAG) {
25336 SDValue In = Op->getOperand(0);
25337 MVT VT = Op->getSimpleValueType(0);
25338 MVT InVT = In.getSimpleValueType();
25339
25340 MVT SVT = VT.getVectorElementType();
25341 MVT InSVT = InVT.getVectorElementType();
25343
25344 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25345 return SDValue();
25346 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25347 return SDValue();
25348 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25349 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25350 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25351 return SDValue();
25352
25353 SDLoc dl(Op);
25354 unsigned Opc = Op.getOpcode();
25355 unsigned NumElts = VT.getVectorNumElements();
25356
25357 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25358 // For 512-bit vectors, we need 128-bits or 256-bits.
25359 if (InVT.getSizeInBits() > 128) {
25360 // Input needs to be at least the same number of elements as output, and
25361 // at least 128-bits.
25362 int InSize = InSVT.getSizeInBits() * NumElts;
25363 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25364 InVT = In.getSimpleValueType();
25365 }
25366
25367 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25368 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25369 // need to be handled here for 256/512-bit results.
25370 if (Subtarget.hasInt256()) {
25371 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25372
25373 if (InVT.getVectorNumElements() != NumElts)
25374 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25375
25376 // FIXME: Apparently we create inreg operations that could be regular
25377 // extends.
25378 unsigned ExtOpc =
25379 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25380 : ISD::ZERO_EXTEND;
25381 return DAG.getNode(ExtOpc, dl, VT, In);
25382 }
25383
25384 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25385 if (Subtarget.hasAVX()) {
25386 assert(VT.is256BitVector() && "256-bit vector expected");
25387 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25388 int HalfNumElts = HalfVT.getVectorNumElements();
25389
25390 unsigned NumSrcElts = InVT.getVectorNumElements();
25391 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25392 for (int i = 0; i != HalfNumElts; ++i)
25393 HiMask[i] = HalfNumElts + i;
25394
25395 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25396 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25397 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25398 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25399 }
25400
25401 // We should only get here for sign extend.
25402 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25403 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25404 unsigned InNumElts = InVT.getVectorNumElements();
25405
25406 // If the source elements are already all-signbits, we don't need to extend,
25407 // just splat the elements.
25408 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25409 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25410 unsigned Scale = InNumElts / NumElts;
25411 SmallVector<int, 16> ShuffleMask;
25412 for (unsigned I = 0; I != NumElts; ++I)
25413 ShuffleMask.append(Scale, I);
25414 return DAG.getBitcast(VT,
25415 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25416 }
25417
25418 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25419 SDValue Curr = In;
25420 SDValue SignExt = Curr;
25421
25422 // As SRAI is only available on i16/i32 types, we expand only up to i32
25423 // and handle i64 separately.
25424 if (InVT != MVT::v4i32) {
25425 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25426
25427 unsigned DestWidth = DestVT.getScalarSizeInBits();
25428 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25429 unsigned DestElts = DestVT.getVectorNumElements();
25430
25431 // Build a shuffle mask that takes each input element and places it in the
25432 // MSBs of the new element size.
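// e.g. extending v16i8 to v4i32 places byte i into byte 3 of dword i; the
// VSRAI below then shifts each dword right by 24 to finish the sign extension.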
25433 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25434 for (unsigned i = 0; i != DestElts; ++i)
25435 Mask[i * Scale + (Scale - 1)] = i;
25436
25437 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25438 Curr = DAG.getBitcast(DestVT, Curr);
25439
25440 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25441 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25442 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25443 }
25444
25445 if (VT == MVT::v2i64) {
25446 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25447 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25448 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25449 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25450 SignExt = DAG.getBitcast(VT, SignExt);
25451 }
25452
25453 return SignExt;
25454}
25455
25456 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25457 SelectionDAG &DAG) {
25458 MVT VT = Op->getSimpleValueType(0);
25459 SDValue In = Op->getOperand(0);
25460 MVT InVT = In.getSimpleValueType();
25461 SDLoc dl(Op);
25462
25463 if (InVT.getVectorElementType() == MVT::i1)
25464 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25465
25466 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25467 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25468 "Expected same number of elements");
25469 assert((VT.getVectorElementType() == MVT::i16 ||
25470 VT.getVectorElementType() == MVT::i32 ||
25471 VT.getVectorElementType() == MVT::i64) &&
25472 "Unexpected element type");
25473 assert((InVT.getVectorElementType() == MVT::i8 ||
25474 InVT.getVectorElementType() == MVT::i16 ||
25475 InVT.getVectorElementType() == MVT::i32) &&
25476 "Unexpected element type");
25477
25478 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25479 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25480 return splitVectorIntUnary(Op, DAG, dl);
25481 }
25482
25483 if (Subtarget.hasInt256())
25484 return Op;
25485
25486 // Optimize vectors in AVX mode
25487 // Sign extend v8i16 to v8i32 and
25488 // v4i32 to v4i64
25489 //
25490 // Divide input vector into two parts
25491 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25492 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25493 // concat the vectors to original VT
25494 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25495 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25496
25497 unsigned NumElems = InVT.getVectorNumElements();
25498 SmallVector<int,8> ShufMask(NumElems, -1);
25499 for (unsigned i = 0; i != NumElems/2; ++i)
25500 ShufMask[i] = i + NumElems/2;
25501
25502 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25503 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25504
25505 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25506}
25507
25508/// Change a vector store into a pair of half-size vector stores.
25509 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25510 SDValue StoredVal = Store->getValue();
25511 assert((StoredVal.getValueType().is256BitVector() ||
25512 StoredVal.getValueType().is512BitVector()) &&
25513 "Expecting 256/512-bit op");
25514
25515 // Splitting volatile memory ops is not allowed unless the operation was not
25516 // legal to begin with. Assume the input store is legal (this transform is
25517 // only used for targets with AVX). Note: It is possible that we have an
25518 // illegal type like v2i128, and so we could allow splitting a volatile store
25519 // in that case if that is important.
25520 if (!Store->isSimple())
25521 return SDValue();
25522
25523 SDLoc DL(Store);
25524 SDValue Value0, Value1;
25525 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25526 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25527 SDValue Ptr0 = Store->getBasePtr();
25528 SDValue Ptr1 =
25529 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25530 SDValue Ch0 =
25531 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25532 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25533 SDValue Ch1 =
25534 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25535 Store->getPointerInfo().getWithOffset(HalfOffset),
25536 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25537 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25538}
25539
25540/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25541/// type.
25542 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25543 SelectionDAG &DAG) {
25544 SDValue StoredVal = Store->getValue();
25545 assert(StoreVT.is128BitVector() &&
25546 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25547 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25548
25549 // Splitting volatile memory ops is not allowed unless the operation was not
25550 // legal to begin with. We are assuming the input op is legal (this transform
25551 // is only used for targets with AVX).
25552 if (!Store->isSimple())
25553 return SDValue();
25554
25555 MVT StoreSVT = StoreVT.getScalarType();
25556 unsigned NumElems = StoreVT.getVectorNumElements();
25557 unsigned ScalarSize = StoreSVT.getStoreSize();
25558
25559 SDLoc DL(Store);
25560 SmallVector<SDValue, 4> Stores;
25561 for (unsigned i = 0; i != NumElems; ++i) {
25562 unsigned Offset = i * ScalarSize;
25563 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25564 TypeSize::getFixed(Offset), DL);
25565 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25566 DAG.getVectorIdxConstant(i, DL));
25567 SDValue Ch =
25568 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25569 Store->getPointerInfo().getWithOffset(Offset),
25570 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25571 Stores.push_back(Ch);
25572 }
25573 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25574}
25575
25576static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25577 SelectionDAG &DAG) {
25578 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25579 SDLoc dl(St);
25580 SDValue StoredVal = St->getValue();
25581
25582 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25583 if (StoredVal.getValueType().isVector() &&
25584 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25585 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25586 assert(NumElts <= 8 && "Unexpected VT");
25587 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25588 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25589 "Expected AVX512F without AVX512DQI");
25590
25591 // We must pad with zeros to ensure we store zeroes to any unused bits.
25592 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25593 DAG.getUNDEF(MVT::v16i1), StoredVal,
25594 DAG.getVectorIdxConstant(0, dl));
25595 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25596 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25597 // Make sure we store zeros in the extra bits.
25598 if (NumElts < 8)
25599 StoredVal = DAG.getZeroExtendInReg(
25600 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25601
25602 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25603 St->getPointerInfo(), St->getBaseAlign(),
25604 St->getMemOperand()->getFlags());
25605 }
25606
25607 if (St->isTruncatingStore())
25608 return SDValue();
25609
25610 // If this is a 256/512-bit store of concatenated ops, we are better off
25611 // splitting that store into two half-size stores. This avoids spurious use of
25612 // concatenated ops and each half can execute independently. Some cores would
25613 // split the op into halves anyway, so the concat is purely an extra op.
25614 MVT StoreVT = StoredVal.getSimpleValueType();
25615 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25616 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25617 return splitVectorStore(St, DAG);
25618 return SDValue();
25619 }
25620
25621 if (StoreVT.is32BitVector())
25622 return SDValue();
25623
25624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25625 assert(StoreVT.is64BitVector() && "Unexpected VT");
25626 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25627 TargetLowering::TypeWidenVector &&
25628 "Unexpected type action!");
25629
25630 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25631 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25632 DAG.getUNDEF(StoreVT));
25633
25634 if (Subtarget.hasSSE2()) {
25635 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25636 // and store it.
25637 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25638 MVT CastVT = MVT::getVectorVT(StVT, 2);
25639 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25640 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25641 DAG.getVectorIdxConstant(0, dl));
25642
25643 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25644 St->getPointerInfo(), St->getBaseAlign(),
25645 St->getMemOperand()->getFlags());
25646 }
25647 assert(Subtarget.hasSSE1() && "Expected SSE");
25648 SDVTList Tys = DAG.getVTList(MVT::Other);
25649 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25650 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25651 St->getMemOperand());
25652}
25653
25654// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25655// may emit an illegal shuffle but the expansion is still better than scalar
25656// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25657 // we'll emit a shuffle and an arithmetic shift.
25658// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25659// TODO: It is possible to support ZExt by zeroing the undef values during
25660// the shuffle phase or after the shuffle.
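// Sketch of the idea (illustrative): a sextload from v4i8 to v4i32 can be
// lowered as a 32-bit scalar load, a shuffle that places each byte into the
// high byte of its destination dword, and an arithmetic shift right by 24 to
// replicate the sign bits.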
25661static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25662 SelectionDAG &DAG) {
25663 MVT RegVT = Op.getSimpleValueType();
25664 assert(RegVT.isVector() && "We only custom lower vector loads.");
25665 assert(RegVT.isInteger() &&
25666 "We only custom lower integer vector loads.");
25667
25668 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25669 SDLoc dl(Ld);
25670
25671 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25672 if (RegVT.getVectorElementType() == MVT::i1) {
25673 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25674 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25675 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25676 "Expected AVX512F without AVX512DQI");
25677
25678 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25679 Ld->getPointerInfo(), Ld->getBaseAlign(),
25680 Ld->getMemOperand()->getFlags());
25681
25682 // Replace chain users with the new chain.
25683 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25684
25685 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25686 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25687 DAG.getBitcast(MVT::v16i1, Val),
25688 DAG.getVectorIdxConstant(0, dl));
25689 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25690 }
25691
25692 return SDValue();
25693}
25694
25695/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25696/// each of which has no other use apart from the AND / OR.
25697static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25698 Opc = Op.getOpcode();
25699 if (Opc != ISD::OR && Opc != ISD::AND)
25700 return false;
25701 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25702 Op.getOperand(0).hasOneUse() &&
25703 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25704 Op.getOperand(1).hasOneUse());
25705}
25706
25707SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25708 SDValue Chain = Op.getOperand(0);
25709 SDValue Cond = Op.getOperand(1);
25710 SDValue Dest = Op.getOperand(2);
25711 SDLoc dl(Op);
25712
25713 // Bail out when we don't have native compare instructions.
25714 if (Cond.getOpcode() == ISD::SETCC &&
25715 Cond.getOperand(0).getValueType() != MVT::f128 &&
25716 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25717 SDValue LHS = Cond.getOperand(0);
25718 SDValue RHS = Cond.getOperand(1);
25719 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25720
25721 // Special case for
25722 // setcc([su]{add,sub,mul}o == 0)
25723 // setcc([su]{add,sub,mul}o != 1)
25724 if (ISD::isOverflowIntrOpRes(LHS) &&
25725 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25726 (isNullConstant(RHS) || isOneConstant(RHS))) {
25727 SDValue Value, Overflow;
25728 X86::CondCode X86Cond;
25729 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25730
25731 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25732 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25733
25734 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25735 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25736 Overflow, Op->getFlags());
25737 }
25738
25739 if (LHS.getSimpleValueType().isInteger()) {
25740 SDValue CCVal;
25741 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25742 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25743 EFLAGS, Op->getFlags());
25744 }
25745
25746 if (CC == ISD::SETOEQ) {
25747 // For FCMP_OEQ, we can emit
25748 // two branches instead of an explicit AND instruction with a
25749 // separate test. However, we only do this if this block doesn't
25750 // have a fall-through edge, because this requires an explicit
25751 // jmp when the condition is false.
25752 if (Op.getNode()->hasOneUse()) {
25753 SDNode *User = *Op.getNode()->user_begin();
25754 // Look for an unconditional branch following this conditional branch.
25755 // We need this because we need to reverse the successors in order
25756 // to implement FCMP_OEQ.
25757 if (User->getOpcode() == ISD::BR) {
25758 SDValue FalseBB = User->getOperand(1);
25759 SDNode *NewBR =
25760 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25761 assert(NewBR == User);
25762 (void)NewBR;
25763 Dest = FalseBB;
25764
25765 SDValue Cmp =
25766 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25767 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25768 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25769 CCVal, Cmp, Op->getFlags());
25770 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25771 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25772 Cmp, Op->getFlags());
25773 }
25774 }
25775 } else if (CC == ISD::SETUNE) {
25776 // For FCMP_UNE, we can emit
25777 // two branches instead of an explicit OR instruction with a
25778 // separate test.
25779 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25780 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25781 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25782 Cmp, Op->getFlags());
25783 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25784 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25785 Cmp, Op->getFlags());
25786 } else {
25787 X86::CondCode X86Cond =
25788 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25789 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25790 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25791 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25792 Cmp, Op->getFlags());
25793 }
25794 }
25795
25796 if (ISD::isOverflowIntrOpRes(Cond)) {
25797 SDValue Value, Overflow;
25798 X86::CondCode X86Cond;
25799 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25800
25801 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25802 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25803 Overflow, Op->getFlags());
25804 }
25805
25806 // Look past the truncate if the high bits are known zero.
25807 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25808 Cond = Cond.getOperand(0);
25809
25810 EVT CondVT = Cond.getValueType();
25811
25812 // Add an AND with 1 if we don't already have one.
25813 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25814 Cond =
25815 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25816
25817 SDValue LHS = Cond;
25818 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25819
25820 SDValue CCVal;
25821 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25822 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25823 Op->getFlags());
25824}
25825
25826// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25827// Calls to _alloca are needed to probe the stack when allocating more than 4k
25828// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25829// that the guard pages used by the OS virtual memory manager are allocated in
25830// correct sequence.
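// Illustrative example: a dynamic allocation of 16 KiB cannot be a single
// 16 KiB stack-pointer adjustment, since that could jump past the guard page;
// the probe routine touches one location per 4 KiB page so the OS commits
// each guard page in order.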
25831SDValue
25832X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25833 SelectionDAG &DAG) const {
25834 MachineFunction &MF = DAG.getMachineFunction();
25835 bool SplitStack = MF.shouldSplitStack();
25836 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25837 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25838 SplitStack || EmitStackProbeCall;
25839 SDLoc dl(Op);
25840
25841 // Get the inputs.
25842 SDNode *Node = Op.getNode();
25843 SDValue Chain = Op.getOperand(0);
25844 SDValue Size = Op.getOperand(1);
25845 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25846 EVT VT = Node->getValueType(0);
25847
25848 // Chain the dynamic stack allocation so that it doesn't modify the stack
25849 // pointer when other instructions are using the stack.
25850 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25851
25852 bool Is64Bit = Subtarget.is64Bit();
25853 MVT SPTy = Op.getValueType().getSimpleVT();
25854
25855 SDValue Result;
25856 if (!Lower) {
25857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25858 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25859 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25860 " not tell us which reg is the stack pointer!");
25861
25862 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25863 const Align StackAlign = TFI.getStackAlign();
25864 if (hasInlineStackProbe(MF)) {
25865 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25866 {Chain, Size});
25867 Chain = Result.getValue(1);
25868 } else {
25869 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25870 Chain = SP.getValue(1);
25871 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25872 }
25873 if (Alignment && *Alignment > StackAlign)
25874 Result = DAG.getNode(
25875 ISD::AND, dl, VT, Result,
25876 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25878 } else if (SplitStack) {
25879 if (Is64Bit) {
25880 // The 64-bit implementation of segmented stacks needs to clobber both r10
25881 // and r11. This makes it impossible to use it along with nested parameters.
25882 const Function &F = MF.getFunction();
25883 for (const auto &A : F.args()) {
25884 if (A.hasNestAttr())
25885 report_fatal_error("Cannot use segmented stacks with functions that "
25886 "have nested arguments.");
25887 }
25888 }
25889
25890 Result =
25891 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25892 Chain = Result.getValue(1);
25893 } else {
25894 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25895 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25896 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25897
25898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25899 Register SPReg = RegInfo->getStackRegister();
25900 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25901 Chain = SP.getValue(1);
25902
25903 if (Alignment) {
25904 SP = DAG.getNode(
25905 ISD::AND, dl, VT, SP.getValue(0),
25906 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25907 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25908 }
25909
25910 Result = SP;
25911 }
25912
25913 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25914
25915 SDValue Ops[2] = {Result, Chain};
25916 return DAG.getMergeValues(Ops, dl);
25917}
25918
25919SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25920 MachineFunction &MF = DAG.getMachineFunction();
25921 SDValue Ptr = Op.getOperand(1);
25922 EVT PtrVT = Ptr.getValueType();
25923
25924 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25925
25926 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25927 SDLoc DL(Op);
25928
25929 if (!Subtarget.is64Bit() ||
25930 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25931 // vastart just stores the address of the VarArgsFrameIndex slot into the
25932 // memory location argument.
25933 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25934 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25935 }
25936
25937 // __va_list_tag:
25938 // gp_offset (0 - 6 * 8)
25939 // fp_offset (48 - 48 + 8 * 16)
25940 // overflow_arg_area (point to parameters coming in memory).
25941 // reg_save_area
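// Illustrative sketch of the layout populated below (LP64 offsets; the x32
// ABI uses 4-byte pointers, hence the 8-vs-4 and 16-vs-12 offsets):
//   struct __va_list_tag {
//     unsigned gp_offset;       // +0
//     unsigned fp_offset;       // +4
//     void *overflow_arg_area;  // +8
//     void *reg_save_area;      // +16
//   };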
25942 SmallVector<SDValue, 4> MemOps;
25943 SDValue FIN = Op.getOperand(1);
25944 // Store gp_offset
25945 SDValue Store = DAG.getStore(
25946 Op.getOperand(0), DL,
25947 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25948 MachinePointerInfo(SV));
25949 MemOps.push_back(Store);
25950
25951 // Store fp_offset
25952 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25953 Store = DAG.getStore(
25954 Op.getOperand(0), DL,
25955 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25956 MachinePointerInfo(SV, 4));
25957 MemOps.push_back(Store);
25958
25959 // Store ptr to overflow_arg_area
25960 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25961 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25962 Store =
25963 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25964 MemOps.push_back(Store);
25965
25966 // Store ptr to reg_save_area.
25967 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25968 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25969 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25970 Store = DAG.getStore(
25971 Op.getOperand(0), DL, RSFIN, FIN,
25972 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25973 MemOps.push_back(Store);
25974 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25975}
25976
25977SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25978 assert(Subtarget.is64Bit() &&
25979 "LowerVAARG only handles 64-bit va_arg!");
25980 assert(Op.getNumOperands() == 4);
25981
25982 MachineFunction &MF = DAG.getMachineFunction();
25983 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25984 // The Win64 ABI uses char* instead of a structure.
25985 return DAG.expandVAArg(Op.getNode());
25986
25987 SDValue Chain = Op.getOperand(0);
25988 SDValue SrcPtr = Op.getOperand(1);
25989 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25990 unsigned Align = Op.getConstantOperandVal(3);
25991 SDLoc dl(Op);
25992
25993 EVT ArgVT = Op.getNode()->getValueType(0);
25994 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25995 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25996 uint8_t ArgMode;
25997
25998 // Decide which area this value should be read from.
25999 // TODO: Implement the AMD64 ABI in its entirety. This simple
26000 // selection mechanism works only for the basic types.
26001 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26002 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26003 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26004 } else {
26005 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26006 "Unhandled argument type in LowerVAARG");
26007 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26008 }
26009
26010 if (ArgMode == 2) {
26011 // Make sure using fp_offset makes sense.
26012 assert(!Subtarget.useSoftFloat() &&
26013 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26014 Subtarget.hasSSE1());
26015 }
26016
26017 // Insert VAARG node into the DAG
26018 // VAARG returns two values: Variable Argument Address, Chain
26019 SDValue InstOps[] = {Chain, SrcPtr,
26020 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26021 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26022 DAG.getTargetConstant(Align, dl, MVT::i32)};
26023 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26024 SDValue VAARG = DAG.getMemIntrinsicNode(
26025 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26026 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26027 /*Alignment=*/std::nullopt,
26028 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26029 Chain = VAARG.getValue(1);
26030
26031 // Load the next argument and return it
26032 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26033}
26034
26035static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26036 SelectionDAG &DAG) {
26037 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26038 // where a va_list is still an i8*.
26039 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26040 if (Subtarget.isCallingConvWin64(
26041 DAG.getMachineFunction().getFunction().getCallingConv()))
26042 // Probably a Win64 va_copy.
26043 return DAG.expandVACopy(Op.getNode());
26044
26045 SDValue Chain = Op.getOperand(0);
26046 SDValue DstPtr = Op.getOperand(1);
26047 SDValue SrcPtr = Op.getOperand(2);
26048 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26049 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26050 SDLoc DL(Op);
26051
26052 return DAG.getMemcpy(
26053 Chain, DL, DstPtr, SrcPtr,
26054 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26055 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26056 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26057 MachinePointerInfo(SrcSV));
26058}
26059
26060// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26061static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26062 switch (Opc) {
26063 case ISD::SHL:
26064 case X86ISD::VSHL:
26065 case X86ISD::VSHLI:
26066 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26067 case ISD::SRL:
26068 case X86ISD::VSRL:
26069 case X86ISD::VSRLI:
26070 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26071 case ISD::SRA:
26072 case X86ISD::VSRA:
26073 case X86ISD::VSRAI:
26074 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26075 }
26076 llvm_unreachable("Unknown target vector shift node");
26077}
26078
26079/// Handle vector element shifts where the shift amount is a constant.
26080/// Takes immediate version of shift as input.
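/// For example (illustrative): an X86ISD::VSRAI of a v8i16 vector by 20 is
/// clamped below to a shift by 15, while a logical shift by 20 folds to the
/// zero vector.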
26081static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26082 SDValue SrcOp, uint64_t ShiftAmt,
26083 SelectionDAG &DAG) {
26084 MVT ElementType = VT.getVectorElementType();
26085
26086 // Bitcast the source vector to the output type, this is mainly necessary for
26087 // vXi8/vXi64 shifts.
26088 if (VT != SrcOp.getSimpleValueType())
26089 SrcOp = DAG.getBitcast(VT, SrcOp);
26090
26091 // Fold this packed shift into its first operand if ShiftAmt is 0.
26092 if (ShiftAmt == 0)
26093 return SrcOp;
26094
26095 // Check for ShiftAmt >= element width
26096 if (ShiftAmt >= ElementType.getSizeInBits()) {
26097 if (Opc == X86ISD::VSRAI)
26098 ShiftAmt = ElementType.getSizeInBits() - 1;
26099 else
26100 return DAG.getConstant(0, dl, VT);
26101 }
26102
26104 && "Unknown target vector shift-by-constant node");
26105
26106 // Fold this packed vector shift into a build vector if SrcOp is a
26107 // vector of Constants or UNDEFs.
26108 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26109 unsigned ShiftOpc;
26110 switch (Opc) {
26111 default: llvm_unreachable("Unknown opcode!");
26112 case X86ISD::VSHLI:
26113 ShiftOpc = ISD::SHL;
26114 break;
26115 case X86ISD::VSRLI:
26116 ShiftOpc = ISD::SRL;
26117 break;
26118 case X86ISD::VSRAI:
26119 ShiftOpc = ISD::SRA;
26120 break;
26121 }
26122
26123 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26124 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26125 return C;
26126 }
26127
26128 return DAG.getNode(Opc, dl, VT, SrcOp,
26129 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26130}
26131
26132/// Handle vector element shifts by a splat shift amount
26133static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26134 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26135 const X86Subtarget &Subtarget,
26136 SelectionDAG &DAG) {
26137 MVT AmtVT = ShAmt.getSimpleValueType();
26138 assert(AmtVT.isVector() && "Vector shift type mismatch");
26139 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26140 "Illegal vector splat index");
26141
26142 // Move the splat element to the bottom element.
26143 if (ShAmtIdx != 0) {
26144 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26145 Mask[0] = ShAmtIdx;
26146 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26147 }
26148
26149 // Peek through any zext node if we can get back to a 128-bit source.
26150 if (AmtVT.getScalarSizeInBits() == 64 &&
26151 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26152 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26153 ShAmt.getOperand(0).getValueType().isSimple() &&
26154 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26155 ShAmt = ShAmt.getOperand(0);
26156 AmtVT = ShAmt.getSimpleValueType();
26157 }
26158
26159 // See if we can mask off the upper elements using the existing source node.
26160 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26161 // do this for vXi64 types.
26162 bool IsMasked = false;
26163 if (AmtVT.getScalarSizeInBits() < 64) {
26164 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26165 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26166 // If the shift amount has come from a scalar, then zero-extend the scalar
26167 // before moving to the vector.
26168 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26169 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26170 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26171 AmtVT = MVT::v4i32;
26172 IsMasked = true;
26173 } else if (ShAmt.getOpcode() == ISD::AND) {
26174 // See if the shift amount is already masked (e.g. for rotation modulo),
26175 // then we can zero-extend it by setting all the other mask elements to
26176 // zero.
26177 SmallVector<SDValue> MaskElts(
26178 AmtVT.getVectorNumElements(),
26179 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26180 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26181 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26182 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26183 {ShAmt.getOperand(1), Mask}))) {
26184 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26185 IsMasked = true;
26186 }
26187 }
26188 }
26189
26190 // Extract if the shift amount vector is larger than 128-bits.
26191 if (AmtVT.getSizeInBits() > 128) {
26192 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26193 AmtVT = ShAmt.getSimpleValueType();
26194 }
26195
26196 // Zero-extend bottom element to v2i64 vector type, either by extension or
26197 // shuffle masking.
26198 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26199 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26200 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26201 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26202 } else if (Subtarget.hasSSE41()) {
26203 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26204 MVT::v2i64, ShAmt);
26205 } else {
26206 SDValue ByteShift = DAG.getTargetConstant(
26207 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26208 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26209 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26210 ByteShift);
26211 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26212 ByteShift);
26213 }
26214 }
26215
26216 // Change opcode to non-immediate version.
26217 Opc = getTargetVShiftUniformOpcode(Opc, true);
26218
26219 // The return type has to be a 128-bit type with the same element
26220 // type as the input type.
26221 MVT EltVT = VT.getVectorElementType();
26222 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26223
26224 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26225 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26226}
26227
26228/// Return Mask with the necessary casting or extending
26229/// for \p Mask according to \p MaskVT when lowering masking intrinsics
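/// Illustratively: an i16 mask with MaskVT == v16i1 is simply bitcast, while
/// an i8 mask with MaskVT == v2i1 is bitcast to v8i1 and its low 2 lanes are
/// extracted.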
26230static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26231 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26232 const SDLoc &dl) {
26233
26234 if (isAllOnesConstant(Mask))
26235 return DAG.getConstant(1, dl, MaskVT);
26236 if (X86::isZeroNode(Mask))
26237 return DAG.getConstant(0, dl, MaskVT);
26238
26239 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26240
26241 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26242 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26243 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26244 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26245 SDValue Lo, Hi;
26246 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26247 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26248 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26250 } else {
26251 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26252 Mask.getSimpleValueType().getSizeInBits());
26253 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
26254 // are extracted by EXTRACT_SUBVECTOR.
26255 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26256 DAG.getBitcast(BitcastVT, Mask),
26257 DAG.getVectorIdxConstant(0, dl));
26258 }
26259}
26260
26261/// Return (and \p Op, \p Mask) for compare instructions or
26262/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26263/// necessary casting or extending for \p Mask when lowering masking intrinsics
26264 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 MVT VT = Op.getSimpleValueType();
26269 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26270 unsigned OpcodeSelect = ISD::VSELECT;
26271 SDLoc dl(Op);
26272
26273 if (isAllOnesConstant(Mask))
26274 return Op;
26275
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277
26278 if (PreservedSrc.isUndef())
26279 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26280 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26281}
26282
26283/// Creates an SDNode for a predicated scalar operation.
26284/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26285/// The mask is coming as MVT::i8 and it should be transformed
26286/// to MVT::v1i1 while lowering masking intrinsics.
26287/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26288/// "X86select" instead of "vselect". We just can't create the "vselect" node
26289/// for a scalar instruction.
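/// Illustratively: given an i8 mask M, the low lane of the result is the
/// freshly computed scalar when (M & 1) is set, and PreservedSrc's low lane
/// (or zero when PreservedSrc is undef) otherwise.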
26290 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26291 SDValue PreservedSrc,
26292 const X86Subtarget &Subtarget,
26293 SelectionDAG &DAG) {
26294 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26295 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26296 return Op;
26297
26298 MVT VT = Op.getSimpleValueType();
26299 SDLoc dl(Op);
26300
26301 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26302 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26303 DAG.getBitcast(MVT::v8i1, Mask),
26304 DAG.getVectorIdxConstant(0, dl));
26305 if (Op.getOpcode() == X86ISD::FSETCCM ||
26306 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26307 Op.getOpcode() == X86ISD::VFPCLASSS)
26308 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26309
26310 if (PreservedSrc.isUndef())
26311 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26312
26313 if (MaskConst) {
26314 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26315 // Discard op and blend passthrough with scalar op src/dst.
26316 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26317 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26318 ShuffleMask[0] = VT.getVectorNumElements();
26319 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26320 ShuffleMask);
26321 }
26322
26323 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26324}
26325
26326 static int getSEHRegistrationNodeSize(const Function *Fn) {
26327 if (!Fn->hasPersonalityFn())
26328 report_fatal_error(
26329 "querying registration node size for function without personality");
26330 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26331 // WinEHStatePass for the full struct definition.
26332 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26333 case EHPersonality::MSVC_X86SEH: return 24;
26334 case EHPersonality::MSVC_CXX: return 16;
26335 default: break;
26336 }
26338 "can only recover FP for 32-bit MSVC EH personality functions");
26339}
26340
26341/// When the MSVC runtime transfers control to us, either to an outlined
26342/// function or when returning to a parent frame after catching an exception, we
26343/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26344/// Here's the math:
26345/// RegNodeBase = EntryEBP - RegNodeSize
26346/// ParentFP = RegNodeBase - ParentFrameOffset
26347/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26348/// subtracting the offset (negative on x86) takes us back to the parent FP.
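/// Worked example with made-up numbers: EntryEBP = 0x00A0F000 and
/// RegNodeSize = 24 (SEH) give RegNodeBase = 0x00A0EFE8; with
/// ParentFrameOffset = -16, ParentFP = 0x00A0EFE8 - (-16) = 0x00A0EFF8.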
26349 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26350 SDValue EntryEBP) {
26351 MachineFunction &MF = DAG.getMachineFunction();
26352 SDLoc dl;
26353
26354 // It's possible that the parent function no longer has a personality function
26355 // if the exceptional code was optimized away, in which case we just return
26356 // the incoming EBP.
26357 if (!Fn->hasPersonalityFn())
26358 return EntryEBP;
26359
26360 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26361 // registration, or the .set_setframe offset.
26362 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26363 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26364 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26365 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26366 SDValue ParentFrameOffset =
26367 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26368
26369 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26370 // prologue to RBP in the parent function.
26371 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26372 if (Subtarget.is64Bit())
26373 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26374
26375 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26376 // RegNodeBase = EntryEBP - RegNodeSize
26377 // ParentFP = RegNodeBase - ParentFrameOffset
26378 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26379 DAG.getConstant(RegNodeSize, dl, PtrVT));
26380 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26381}
26382
26383SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26384 SelectionDAG &DAG) const {
26385 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26386 auto isRoundModeCurDirection = [](SDValue Rnd) {
26387 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26388 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26389
26390 return false;
26391 };
26392 auto isRoundModeSAE = [](SDValue Rnd) {
26393 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26394 unsigned RC = C->getZExtValue();
26395 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26396 // Clear the NO_EXC bit and check remaining bits.
26397 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26398 // As a convenience we allow no other bits or explicitly
26399 // current direction.
26400 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26401 }
26402 }
26403
26404 return false;
26405 };
26406 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26407 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26408 RC = C->getZExtValue();
26409 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26410 // Clear the NO_EXC bit and check remaining bits.
26411 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26412 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26413 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26414 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26415 RC == X86::STATIC_ROUNDING::TO_ZERO;
26416 }
26417 }
26418
26419 return false;
26420 };
26421
26422 SDLoc dl(Op);
26423 unsigned IntNo = Op.getConstantOperandVal(0);
26424 MVT VT = Op.getSimpleValueType();
26425 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26426
26427 // Propagate flags from original node to transformed node(s).
26428 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26429
26430 if (IntrData) {
26431 switch(IntrData->Type) {
26432 case INTR_TYPE_1OP: {
26433 // We specify 2 possible opcodes for intrinsics with rounding modes.
26434 // First, we check if the intrinsic may have non-default rounding mode,
26435 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26436 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26437 if (IntrWithRoundingModeOpcode != 0) {
26438 SDValue Rnd = Op.getOperand(2);
26439 unsigned RC = 0;
26440 if (isRoundModeSAEToX(Rnd, RC))
26441 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26442 Op.getOperand(1),
26443 DAG.getTargetConstant(RC, dl, MVT::i32));
26444 if (!isRoundModeCurDirection(Rnd))
26445 return SDValue();
26446 }
26447 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26448 Op.getOperand(1));
26449 }
26450 case INTR_TYPE_1OP_SAE: {
26451 SDValue Sae = Op.getOperand(2);
26452
26453 unsigned Opc;
26454 if (isRoundModeCurDirection(Sae))
26455 Opc = IntrData->Opc0;
26456 else if (isRoundModeSAE(Sae))
26457 Opc = IntrData->Opc1;
26458 else
26459 return SDValue();
26460
26461 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26462 }
26463 case INTR_TYPE_2OP: {
26464 SDValue Src2 = Op.getOperand(2);
26465
26466 // We specify 2 possible opcodes for intrinsics with rounding modes.
26467 // First, we check if the intrinsic may have non-default rounding mode,
26468 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26469 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26470 if (IntrWithRoundingModeOpcode != 0) {
26471 SDValue Rnd = Op.getOperand(3);
26472 unsigned RC = 0;
26473 if (isRoundModeSAEToX(Rnd, RC))
26474 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26475 Op.getOperand(1), Src2,
26476 DAG.getTargetConstant(RC, dl, MVT::i32));
26477 if (!isRoundModeCurDirection(Rnd))
26478 return SDValue();
26479 }
26480
26481 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26482 Op.getOperand(1), Src2);
26483 }
26484 case INTR_TYPE_2OP_SAE: {
26485 SDValue Sae = Op.getOperand(3);
26486
26487 unsigned Opc;
26488 if (isRoundModeCurDirection(Sae))
26489 Opc = IntrData->Opc0;
26490 else if (isRoundModeSAE(Sae))
26491 Opc = IntrData->Opc1;
26492 else
26493 return SDValue();
26494
26495 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26496 Op.getOperand(2));
26497 }
26498 case INTR_TYPE_3OP:
26499 case INTR_TYPE_3OP_IMM8: {
26500 SDValue Src1 = Op.getOperand(1);
26501 SDValue Src2 = Op.getOperand(2);
26502 SDValue Src3 = Op.getOperand(3);
26503
26504 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26505 Src3.getValueType() != MVT::i8) {
26506 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26507 }
26508
26509 // We specify 2 possible opcodes for intrinsics with rounding modes.
26510 // First, we check if the intrinsic may have non-default rounding mode,
26511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26513 if (IntrWithRoundingModeOpcode != 0) {
26514 SDValue Rnd = Op.getOperand(4);
26515 unsigned RC = 0;
26516 if (isRoundModeSAEToX(Rnd, RC))
26517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26518 Src1, Src2, Src3,
26519 DAG.getTargetConstant(RC, dl, MVT::i32));
26520 if (!isRoundModeCurDirection(Rnd))
26521 return SDValue();
26522 }
26523
26524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26525 {Src1, Src2, Src3});
26526 }
26527 case INTR_TYPE_4OP_IMM8: {
26528 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26529 SDValue Src4 = Op.getOperand(4);
26530 if (Src4.getValueType() != MVT::i8) {
26531 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26532 }
26533
26534 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26535 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26536 Src4);
26537 }
26538 case INTR_TYPE_1OP_MASK: {
26539 SDValue Src = Op.getOperand(1);
26540 SDValue PassThru = Op.getOperand(2);
26541 SDValue Mask = Op.getOperand(3);
26542 // We add rounding mode to the Node when
26543 // - RC Opcode is specified and
26544 // - RC is not "current direction".
26545 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26546 if (IntrWithRoundingModeOpcode != 0) {
26547 SDValue Rnd = Op.getOperand(4);
26548 unsigned RC = 0;
26549 if (isRoundModeSAEToX(Rnd, RC))
26550 return getVectorMaskingNode(
26551 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26552 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26553 Mask, PassThru, Subtarget, DAG);
26554 if (!isRoundModeCurDirection(Rnd))
26555 return SDValue();
26556 }
26557 return getVectorMaskingNode(
26558 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26559 Subtarget, DAG);
26560 }
26561 case INTR_TYPE_1OP_MASK_SAE: {
26562 SDValue Src = Op.getOperand(1);
26563 SDValue PassThru = Op.getOperand(2);
26564 SDValue Mask = Op.getOperand(3);
26565 SDValue Rnd = Op.getOperand(4);
26566
26567 unsigned Opc;
26568 if (isRoundModeCurDirection(Rnd))
26569 Opc = IntrData->Opc0;
26570 else if (isRoundModeSAE(Rnd))
26571 Opc = IntrData->Opc1;
26572 else
26573 return SDValue();
26574
26575 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26576 Subtarget, DAG);
26577 }
26578 case INTR_TYPE_SCALAR_MASK: {
26579 SDValue Src1 = Op.getOperand(1);
26580 SDValue Src2 = Op.getOperand(2);
26581 SDValue passThru = Op.getOperand(3);
26582 SDValue Mask = Op.getOperand(4);
26583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26584 // There are 2 kinds of intrinsics in this group:
26585 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26586 // (2) With rounding mode and sae - 7 operands.
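// Illustratively, counting the intrinsic id as operand 0: form (1) is
// (id, src1, src2, passthru, mask, rm-or-sae) - 6 operands; form (2) is
// (id, src1, src2, passthru, mask, rm, sae) - 7 operands, matching the
// getOperand indices used below.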
26587 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26588 if (Op.getNumOperands() == (5U + HasRounding)) {
26589 if (HasRounding) {
26590 SDValue Rnd = Op.getOperand(5);
26591 unsigned RC = 0;
26592 if (isRoundModeSAEToX(Rnd, RC))
26593 return getScalarMaskingNode(
26594 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26595 DAG.getTargetConstant(RC, dl, MVT::i32)),
26596 Mask, passThru, Subtarget, DAG);
26597 if (!isRoundModeCurDirection(Rnd))
26598 return SDValue();
26599 }
26600 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26601 Src2),
26602 Mask, passThru, Subtarget, DAG);
26603 }
26604
26605 assert(Op.getNumOperands() == (6U + HasRounding) &&
26606 "Unexpected intrinsic form");
26607 SDValue RoundingMode = Op.getOperand(5);
26608 unsigned Opc = IntrData->Opc0;
26609 if (HasRounding) {
26610 SDValue Sae = Op.getOperand(6);
26611 if (isRoundModeSAE(Sae))
26612 Opc = IntrWithRoundingModeOpcode;
26613 else if (!isRoundModeCurDirection(Sae))
26614 return SDValue();
26615 }
26616 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26617 Src2, RoundingMode),
26618 Mask, passThru, Subtarget, DAG);
26619 }
26620 case INTR_TYPE_SCALAR_MASK_RND: {
26621 SDValue Src1 = Op.getOperand(1);
26622 SDValue Src2 = Op.getOperand(2);
26623 SDValue passThru = Op.getOperand(3);
26624 SDValue Mask = Op.getOperand(4);
26625 SDValue Rnd = Op.getOperand(5);
26626
26627 SDValue NewOp;
26628 unsigned RC = 0;
26629 if (isRoundModeCurDirection(Rnd))
26630 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26631 else if (isRoundModeSAEToX(Rnd, RC))
26632 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26633 DAG.getTargetConstant(RC, dl, MVT::i32));
26634 else
26635 return SDValue();
26636
26637 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26638 }
26639 case INTR_TYPE_SCALAR_MASK_SAE: {
26640 SDValue Src1 = Op.getOperand(1);
26641 SDValue Src2 = Op.getOperand(2);
26642 SDValue passThru = Op.getOperand(3);
26643 SDValue Mask = Op.getOperand(4);
26644 SDValue Sae = Op.getOperand(5);
26645 unsigned Opc;
26646 if (isRoundModeCurDirection(Sae))
26647 Opc = IntrData->Opc0;
26648 else if (isRoundModeSAE(Sae))
26649 Opc = IntrData->Opc1;
26650 else
26651 return SDValue();
26652
26653 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26654 Mask, passThru, Subtarget, DAG);
26655 }
26656 case INTR_TYPE_2OP_MASK: {
26657 SDValue Src1 = Op.getOperand(1);
26658 SDValue Src2 = Op.getOperand(2);
26659 SDValue PassThru = Op.getOperand(3);
26660 SDValue Mask = Op.getOperand(4);
26661 SDValue NewOp;
26662 if (IntrData->Opc1 != 0) {
26663 SDValue Rnd = Op.getOperand(5);
26664 unsigned RC = 0;
26665 if (isRoundModeSAEToX(Rnd, RC))
26666 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26667 DAG.getTargetConstant(RC, dl, MVT::i32));
26668 else if (!isRoundModeCurDirection(Rnd))
26669 return SDValue();
26670 }
26671 if (!NewOp)
26672 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26673 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26674 }
26675 case INTR_TYPE_2OP_MASK_SAE: {
26676 SDValue Src1 = Op.getOperand(1);
26677 SDValue Src2 = Op.getOperand(2);
26678 SDValue PassThru = Op.getOperand(3);
26679 SDValue Mask = Op.getOperand(4);
26680
26681 unsigned Opc = IntrData->Opc0;
26682 if (IntrData->Opc1 != 0) {
26683 SDValue Sae = Op.getOperand(5);
26684 if (isRoundModeSAE(Sae))
26685 Opc = IntrData->Opc1;
26686 else if (!isRoundModeCurDirection(Sae))
26687 return SDValue();
26688 }
26689
26690 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26691 Mask, PassThru, Subtarget, DAG);
26692 }
26693 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26694 SDValue Src1 = Op.getOperand(1);
26695 SDValue Src2 = Op.getOperand(2);
26696 SDValue Src3 = Op.getOperand(3);
26697 SDValue PassThru = Op.getOperand(4);
26698 SDValue Mask = Op.getOperand(5);
26699 SDValue Sae = Op.getOperand(6);
26700 unsigned Opc;
26701 if (isRoundModeCurDirection(Sae))
26702 Opc = IntrData->Opc0;
26703 else if (isRoundModeSAE(Sae))
26704 Opc = IntrData->Opc1;
26705 else
26706 return SDValue();
26707
26708 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26709 Mask, PassThru, Subtarget, DAG);
26710 }
26711 case INTR_TYPE_3OP_MASK_SAE: {
26712 SDValue Src1 = Op.getOperand(1);
26713 SDValue Src2 = Op.getOperand(2);
26714 SDValue Src3 = Op.getOperand(3);
26715 SDValue PassThru = Op.getOperand(4);
26716 SDValue Mask = Op.getOperand(5);
26717
26718 unsigned Opc = IntrData->Opc0;
26719 if (IntrData->Opc1 != 0) {
26720 SDValue Sae = Op.getOperand(6);
26721 if (isRoundModeSAE(Sae))
26722 Opc = IntrData->Opc1;
26723 else if (!isRoundModeCurDirection(Sae))
26724 return SDValue();
26725 }
26726 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26727 Mask, PassThru, Subtarget, DAG);
26728 }
26729 case BLENDV: {
26730 SDValue Src1 = Op.getOperand(1);
26731 SDValue Src2 = Op.getOperand(2);
26732 SDValue Src3 = Op.getOperand(3);
26733
26734 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26735 Src3 = DAG.getBitcast(MaskVT, Src3);
26736
26737 // Reverse the operands to match VSELECT order.
26738 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26739 }
26740 case VPERM_2OP : {
26741 SDValue Src1 = Op.getOperand(1);
26742 SDValue Src2 = Op.getOperand(2);
26743
26744 // Swap Src1 and Src2 in the node creation
26745 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26746 }
26747 case CFMA_OP_MASKZ:
26748 case CFMA_OP_MASK: {
26749 SDValue Src1 = Op.getOperand(1);
26750 SDValue Src2 = Op.getOperand(2);
26751 SDValue Src3 = Op.getOperand(3);
26752 SDValue Mask = Op.getOperand(4);
26753 MVT VT = Op.getSimpleValueType();
26754
26755 SDValue PassThru = Src3;
26756 if (IntrData->Type == CFMA_OP_MASKZ)
26757 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26758
26759 // We add rounding mode to the Node when
26760 // - RC Opcode is specified and
26761 // - RC is not "current direction".
26762 SDValue NewOp;
26763 if (IntrData->Opc1 != 0) {
26764 SDValue Rnd = Op.getOperand(5);
26765 unsigned RC = 0;
26766 if (isRoundModeSAEToX(Rnd, RC))
26767 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26768 DAG.getTargetConstant(RC, dl, MVT::i32));
26769 else if (!isRoundModeCurDirection(Rnd))
26770 return SDValue();
26771 }
26772 if (!NewOp)
26773 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26774 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26775 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26776 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26778 }
26779 case IFMA_OP:
26780 // NOTE: We need to swizzle the operands to pass the multiply operands
26781 // first.
26782 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26783 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26784 case FPCLASSS: {
26785 SDValue Src1 = Op.getOperand(1);
26786 SDValue Imm = Op.getOperand(2);
26787 SDValue Mask = Op.getOperand(3);
26788 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26789 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26790 Subtarget, DAG);
26791 // Need to fill with zeros to ensure the bitcast will produce zeroes
26792 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26793 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26794 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26795 DAG.getVectorIdxConstant(0, dl));
26796 return DAG.getBitcast(MVT::i8, Ins);
26797 }
26798
26799 case CMP_MASK_CC: {
26800 MVT MaskVT = Op.getSimpleValueType();
26801 SDValue CC = Op.getOperand(3);
26802 SDValue Mask = Op.getOperand(4);
26803 // We specify 2 possible opcodes for intrinsics with rounding modes.
26804 // First, we check if the intrinsic may have non-default rounding mode,
26805 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26806 if (IntrData->Opc1 != 0) {
26807 SDValue Sae = Op.getOperand(5);
26808 if (isRoundModeSAE(Sae))
26809 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26810 Op.getOperand(2), CC, Mask, Sae);
26811 if (!isRoundModeCurDirection(Sae))
26812 return SDValue();
26813 }
26814 // Default rounding mode.
26815 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26816 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26817 }
26818 case CMP_MASK_SCALAR_CC: {
26819 SDValue Src1 = Op.getOperand(1);
26820 SDValue Src2 = Op.getOperand(2);
26821 SDValue CC = Op.getOperand(3);
26822 SDValue Mask = Op.getOperand(4);
26823
26824 SDValue Cmp;
26825 if (IntrData->Opc1 != 0) {
26826 SDValue Sae = Op.getOperand(5);
26827 if (isRoundModeSAE(Sae))
26828 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26829 else if (!isRoundModeCurDirection(Sae))
26830 return SDValue();
26831 }
26832 // Default rounding mode.
26833 if (!Cmp.getNode())
26834 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26835
26836 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26837 Subtarget, DAG);
26838 // Need to fill with zeros to ensure the bitcast will produce zeroes
26839 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26840 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26841 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26842 DAG.getVectorIdxConstant(0, dl));
26843 return DAG.getBitcast(MVT::i8, Ins);
26844 }
26845 case COMI: { // Comparison intrinsics
26846 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26847 SDValue LHS = Op.getOperand(1);
26848 SDValue RHS = Op.getOperand(2);
26849 // Some conditions require the operands to be swapped.
26850 if (CC == ISD::SETLT || CC == ISD::SETLE)
26851 std::swap(LHS, RHS);
26852
26853 // For AVX10.2, Support EQ and NE.
26854 bool HasAVX10_2_COMX =
26855 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26856
26857 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26858 // For BF type we need to fall back.
26859 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26860
26861 auto ComiOpCode = IntrData->Opc0;
26862 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26863
26864 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26865 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26866
26867 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26868
26869 SDValue SetCC;
26870 switch (CC) {
26871 case ISD::SETEQ: {
26872 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26873 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26874 break;
26875 // (ZF = 1 and PF = 0)
26876 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26877 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26878 break;
26879 }
26880 case ISD::SETNE: {
26881 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26882 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26883 break;
26884 // (ZF = 0 or PF = 1)
26885 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26886 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26887 break;
26888 }
26889 case ISD::SETGT: // (CF = 0 and ZF = 0)
26890 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26891 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26892 break;
26893 }
26894 case ISD::SETGE: // CF = 0
26895 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26896 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26897 break;
26898 default:
26899 llvm_unreachable("Unexpected illegal condition!");
26900 }
26901 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26902 }
26903 case COMI_RM: { // Comparison intrinsics with Sae
26904 SDValue LHS = Op.getOperand(1);
26905 SDValue RHS = Op.getOperand(2);
26906 unsigned CondVal = Op.getConstantOperandVal(3);
26907 SDValue Sae = Op.getOperand(4);
26908
26909 SDValue FCmp;
26910 if (isRoundModeCurDirection(Sae))
26911 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26912 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26913 else if (isRoundModeSAE(Sae))
26914 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26915 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26916 else
26917 return SDValue();
26918 // Need to fill with zeros to ensure the bitcast will produce zeroes
26919 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26920 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26921 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26922 DAG.getVectorIdxConstant(0, dl));
26923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26924 DAG.getBitcast(MVT::i16, Ins));
26925 }
26926 case VSHIFT: {
26927 SDValue SrcOp = Op.getOperand(1);
26928 SDValue ShAmt = Op.getOperand(2);
26929 assert(ShAmt.getValueType() == MVT::i32 &&
26930 "Unexpected VSHIFT amount type");
26931
26932 // Catch shift-by-constant.
26933 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26934 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26935 Op.getSimpleValueType(), SrcOp,
26936 CShAmt->getZExtValue(), DAG);
26937
26938 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26939 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26940 SrcOp, ShAmt, 0, Subtarget, DAG);
26941 }
26942 case COMPRESS_EXPAND_IN_REG: {
26943 SDValue Mask = Op.getOperand(3);
26944 SDValue DataToCompress = Op.getOperand(1);
26945 SDValue PassThru = Op.getOperand(2);
26946 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26947 return Op.getOperand(1);
26948
26949 // Avoid false dependency.
26950 if (PassThru.isUndef())
26951 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26952
26953 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26954 Mask);
26955 }
26956 case FIXUPIMM:
26957 case FIXUPIMM_MASKZ: {
26958 SDValue Src1 = Op.getOperand(1);
26959 SDValue Src2 = Op.getOperand(2);
26960 SDValue Src3 = Op.getOperand(3);
26961 SDValue Imm = Op.getOperand(4);
26962 SDValue Mask = Op.getOperand(5);
26963 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26964 ? Src1
26965 : getZeroVector(VT, Subtarget, DAG, dl);
26966
26967 unsigned Opc = IntrData->Opc0;
26968 if (IntrData->Opc1 != 0) {
26969 SDValue Sae = Op.getOperand(6);
26970 if (isRoundModeSAE(Sae))
26971 Opc = IntrData->Opc1;
26972 else if (!isRoundModeCurDirection(Sae))
26973 return SDValue();
26974 }
26975
26976 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26977
26978 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26979 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26980
26981 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26982 }
26983 case ROUNDP: {
26984 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26985 // Clear the upper bits of the rounding immediate so that the legacy
26986 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26987 uint64_t Round = Op.getConstantOperandVal(2);
26988 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26989 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26990 Op.getOperand(1), RoundingMode);
26991 }
26992 case ROUNDS: {
26993 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26994 // Clear the upper bits of the rounding immediate so that the legacy
26995 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26996 uint64_t Round = Op.getConstantOperandVal(3);
26997 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26998 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26999 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27000 }
27001 case BEXTRI: {
27002 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27003
27004 uint64_t Imm = Op.getConstantOperandVal(2);
27005 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27006 Op.getValueType());
27007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27008 Op.getOperand(1), Control);
27009 }
27010 // ADC/SBB
27011 case ADX: {
27012 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27013 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27014
27015 SDValue Res;
27016 // If the carry in is zero, then we should just use ADD/SUB instead of
27017 // ADC/SBB.
27018 if (isNullConstant(Op.getOperand(1))) {
27019 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27020 Op.getOperand(3));
27021 } else {
27022 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27023 DAG.getAllOnesConstant(dl, MVT::i8));
27024 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27025 Op.getOperand(3), GenCF.getValue(1));
27026 }
27027 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27028 SDValue Results[] = { SetCC, Res };
27029 return DAG.getMergeValues(Results, dl);
27030 }
27031 case CVTPD2PS_MASK:
27032 case CVTPD2DQ_MASK:
27033 case CVTQQ2PS_MASK:
27034 case TRUNCATE_TO_REG: {
27035 SDValue Src = Op.getOperand(1);
27036 SDValue PassThru = Op.getOperand(2);
27037 SDValue Mask = Op.getOperand(3);
27038
27039 if (isAllOnesConstant(Mask))
27040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27041
27042 MVT SrcVT = Src.getSimpleValueType();
27043 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27044 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27045 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27046 {Src, PassThru, Mask});
27047 }
27048 case TRUNCATE2_TO_REG: {
27049 SDValue Src = Op.getOperand(1);
27050 SDValue Src2 = Op.getOperand(2);
27051 SDValue PassThru = Op.getOperand(3);
27052 SDValue Mask = Op.getOperand(4);
27053
27054 if (isAllOnesConstant(Mask))
27055 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27056
27057 MVT Src2VT = Src2.getSimpleValueType();
27058 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27059 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27060 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27061 {Src, Src2, PassThru, Mask});
27062 }
27063 case CVTPS2PH_MASK: {
27064 SDValue Src = Op.getOperand(1);
27065 SDValue Rnd = Op.getOperand(2);
27066 SDValue PassThru = Op.getOperand(3);
27067 SDValue Mask = Op.getOperand(4);
27068
27069 unsigned RC = 0;
27070 unsigned Opc = IntrData->Opc0;
27071 bool SAE = Src.getValueType().is512BitVector() &&
27072 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27073 if (SAE) {
27074 Opc = X86ISD::CVTPS2PH_SAE;
27075 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27076 }
27077
27078 if (isAllOnesConstant(Mask))
27079 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27080
27081 if (SAE)
27082 Opc = X86ISD::MCVTPS2PH_SAE;
27083 else
27084 Opc = IntrData->Opc1;
27085 MVT SrcVT = Src.getSimpleValueType();
27086 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27087 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27088 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27089 }
27090 case CVTNEPS2BF16_MASK: {
27091 SDValue Src = Op.getOperand(1);
27092 SDValue PassThru = Op.getOperand(2);
27093 SDValue Mask = Op.getOperand(3);
27094
27095 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27096 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27097
27098 // Break false dependency.
27099 if (PassThru.isUndef())
27100 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27101
27102 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27103 Mask);
27104 }
27105 default:
27106 break;
27107 }
27108 }
27109
27110 switch (IntNo) {
27111 default: return SDValue(); // Don't custom lower most intrinsics.
27112
27113 // ptest and testp intrinsics. The intrinsic these come from are designed to
27114 // return an integer value, not just an instruction so lower it to the ptest
27115 // or testp pattern and a setcc for the result.
27116 case Intrinsic::x86_avx512_ktestc_b:
27117 case Intrinsic::x86_avx512_ktestc_w:
27118 case Intrinsic::x86_avx512_ktestc_d:
27119 case Intrinsic::x86_avx512_ktestc_q:
27120 case Intrinsic::x86_avx512_ktestz_b:
27121 case Intrinsic::x86_avx512_ktestz_w:
27122 case Intrinsic::x86_avx512_ktestz_d:
27123 case Intrinsic::x86_avx512_ktestz_q:
27124 case Intrinsic::x86_sse41_ptestz:
27125 case Intrinsic::x86_sse41_ptestc:
27126 case Intrinsic::x86_sse41_ptestnzc:
27127 case Intrinsic::x86_avx_ptestz_256:
27128 case Intrinsic::x86_avx_ptestc_256:
27129 case Intrinsic::x86_avx_ptestnzc_256:
27130 case Intrinsic::x86_avx_vtestz_ps:
27131 case Intrinsic::x86_avx_vtestc_ps:
27132 case Intrinsic::x86_avx_vtestnzc_ps:
27133 case Intrinsic::x86_avx_vtestz_pd:
27134 case Intrinsic::x86_avx_vtestc_pd:
27135 case Intrinsic::x86_avx_vtestnzc_pd:
27136 case Intrinsic::x86_avx_vtestz_ps_256:
27137 case Intrinsic::x86_avx_vtestc_ps_256:
27138 case Intrinsic::x86_avx_vtestnzc_ps_256:
27139 case Intrinsic::x86_avx_vtestz_pd_256:
27140 case Intrinsic::x86_avx_vtestc_pd_256:
27141 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27142 unsigned TestOpc = X86ISD::PTEST;
27143 X86::CondCode X86CC;
27144 switch (IntNo) {
27145 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27146 case Intrinsic::x86_avx512_ktestc_b:
27147 case Intrinsic::x86_avx512_ktestc_w:
27148 case Intrinsic::x86_avx512_ktestc_d:
27149 case Intrinsic::x86_avx512_ktestc_q:
27150 // CF = 1
27151 TestOpc = X86ISD::KTEST;
27152 X86CC = X86::COND_B;
27153 break;
27154 case Intrinsic::x86_avx512_ktestz_b:
27155 case Intrinsic::x86_avx512_ktestz_w:
27156 case Intrinsic::x86_avx512_ktestz_d:
27157 case Intrinsic::x86_avx512_ktestz_q:
27158 TestOpc = X86ISD::KTEST;
27159 X86CC = X86::COND_E;
27160 break;
27161 case Intrinsic::x86_avx_vtestz_ps:
27162 case Intrinsic::x86_avx_vtestz_pd:
27163 case Intrinsic::x86_avx_vtestz_ps_256:
27164 case Intrinsic::x86_avx_vtestz_pd_256:
27165 TestOpc = X86ISD::TESTP;
27166 [[fallthrough]];
27167 case Intrinsic::x86_sse41_ptestz:
27168 case Intrinsic::x86_avx_ptestz_256:
27169 // ZF = 1
27170 X86CC = X86::COND_E;
27171 break;
27172 case Intrinsic::x86_avx_vtestc_ps:
27173 case Intrinsic::x86_avx_vtestc_pd:
27174 case Intrinsic::x86_avx_vtestc_ps_256:
27175 case Intrinsic::x86_avx_vtestc_pd_256:
27176 TestOpc = X86ISD::TESTP;
27177 [[fallthrough]];
27178 case Intrinsic::x86_sse41_ptestc:
27179 case Intrinsic::x86_avx_ptestc_256:
27180 // CF = 1
27181 X86CC = X86::COND_B;
27182 break;
27183 case Intrinsic::x86_avx_vtestnzc_ps:
27184 case Intrinsic::x86_avx_vtestnzc_pd:
27185 case Intrinsic::x86_avx_vtestnzc_ps_256:
27186 case Intrinsic::x86_avx_vtestnzc_pd_256:
27187 TestOpc = X86ISD::TESTP;
27188 [[fallthrough]];
27189 case Intrinsic::x86_sse41_ptestnzc:
27190 case Intrinsic::x86_avx_ptestnzc_256:
27191 // ZF and CF = 0
27192 X86CC = X86::COND_A;
27193 break;
27194 }
27195
27196 SDValue LHS = Op.getOperand(1);
27197 SDValue RHS = Op.getOperand(2);
27198 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27199 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27200 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27201 }
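// For illustration (a sketch, not part of the lowering above): with this case,
// a call like _mm_testz_si128(a, b) becomes roughly
//   t0: i32 = X86ISD::PTEST a, b     // EFLAGS, ZF = ((a & b) == 0)
//   t1: i8  = setcc COND_E, t0       // 1 if ZF was set, else 0
//   t2: i32 = zero_extend t1
// i.e. the flag-producing test plus a setcc, matching the integer result the
// intrinsic is documented to return.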
27202
27203 case Intrinsic::x86_sse42_pcmpistria128:
27204 case Intrinsic::x86_sse42_pcmpestria128:
27205 case Intrinsic::x86_sse42_pcmpistric128:
27206 case Intrinsic::x86_sse42_pcmpestric128:
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 case Intrinsic::x86_sse42_pcmpestrio128:
27209 case Intrinsic::x86_sse42_pcmpistris128:
27210 case Intrinsic::x86_sse42_pcmpestris128:
27211 case Intrinsic::x86_sse42_pcmpistriz128:
27212 case Intrinsic::x86_sse42_pcmpestriz128: {
27213 unsigned Opcode;
27214 X86::CondCode X86CC;
27215 switch (IntNo) {
27216 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27217 case Intrinsic::x86_sse42_pcmpistria128:
27218 Opcode = X86ISD::PCMPISTR;
27219 X86CC = X86::COND_A;
27220 break;
27221 case Intrinsic::x86_sse42_pcmpestria128:
27222 Opcode = X86ISD::PCMPESTR;
27223 X86CC = X86::COND_A;
27224 break;
27225 case Intrinsic::x86_sse42_pcmpistric128:
27226 Opcode = X86ISD::PCMPISTR;
27227 X86CC = X86::COND_B;
27228 break;
27229 case Intrinsic::x86_sse42_pcmpestric128:
27230 Opcode = X86ISD::PCMPESTR;
27231 X86CC = X86::COND_B;
27232 break;
27233 case Intrinsic::x86_sse42_pcmpistrio128:
27234 Opcode = X86ISD::PCMPISTR;
27235 X86CC = X86::COND_O;
27236 break;
27237 case Intrinsic::x86_sse42_pcmpestrio128:
27238 Opcode = X86ISD::PCMPESTR;
27239 X86CC = X86::COND_O;
27240 break;
27241 case Intrinsic::x86_sse42_pcmpistris128:
27242 Opcode = X86ISD::PCMPISTR;
27243 X86CC = X86::COND_S;
27244 break;
27245 case Intrinsic::x86_sse42_pcmpestris128:
27246 Opcode = X86ISD::PCMPESTR;
27247 X86CC = X86::COND_S;
27248 break;
27249 case Intrinsic::x86_sse42_pcmpistriz128:
27250 Opcode = X86ISD::PCMPISTR;
27251 X86CC = X86::COND_E;
27252 break;
27253 case Intrinsic::x86_sse42_pcmpestriz128:
27254 Opcode = X86ISD::PCMPESTR;
27255 X86CC = X86::COND_E;
27256 break;
27257 }
27259 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27260 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27261 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27262 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27263 }
27264
27265 case Intrinsic::x86_sse42_pcmpistri128:
27266 case Intrinsic::x86_sse42_pcmpestri128: {
27267 unsigned Opcode;
27268 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27269 Opcode = X86ISD::PCMPISTR;
27270 else
27271 Opcode = X86ISD::PCMPESTR;
27272
27274 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27275 return DAG.getNode(Opcode, dl, VTs, NewOps);
27276 }
27277
27278 case Intrinsic::x86_sse42_pcmpistrm128:
27279 case Intrinsic::x86_sse42_pcmpestrm128: {
27280 unsigned Opcode;
27281 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27282 Opcode = X86ISD::PCMPISTR;
27283 else
27284 Opcode = X86ISD::PCMPESTR;
27285
27287 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27288 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27289 }
27290
27291 case Intrinsic::eh_sjlj_lsda: {
27292 MachineFunction &MF = DAG.getMachineFunction();
27293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27294 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27295 auto &Context = MF.getContext();
27296 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27297 Twine(MF.getFunctionNumber()));
27298 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27299 DAG.getMCSymbol(S, PtrVT));
27300 }
27301
27302 case Intrinsic::x86_seh_lsda: {
27303 // Compute the symbol for the LSDA. We know it'll get emitted later.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 SDValue Op1 = Op.getOperand(1);
27306 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27309
27310 // Generate a simple absolute symbol reference. This intrinsic is only
27311 // supported on 32-bit Windows, which isn't PIC.
27312 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27313 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27314 }
27315
27316 case Intrinsic::eh_recoverfp: {
27317 SDValue FnOp = Op.getOperand(1);
27318 SDValue IncomingFPOp = Op.getOperand(2);
27319 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27320 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27321 if (!Fn)
27323 "llvm.eh.recoverfp must take a function as the first argument");
27324 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27325 }
27326
27327 case Intrinsic::localaddress: {
27328 // Returns one of the stack, base, or frame pointer registers, depending on
27329 // which is used to reference local variables.
27330 MachineFunction &MF = DAG.getMachineFunction();
27331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27332 Register Reg;
27333 if (RegInfo->hasBasePointer(MF))
27334 Reg = RegInfo->getBaseRegister();
27335 else { // Handles the SP or FP case.
27336 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27337 if (CantUseFP)
27338 Reg = RegInfo->getPtrSizedStackRegister(MF);
27339 else
27340 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27341 }
27342 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27343 }
27344 case Intrinsic::x86_avx512_vp2intersect_q_512:
27345 case Intrinsic::x86_avx512_vp2intersect_q_256:
27346 case Intrinsic::x86_avx512_vp2intersect_q_128:
27347 case Intrinsic::x86_avx512_vp2intersect_d_512:
27348 case Intrinsic::x86_avx512_vp2intersect_d_256:
27349 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27350 SDLoc DL(Op);
27351 MVT MaskVT = Op.getSimpleValueType();
27352 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27354 Op.getOperand(1), Op.getOperand(2));
27355 SDValue Result0 =
27356 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27357 SDValue Result1 =
27358 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27359 return DAG.getMergeValues({Result0, Result1}, DL);
27360 }
27361 case Intrinsic::x86_mmx_pslli_w:
27362 case Intrinsic::x86_mmx_pslli_d:
27363 case Intrinsic::x86_mmx_pslli_q:
27364 case Intrinsic::x86_mmx_psrli_w:
27365 case Intrinsic::x86_mmx_psrli_d:
27366 case Intrinsic::x86_mmx_psrli_q:
27367 case Intrinsic::x86_mmx_psrai_w:
27368 case Intrinsic::x86_mmx_psrai_d: {
27369 SDLoc DL(Op);
27370 SDValue ShAmt = Op.getOperand(2);
27371 // If the argument is a constant, convert it to a target constant.
27372 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27373 // Clamp out-of-bounds shift amounts, since they will otherwise be masked
27374 // to 8 bits, which may make them no longer out of bounds.
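// For example, a shift amount of 256 would be masked to 0 ("no shift at all");
// clamping it to 255 keeps it at or above the element width, so the
// out-of-range shift semantics are preserved.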
27375 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27376 if (ShiftAmount == 0)
27377 return Op.getOperand(1);
27378
27379 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27380 Op.getOperand(0), Op.getOperand(1),
27381 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27382 }
27383
27384 unsigned NewIntrinsic;
27385 switch (IntNo) {
27386 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27387 case Intrinsic::x86_mmx_pslli_w:
27388 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27389 break;
27390 case Intrinsic::x86_mmx_pslli_d:
27391 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27392 break;
27393 case Intrinsic::x86_mmx_pslli_q:
27394 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27395 break;
27396 case Intrinsic::x86_mmx_psrli_w:
27397 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27398 break;
27399 case Intrinsic::x86_mmx_psrli_d:
27400 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27401 break;
27402 case Intrinsic::x86_mmx_psrli_q:
27403 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27404 break;
27405 case Intrinsic::x86_mmx_psrai_w:
27406 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27407 break;
27408 case Intrinsic::x86_mmx_psrai_d:
27409 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27410 break;
27411 }
27412
27413 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27414 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27415 // MMX register.
27416 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27417 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27418 DAG.getTargetConstant(NewIntrinsic, DL,
27420 Op.getOperand(1), ShAmt);
27421 }
27422 case Intrinsic::thread_pointer: {
27423 if (Subtarget.isTargetELF()) {
27424 SDLoc dl(Op);
27425 EVT PtrVT = Op.getValueType();
27426 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27428 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27429 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27430 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27431 }
27433 "Target OS doesn't support __builtin_thread_pointer() yet.");
27434 }
27435 }
27436}
27437
27439 SDValue Src, SDValue Mask, SDValue Base,
27440 SDValue Index, SDValue ScaleOp, SDValue Chain,
27441 const X86Subtarget &Subtarget) {
27442 SDLoc dl(Op);
27443 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27444 // Scale must be constant.
27445 if (!C)
27446 return SDValue();
27447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27448 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27449 TLI.getPointerTy(DAG.getDataLayout()));
27450 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27452 // If source is undef or we know it won't be used, use a zero vector
27453 // to break register dependency.
27454 // TODO: use undef instead and let BreakFalseDeps deal with it?
27455 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27456 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27457
27458 // Cast mask to an integer type.
27459 Mask = DAG.getBitcast(MaskVT, Mask);
27460
27462
27463 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27464 SDValue Res =
27466 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27467 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27468}
27469
27471 SDValue Src, SDValue Mask, SDValue Base,
27472 SDValue Index, SDValue ScaleOp, SDValue Chain,
27473 const X86Subtarget &Subtarget) {
27474 MVT VT = Op.getSimpleValueType();
27475 SDLoc dl(Op);
27476 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27477 // Scale must be constant.
27478 if (!C)
27479 return SDValue();
27480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27481 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27482 TLI.getPointerTy(DAG.getDataLayout()));
27483 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27485 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27486
27487 // We support two versions of the gather intrinsics. One with scalar mask and
27488 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27489 if (Mask.getValueType() != MaskVT)
27490 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27491
27492 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27493 // If source is undef or we know it won't be used, use a zero vector
27494 // to break register dependency.
27495 // TODO: use undef instead and let BreakFalseDeps deal with it?
27496 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27497 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27498
27500
27501 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27502 SDValue Res =
27504 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27505 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27506}
27507
27509 SDValue Src, SDValue Mask, SDValue Base,
27510 SDValue Index, SDValue ScaleOp, SDValue Chain,
27511 const X86Subtarget &Subtarget) {
27512 SDLoc dl(Op);
27513 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27514 // Scale must be constant.
27515 if (!C)
27516 return SDValue();
27517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27518 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27519 TLI.getPointerTy(DAG.getDataLayout()));
27520 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27521 Src.getSimpleValueType().getVectorNumElements());
27522 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27523
27524 // We support two versions of the scatter intrinsics. One with scalar mask and
27525 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27526 if (Mask.getValueType() != MaskVT)
27527 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27528
27530
27531 SDVTList VTs = DAG.getVTList(MVT::Other);
27532 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27533 SDValue Res =
27535 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27536 return Res;
27537}
27538
27540 SDValue Mask, SDValue Base, SDValue Index,
27541 SDValue ScaleOp, SDValue Chain,
27542 const X86Subtarget &Subtarget) {
27543 SDLoc dl(Op);
27544 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27545 // Scale must be constant.
27546 if (!C)
27547 return SDValue();
27548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27549 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27550 TLI.getPointerTy(DAG.getDataLayout()));
27551 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27552 SDValue Segment = DAG.getRegister(0, MVT::i32);
27553 MVT MaskVT =
27554 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27555 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27556 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27557 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27558 return SDValue(Res, 0);
27559}
27560
27561/// Handles the lowering of builtin intrinsics with a chain that return their
27562/// value into registers EDX:EAX.
27563/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27564/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27565/// TargetOpcode.
27566/// Returns a Glue value which can be used to add an extra copy-from-reg if the
27567/// expanded intrinsic implicitly defines extra registers (i.e. not just
27568/// EDX:EAX).
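// Illustrative sketch of this helper's 32-bit expansion (not part of the
// documentation above):
//   t0: ch,glue = CopyToReg ECX, N->op(2)     (only when SrcReg is given)
//   t1: ch,glue = <TargetOpcode> t0, t0:1
//   lo: i32,ch,glue = CopyFromReg t1, EAX
//   hi: i32,ch,glue = CopyFromReg lo:1, EDX
//   Results = { build_pair(lo, hi) : i64, chain }
// The returned glue lets callers copy further implicitly defined registers
// (e.g. ECX for RDTSCP).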
27570 SelectionDAG &DAG,
27571 unsigned TargetOpcode,
27572 unsigned SrcReg,
27573 const X86Subtarget &Subtarget,
27575 SDValue Chain = N->getOperand(0);
27576 SDValue Glue;
27577
27578 if (SrcReg) {
27579 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27580 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27581 Glue = Chain.getValue(1);
27582 }
27583
27584 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27585 SDValue N1Ops[] = {Chain, Glue};
27586 SDNode *N1 = DAG.getMachineNode(
27587 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27588 Chain = SDValue(N1, 0);
27589
27590 // Read the result the expanded instruction leaves in registers EDX:EAX.
27591 SDValue LO, HI;
27592 if (Subtarget.is64Bit()) {
27593 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27594 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27595 LO.getValue(2));
27596 } else {
27597 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27598 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27599 LO.getValue(2));
27600 }
27601 Chain = HI.getValue(1);
27602 Glue = HI.getValue(2);
27603
27604 if (Subtarget.is64Bit()) {
27605 // Merge the two 32-bit values into a 64-bit one.
27606 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27607 DAG.getConstant(32, DL, MVT::i8));
27608 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27609 Results.push_back(Chain);
27610 return Glue;
27611 }
27612
27613 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27614 SDValue Ops[] = { LO, HI };
27615 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27616 Results.push_back(Pair);
27617 Results.push_back(Chain);
27618 return Glue;
27619}
27620
27621/// Handles the lowering of builtin intrinsics that read the time stamp counter
27622/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27623/// READCYCLECOUNTER nodes.
27624static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27625 SelectionDAG &DAG,
27626 const X86Subtarget &Subtarget,
27628 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27629 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27630 // and the EAX register is loaded with the low-order 32 bits.
27631 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27632 /* NoRegister */0, Subtarget,
27633 Results);
27634 if (Opcode != X86::RDTSCP)
27635 return;
27636
27637 SDValue Chain = Results[1];
27638 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27639 // the ECX register. Add 'ecx' explicitly to the chain.
27640 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27641 Results[1] = ecx;
27642 Results.push_back(ecx.getValue(1));
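// Results now holds { TSC value (i64), TSC_AUX read from ECX (i32), chain }.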
27643}
27644
27646 SelectionDAG &DAG) {
27648 SDLoc DL(Op);
27649 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27650 Results);
27651 return DAG.getMergeValues(Results, DL);
27652}
27653
27656 SDValue Chain = Op.getOperand(0);
27657 SDValue RegNode = Op.getOperand(2);
27658 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27659 if (!EHInfo)
27660 report_fatal_error("EH registrations only live in functions using WinEH");
27661
27662 // Cast the operand to an alloca, and remember the frame index.
27663 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27664 if (!FINode)
27665 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27666 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27667
27668 // Return the chain operand without making any DAG nodes.
27669 return Chain;
27670}
27671
27674 SDValue Chain = Op.getOperand(0);
27675 SDValue EHGuard = Op.getOperand(2);
27676 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27677 if (!EHInfo)
27678 report_fatal_error("EHGuard only live in functions using WinEH");
27679
27680 // Cast the operand to an alloca, and remember the frame index.
27681 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27682 if (!FINode)
27683 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27684 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27685
27686 // Return the chain operand without making any DAG nodes.
27687 return Chain;
27688}
27689
27690/// Emit Truncating Store with signed or unsigned saturation.
27691static SDValue
27692EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27693 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27694 SelectionDAG &DAG) {
27695 SDVTList VTs = DAG.getVTList(MVT::Other);
27696 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27697 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27698 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27699 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27700}
27701
27702/// Emit Masked Truncating Store with signed or unsigned saturation.
27703static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27704 const SDLoc &DL,
27705 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27706 MachineMemOperand *MMO, SelectionDAG &DAG) {
27707 SDVTList VTs = DAG.getVTList(MVT::Other);
27708 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27709 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27710 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27711}
27712
27714 const MachineFunction &MF) {
27715 if (!Subtarget.is64Bit())
27716 return false;
27717 // 64-bit targets support extended Swift async frame setup,
27718 // except for targets that use the Windows 64 prologue.
27719 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27720}
27721
27723 SelectionDAG &DAG) {
27724 unsigned IntNo = Op.getConstantOperandVal(1);
27725 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27726 if (!IntrData) {
27727 switch (IntNo) {
27728
27729 case Intrinsic::swift_async_context_addr: {
27730 SDLoc dl(Op);
27731 auto &MF = DAG.getMachineFunction();
27732 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27733 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27735 X86FI->setHasSwiftAsyncContext(true);
27736 SDValue Chain = Op->getOperand(0);
27737 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27738 SDValue Result =
27739 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27740 DAG.getTargetConstant(8, dl, MVT::i32)),
27741 0);
27742 // Return { result, chain }.
27743 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27744 CopyRBP.getValue(1));
27745 } else {
27746 // No special extended frame, create or reuse an existing stack slot.
27747 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27748 if (!X86FI->getSwiftAsyncContextFrameIdx())
27749 X86FI->setSwiftAsyncContextFrameIdx(
27750 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27751 false));
27752 SDValue Result =
27753 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27754 PtrSize == 8 ? MVT::i64 : MVT::i32);
27755 // Return { result, chain }.
27756 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27757 Op->getOperand(0));
27758 }
27759 }
27760
27761 case llvm::Intrinsic::x86_seh_ehregnode:
27762 return MarkEHRegistrationNode(Op, DAG);
27763 case llvm::Intrinsic::x86_seh_ehguard:
27764 return MarkEHGuard(Op, DAG);
27765 case llvm::Intrinsic::x86_rdpkru: {
27766 SDLoc dl(Op);
27767 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27768 // Create a RDPKRU node and pass 0 to the ECX parameter.
27769 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27770 DAG.getConstant(0, dl, MVT::i32));
27771 }
27772 case llvm::Intrinsic::x86_wrpkru: {
27773 SDLoc dl(Op);
27774 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27775 // to the EDX and ECX parameters.
27776 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27777 Op.getOperand(0), Op.getOperand(2),
27778 DAG.getConstant(0, dl, MVT::i32),
27779 DAG.getConstant(0, dl, MVT::i32));
27780 }
27781 case llvm::Intrinsic::asan_check_memaccess: {
27782 // Mark this as adjustsStack because it will be lowered to a call.
27784 // Don't do anything here; we will expand these intrinsics out later.
27785 return Op;
27786 }
27787 case llvm::Intrinsic::x86_flags_read_u32:
27788 case llvm::Intrinsic::x86_flags_read_u64:
27789 case llvm::Intrinsic::x86_flags_write_u32:
27790 case llvm::Intrinsic::x86_flags_write_u64: {
27791 // We need a frame pointer because this will get lowered to a PUSH/POP
27792 // sequence.
27795 // Don't do anything here; we will expand these intrinsics out later
27796 // during FinalizeISel in EmitInstrWithCustomInserter.
27797 return Op;
27798 }
27799 case Intrinsic::x86_lwpins32:
27800 case Intrinsic::x86_lwpins64:
27801 case Intrinsic::x86_umwait:
27802 case Intrinsic::x86_tpause: {
27803 SDLoc dl(Op);
27804 SDValue Chain = Op->getOperand(0);
27805 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27806 unsigned Opcode;
27807
27808 switch (IntNo) {
27809 default: llvm_unreachable("Impossible intrinsic");
27810 case Intrinsic::x86_umwait:
27811 Opcode = X86ISD::UMWAIT;
27812 break;
27813 case Intrinsic::x86_tpause:
27814 Opcode = X86ISD::TPAUSE;
27815 break;
27816 case Intrinsic::x86_lwpins32:
27817 case Intrinsic::x86_lwpins64:
27818 Opcode = X86ISD::LWPINS;
27819 break;
27820 }
27821
27823 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27824 Op->getOperand(3), Op->getOperand(4));
27825 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27826 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27827 Operation.getValue(1));
27828 }
27829 case Intrinsic::x86_enqcmd:
27830 case Intrinsic::x86_enqcmds: {
27831 SDLoc dl(Op);
27832 SDValue Chain = Op.getOperand(0);
27833 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27834 unsigned Opcode;
27835 switch (IntNo) {
27836 default: llvm_unreachable("Impossible intrinsic!");
27837 case Intrinsic::x86_enqcmd:
27838 Opcode = X86ISD::ENQCMD;
27839 break;
27840 case Intrinsic::x86_enqcmds:
27841 Opcode = X86ISD::ENQCMDS;
27842 break;
27843 }
27844 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27845 Op.getOperand(3));
27846 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27847 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27848 Operation.getValue(1));
27849 }
27850 case Intrinsic::x86_aesenc128kl:
27851 case Intrinsic::x86_aesdec128kl:
27852 case Intrinsic::x86_aesenc256kl:
27853 case Intrinsic::x86_aesdec256kl: {
27854 SDLoc DL(Op);
27855 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27856 SDValue Chain = Op.getOperand(0);
27857 unsigned Opcode;
27858
27859 switch (IntNo) {
27860 default: llvm_unreachable("Impossible intrinsic");
27861 case Intrinsic::x86_aesenc128kl:
27862 Opcode = X86ISD::AESENC128KL;
27863 break;
27864 case Intrinsic::x86_aesdec128kl:
27865 Opcode = X86ISD::AESDEC128KL;
27866 break;
27867 case Intrinsic::x86_aesenc256kl:
27868 Opcode = X86ISD::AESENC256KL;
27869 break;
27870 case Intrinsic::x86_aesdec256kl:
27871 Opcode = X86ISD::AESDEC256KL;
27872 break;
27873 }
27874
27876 MachineMemOperand *MMO = MemIntr->getMemOperand();
27877 EVT MemVT = MemIntr->getMemoryVT();
27879 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27880 MMO);
27881 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27882
27883 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27884 {ZF, Operation.getValue(0), Operation.getValue(2)});
27885 }
27886 case Intrinsic::x86_aesencwide128kl:
27887 case Intrinsic::x86_aesdecwide128kl:
27888 case Intrinsic::x86_aesencwide256kl:
27889 case Intrinsic::x86_aesdecwide256kl: {
27890 SDLoc DL(Op);
27891 SDVTList VTs = DAG.getVTList(
27892 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27893 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27894 SDValue Chain = Op.getOperand(0);
27895 unsigned Opcode;
27896
27897 switch (IntNo) {
27898 default: llvm_unreachable("Impossible intrinsic");
27899 case Intrinsic::x86_aesencwide128kl:
27900 Opcode = X86ISD::AESENCWIDE128KL;
27901 break;
27902 case Intrinsic::x86_aesdecwide128kl:
27903 Opcode = X86ISD::AESDECWIDE128KL;
27904 break;
27905 case Intrinsic::x86_aesencwide256kl:
27906 Opcode = X86ISD::AESENCWIDE256KL;
27907 break;
27908 case Intrinsic::x86_aesdecwide256kl:
27909 Opcode = X86ISD::AESDECWIDE256KL;
27910 break;
27911 }
27912
27914 MachineMemOperand *MMO = MemIntr->getMemOperand();
27915 EVT MemVT = MemIntr->getMemoryVT();
27917 Opcode, DL, VTs,
27918 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27919 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27920 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27921 MemVT, MMO);
27922 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27923
27924 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27925 {ZF, Operation.getValue(1), Operation.getValue(2),
27926 Operation.getValue(3), Operation.getValue(4),
27927 Operation.getValue(5), Operation.getValue(6),
27928 Operation.getValue(7), Operation.getValue(8),
27929 Operation.getValue(9)});
27930 }
27931 case Intrinsic::x86_testui: {
27932 SDLoc dl(Op);
27933 SDValue Chain = Op.getOperand(0);
27934 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27935 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27936 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27937 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27938 Operation.getValue(1));
27939 }
27940 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27941 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27943 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27944 case Intrinsic::x86_t2rpntlvwz0_internal:
27945 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1_internal:
27947 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27948 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27950 unsigned IntNo = Op.getConstantOperandVal(1);
27951 unsigned Opc = 0;
27952 switch (IntNo) {
27953 default:
27954 llvm_unreachable("Unexpected intrinsic!");
27955 case Intrinsic::x86_t2rpntlvwz0_internal:
27956 Opc = X86::PT2RPNTLVWZ0V;
27957 break;
27958 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27959 Opc = X86::PT2RPNTLVWZ0T1V;
27960 break;
27961 case Intrinsic::x86_t2rpntlvwz1_internal:
27962 Opc = X86::PT2RPNTLVWZ1V;
27963 break;
27964 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27965 Opc = X86::PT2RPNTLVWZ1T1V;
27966 break;
27967 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27968 Opc = X86::PT2RPNTLVWZ0RSV;
27969 break;
27970 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27971 Opc = X86::PT2RPNTLVWZ0RST1V;
27972 break;
27973 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27974 Opc = X86::PT2RPNTLVWZ1RSV;
27975 break;
27976 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27977 Opc = X86::PT2RPNTLVWZ1RST1V;
27978 break;
27979 }
27980
27981 SDLoc DL(Op);
27982 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27983
27984 SDValue Ops[] = {Op.getOperand(2), // Row
27985 Op.getOperand(3), // Col0
27986 Op.getOperand(4), // Col1
27987 Op.getOperand(5), // Base
27988 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27989 Op.getOperand(6), // Index
27990 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27991 DAG.getRegister(0, MVT::i16), // Segment
27992 Op.getOperand(0)}; // Chain
27993
27994 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27995 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27996 SDValue(Res, 0));
27997 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27998 SDValue(Res, 0));
27999 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28000 }
28001 case Intrinsic::x86_atomic_bts_rm:
28002 case Intrinsic::x86_atomic_btc_rm:
28003 case Intrinsic::x86_atomic_btr_rm: {
28004 SDLoc DL(Op);
28005 MVT VT = Op.getSimpleValueType();
28006 SDValue Chain = Op.getOperand(0);
28007 SDValue Op1 = Op.getOperand(2);
28008 SDValue Op2 = Op.getOperand(3);
28009 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28010 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28012 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28013 SDValue Res =
28014 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28015 {Chain, Op1, Op2}, VT, MMO);
28016 Chain = Res.getValue(1);
28017 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28018 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28019 }
28020 case Intrinsic::x86_atomic_bts:
28021 case Intrinsic::x86_atomic_btc:
28022 case Intrinsic::x86_atomic_btr: {
28023 SDLoc DL(Op);
28024 MVT VT = Op.getSimpleValueType();
28025 SDValue Chain = Op.getOperand(0);
28026 SDValue Op1 = Op.getOperand(2);
28027 SDValue Op2 = Op.getOperand(3);
28028 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28029 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28030 : X86ISD::LBTR;
28031 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28032 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28033 SDValue Res =
28034 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28035 {Chain, Op1, Op2, Size}, VT, MMO);
28036 Chain = Res.getValue(1);
28037 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28038 unsigned Imm = Op2->getAsZExtVal();
28039 if (Imm)
28040 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28041 DAG.getShiftAmountConstant(Imm, VT, DL));
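// For example, for the bts form with bit index 5 on an i32, Res is the CF
// result (0 or 1) shifted left by 5, so the returned value equals
// old & (1 << 5), i.e. the tested bit in its original position.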
28042 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28043 }
28044 case Intrinsic::x86_cmpccxadd32:
28045 case Intrinsic::x86_cmpccxadd64: {
28046 SDLoc DL(Op);
28047 SDValue Chain = Op.getOperand(0);
28048 SDValue Addr = Op.getOperand(2);
28049 SDValue Src1 = Op.getOperand(3);
28050 SDValue Src2 = Op.getOperand(4);
28051 SDValue CC = Op.getOperand(5);
28052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28054 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28055 MVT::i32, MMO);
28056 return Operation;
28057 }
28058 case Intrinsic::x86_aadd32:
28059 case Intrinsic::x86_aadd64:
28060 case Intrinsic::x86_aand32:
28061 case Intrinsic::x86_aand64:
28062 case Intrinsic::x86_aor32:
28063 case Intrinsic::x86_aor64:
28064 case Intrinsic::x86_axor32:
28065 case Intrinsic::x86_axor64: {
28066 SDLoc DL(Op);
28067 SDValue Chain = Op.getOperand(0);
28068 SDValue Op1 = Op.getOperand(2);
28069 SDValue Op2 = Op.getOperand(3);
28070 MVT VT = Op2.getSimpleValueType();
28071 unsigned Opc = 0;
28072 switch (IntNo) {
28073 default:
28074 llvm_unreachable("Unknown Intrinsic");
28075 case Intrinsic::x86_aadd32:
28076 case Intrinsic::x86_aadd64:
28077 Opc = X86ISD::AADD;
28078 break;
28079 case Intrinsic::x86_aand32:
28080 case Intrinsic::x86_aand64:
28081 Opc = X86ISD::AAND;
28082 break;
28083 case Intrinsic::x86_aor32:
28084 case Intrinsic::x86_aor64:
28085 Opc = X86ISD::AOR;
28086 break;
28087 case Intrinsic::x86_axor32:
28088 case Intrinsic::x86_axor64:
28089 Opc = X86ISD::AXOR;
28090 break;
28091 }
28092 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28094 {Chain, Op1, Op2}, VT, MMO);
28095 }
28096 case Intrinsic::x86_atomic_add_cc:
28097 case Intrinsic::x86_atomic_sub_cc:
28098 case Intrinsic::x86_atomic_or_cc:
28099 case Intrinsic::x86_atomic_and_cc:
28100 case Intrinsic::x86_atomic_xor_cc: {
28101 SDLoc DL(Op);
28102 SDValue Chain = Op.getOperand(0);
28103 SDValue Op1 = Op.getOperand(2);
28104 SDValue Op2 = Op.getOperand(3);
28105 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28106 MVT VT = Op2.getSimpleValueType();
28107 unsigned Opc = 0;
28108 switch (IntNo) {
28109 default:
28110 llvm_unreachable("Unknown Intrinsic");
28111 case Intrinsic::x86_atomic_add_cc:
28112 Opc = X86ISD::LADD;
28113 break;
28114 case Intrinsic::x86_atomic_sub_cc:
28115 Opc = X86ISD::LSUB;
28116 break;
28117 case Intrinsic::x86_atomic_or_cc:
28118 Opc = X86ISD::LOR;
28119 break;
28120 case Intrinsic::x86_atomic_and_cc:
28121 Opc = X86ISD::LAND;
28122 break;
28123 case Intrinsic::x86_atomic_xor_cc:
28124 Opc = X86ISD::LXOR;
28125 break;
28126 }
28127 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28128 SDValue LockArith =
28129 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28130 {Chain, Op1, Op2}, VT, MMO);
28131 Chain = LockArith.getValue(1);
28132 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28133 }
28134 }
28135 return SDValue();
28136 }
28137
28138 SDLoc dl(Op);
28139 switch(IntrData->Type) {
28140 default: llvm_unreachable("Unknown Intrinsic Type");
28141 case RDSEED:
28142 case RDRAND: {
28143 // Emit the node with the right value type.
28144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28146
28147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28148 // Otherwise, return the value from Rand, which is always 0, cast to i32.
28149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28150 DAG.getConstant(1, dl, Op->getValueType(1)),
28151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28152 SDValue(Result.getNode(), 1)};
28153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
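// In other words, because the raw result is always 0 when CF is clear,
// isValid is exactly the CF "success" bit of RDRAND/RDSEED.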
28154
28155 // Return { result, isValid, chain }.
28156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28157 SDValue(Result.getNode(), 2));
28158 }
28159 case GATHER_AVX2: {
28160 SDValue Chain = Op.getOperand(0);
28161 SDValue Src = Op.getOperand(2);
28162 SDValue Base = Op.getOperand(3);
28163 SDValue Index = Op.getOperand(4);
28164 SDValue Mask = Op.getOperand(5);
28165 SDValue Scale = Op.getOperand(6);
28166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28167 Scale, Chain, Subtarget);
28168 }
28169 case GATHER: {
28170 // gather(v1, mask, index, base, scale);
28171 SDValue Chain = Op.getOperand(0);
28172 SDValue Src = Op.getOperand(2);
28173 SDValue Base = Op.getOperand(3);
28174 SDValue Index = Op.getOperand(4);
28175 SDValue Mask = Op.getOperand(5);
28176 SDValue Scale = Op.getOperand(6);
28177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28178 Chain, Subtarget);
28179 }
28180 case SCATTER: {
28181 // scatter(base, mask, index, v1, scale);
28182 SDValue Chain = Op.getOperand(0);
28183 SDValue Base = Op.getOperand(2);
28184 SDValue Mask = Op.getOperand(3);
28185 SDValue Index = Op.getOperand(4);
28186 SDValue Src = Op.getOperand(5);
28187 SDValue Scale = Op.getOperand(6);
28188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28189 Scale, Chain, Subtarget);
28190 }
28191 case PREFETCH: {
28192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28193 assert((HintVal == 2 || HintVal == 3) &&
28194 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28196 SDValue Chain = Op.getOperand(0);
28197 SDValue Mask = Op.getOperand(2);
28198 SDValue Index = Op.getOperand(3);
28199 SDValue Base = Op.getOperand(4);
28200 SDValue Scale = Op.getOperand(5);
28201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28202 Subtarget);
28203 }
28204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28205 case RDTSC: {
28207 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28208 Results);
28209 return DAG.getMergeValues(Results, dl);
28210 }
28211 // Read Performance Monitoring Counters.
28212 case RDPMC:
28213 // Read Processor Register.
28214 case RDPRU:
28215 // Get Extended Control Register.
28216 case XGETBV: {
28218
28219 // RDPMC uses ECX to select the index of the performance counter to read.
28220 // RDPRU uses ECX to select the processor register to read.
28221 // XGETBV uses ECX to select the index of the XCR register to return.
28222 // The result is stored into registers EDX:EAX.
28223 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28224 Subtarget, Results);
28225 return DAG.getMergeValues(Results, dl);
28226 }
28227 // XTEST intrinsics.
28228 case XTEST: {
28229 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28230 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28231
28232 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28233 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28235 Ret, SDValue(InTrans.getNode(), 1));
28236 }
28239 case TRUNCATE_TO_MEM_VI32: {
28240 SDValue Mask = Op.getOperand(4);
28241 SDValue DataToTruncate = Op.getOperand(3);
28242 SDValue Addr = Op.getOperand(2);
28243 SDValue Chain = Op.getOperand(0);
28244
28246 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28247
28248 EVT MemVT = MemIntr->getMemoryVT();
28249
28250 uint16_t TruncationOp = IntrData->Opc0;
28251 switch (TruncationOp) {
28252 case X86ISD::VTRUNC: {
28253 if (isAllOnesConstant(Mask)) // return just a truncate store
28254 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28255 MemIntr->getMemOperand());
28256
28257 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28258 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28259 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28260
28261 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28262 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28263 true /* truncating */);
28264 }
28265 case X86ISD::VTRUNCUS:
28266 case X86ISD::VTRUNCS: {
28267 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28268 if (isAllOnesConstant(Mask))
28269 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28270 MemIntr->getMemOperand(), DAG);
28271
28272 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28273 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28274
28275 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28276 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28277 }
28278 default:
28279 llvm_unreachable("Unsupported truncstore intrinsic");
28280 }
28281 }
28282 case INTR_TYPE_CAST_MMX:
28283 return SDValue(); // handled in combineINTRINSIC_*
28284 }
28285}
28286
28287SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28288 SelectionDAG &DAG) const {
28289 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28290 MFI.setReturnAddressIsTaken(true);
28291
28292 unsigned Depth = Op.getConstantOperandVal(0);
28293 SDLoc dl(Op);
28294 EVT PtrVT = Op.getValueType();
28295
28296 if (Depth > 0) {
28297 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28298 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28299 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28300 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28301 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28302 MachinePointerInfo());
28303 }
28304
28305 // Just load the return address.
28306 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28307 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28308 MachinePointerInfo());
28309}
28310
28311SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28312 SelectionDAG &DAG) const {
28314 return getReturnAddressFrameIndex(DAG);
28315}
28316
28317SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28318 MachineFunction &MF = DAG.getMachineFunction();
28319 MachineFrameInfo &MFI = MF.getFrameInfo();
28320 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28321 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28322 EVT VT = Op.getValueType();
28323
28324 MFI.setFrameAddressIsTaken(true);
28325
28326 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28327 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28328 // is not possible to crawl up the stack without looking at the unwind codes
28329 // simultaneously.
28330 int FrameAddrIndex = FuncInfo->getFAIndex();
28331 if (!FrameAddrIndex) {
28332 // Set up a frame object for the return address.
28333 unsigned SlotSize = RegInfo->getSlotSize();
28334 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28335 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28336 FuncInfo->setFAIndex(FrameAddrIndex);
28337 }
28338 return DAG.getFrameIndex(FrameAddrIndex, VT);
28339 }
28340
28341 Register FrameReg =
28342 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28343 SDLoc dl(Op); // FIXME probably not meaningful
28344 unsigned Depth = Op.getConstantOperandVal(0);
28345 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28346 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28347 "Invalid Frame Register!");
28348 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28349 while (Depth--)
28350 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28351 MachinePointerInfo());
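// For example, on a 64-bit frame-pointer function, llvm.frameaddress(1)
// becomes load(RBP) and llvm.frameaddress(2) becomes load(load(RBP)), walking
// the chain of saved frame pointers.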
28352 return FrameAddr;
28353}
28354
28355// FIXME? Maybe this could be a TableGen attribute on some registers and
28356// this table could be generated automatically from RegInfo.
28358 const MachineFunction &MF) const {
28359 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28360
28362 .Case("esp", X86::ESP)
28363 .Case("rsp", X86::RSP)
28364 .Case("ebp", X86::EBP)
28365 .Case("rbp", X86::RBP)
28366 .Case("r14", X86::R14)
28367 .Case("r15", X86::R15)
28368 .Default(0);
28369
28370 if (Reg == X86::EBP || Reg == X86::RBP) {
28371 if (!TFI.hasFP(MF))
28372 report_fatal_error("register " + StringRef(RegName) +
28373 " is allocatable: function has no frame pointer");
28374#ifndef NDEBUG
28375 else {
28376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28379 "Invalid Frame Register!");
28380 }
28381#endif
28382 }
28383
28384 return Reg;
28385}
28386
28387SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28388 SelectionDAG &DAG) const {
28389 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28390 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28391}
28392
28394 const Constant *PersonalityFn) const {
28395 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28396 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28397
28398 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28399}
28400
28402 const Constant *PersonalityFn) const {
28403 // Funclet personalities don't use selectors (the runtime does the selection).
28405 return X86::NoRegister;
28406 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28407}
28408
28410 return Subtarget.isTargetWin64();
28411}
28412
28413SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28414 SDValue Chain = Op.getOperand(0);
28415 SDValue Offset = Op.getOperand(1);
28416 SDValue Handler = Op.getOperand(2);
28417 SDLoc dl (Op);
28418
28419 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28421 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28422 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28423 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28424 "Invalid Frame Register!");
28425 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28426 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28427
28428 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28429 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28430 dl));
28431 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28432 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28433 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28434
28435 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28436 DAG.getRegister(StoreAddrReg, PtrVT));
28437}
28438
28439SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28440 SelectionDAG &DAG) const {
28441 SDLoc DL(Op);
28443 // If the subtarget is not 64-bit, we may need the global base reg
28444 // after isel expands the pseudo, i.e., after the CGBR pass has run.
28445 // Therefore, ask for the GlobalBaseReg now, so that the pass
28446 // inserts the code for us in case we need it.
28447 // Otherwise, we would end up referencing a virtual register
28448 // that is not defined!
28448 if (!Subtarget.is64Bit()) {
28449 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28450 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28451 }
28452 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28453 DAG.getVTList(MVT::i32, MVT::Other),
28454 Op.getOperand(0), Op.getOperand(1));
28455}
28456
28457SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28458 SelectionDAG &DAG) const {
28459 SDLoc DL(Op);
28460 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28461 Op.getOperand(0), Op.getOperand(1));
28462}
28463
28464SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28465 SelectionDAG &DAG) const {
28466 SDLoc DL(Op);
28467 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28468 Op.getOperand(0));
28469}
28470
28472 return Op.getOperand(0);
28473}
28474
28475SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28476 SelectionDAG &DAG) const {
28477 SDValue Root = Op.getOperand(0);
28478 SDValue Trmp = Op.getOperand(1); // trampoline
28479 SDValue FPtr = Op.getOperand(2); // nested function
28480 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28481 SDLoc dl (Op);
28482
28483 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28484 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28485
28486 if (Subtarget.is64Bit()) {
28487 SDValue OutChains[6];
28488
28489 // Large code-model.
28490 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28491 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28492
28493 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28494 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28495
28496 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28497
28498 // Load the pointer to the nested function into R11.
28499 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28500 SDValue Addr = Trmp;
28501 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28502 Addr, MachinePointerInfo(TrmpAddr));
28503
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(2, dl, MVT::i64));
28506 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28507 MachinePointerInfo(TrmpAddr, 2), Align(2));
28508
28509 // Load the 'nest' parameter value into R10.
28510 // R10 is specified in X86CallingConv.td
28511 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28512 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28513 DAG.getConstant(10, dl, MVT::i64));
28514 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28515 Addr, MachinePointerInfo(TrmpAddr, 10));
28516
28517 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28518 DAG.getConstant(12, dl, MVT::i64));
28519 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28520 MachinePointerInfo(TrmpAddr, 12), Align(2));
28521
28522 // Jump to the nested function.
28523 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28525 DAG.getConstant(20, dl, MVT::i64));
28526 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28527 Addr, MachinePointerInfo(TrmpAddr, 20));
28528
28529 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28530 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28531 DAG.getConstant(22, dl, MVT::i64));
28532 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28533 Addr, MachinePointerInfo(TrmpAddr, 22));
28534
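// In other words, the bytes stored above are (offsets relative to Trmp):
//   0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
//   10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
//   20: 49 FF E3                jmpq   *%r11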
28535 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28536 } else {
28537 const Function *Func =
28538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28539 CallingConv::ID CC = Func->getCallingConv();
28540 unsigned NestReg;
28541
28542 switch (CC) {
28543 default:
28544 llvm_unreachable("Unsupported calling convention");
28545 case CallingConv::C:
28547 // Pass 'nest' parameter in ECX.
28548 // Must be kept in sync with X86CallingConv.td
28549 NestReg = X86::ECX;
28550
28551 // Check that ECX wasn't needed by an 'inreg' parameter.
28552 FunctionType *FTy = Func->getFunctionType();
28553 const AttributeList &Attrs = Func->getAttributes();
28554
28555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28556 unsigned InRegCount = 0;
28557 unsigned Idx = 0;
28558
28559 for (FunctionType::param_iterator I = FTy->param_begin(),
28560 E = FTy->param_end(); I != E; ++I, ++Idx)
28561 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28562 const DataLayout &DL = DAG.getDataLayout();
28563 // FIXME: should only count parameters that are lowered to integers.
28564 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28565 }
28566
28567 if (InRegCount > 2) {
28568 report_fatal_error("Nest register in use - reduce number of inreg"
28569 " parameters!");
28570 }
28571 }
28572 break;
28573 }
28576 case CallingConv::Fast:
28577 case CallingConv::Tail:
28579 // Pass 'nest' parameter in EAX.
28580 // Must be kept in sync with X86CallingConv.td
28581 NestReg = X86::EAX;
28582 break;
28583 }
28584
28585 SDValue OutChains[4];
28586 SDValue Addr, Disp;
28587
28588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28589 DAG.getConstant(10, dl, MVT::i32));
28590 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28591
28592 // This is storing the opcode for MOV32ri.
28593 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28594 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28595 OutChains[0] =
28596 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28597 Trmp, MachinePointerInfo(TrmpAddr));
28598
28599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28600 DAG.getConstant(1, dl, MVT::i32));
28601 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28602 MachinePointerInfo(TrmpAddr, 1), Align(1));
28603
28604 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28606 DAG.getConstant(5, dl, MVT::i32));
28607 OutChains[2] =
28608 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28609 MachinePointerInfo(TrmpAddr, 5), Align(1));
28610
28611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28612 DAG.getConstant(6, dl, MVT::i32));
28613 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28614 MachinePointerInfo(TrmpAddr, 6), Align(1));
28615
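// In other words, the bytes stored above are (offsets relative to Trmp):
//   0: B8+reg <Nest, 4 bytes>   movl $Nest, %ecx or %eax
//   5: E9 <Disp, 4 bytes>       jmp  FPtr   (Disp = FPtr - (Trmp + 10))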
28616 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28617 }
28618}
28619
28620SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28621 SelectionDAG &DAG) const {
28622 /*
28623 The rounding mode is in bits 11:10 of FPSR, and has the following
28624 settings:
28625 00 Round to nearest
28626 01 Round to -inf
28627 10 Round to +inf
28628 11 Round to 0
28629
28630 GET_ROUNDING, on the other hand, expects the following:
28631 -1 Undefined
28632 0 Round to 0
28633 1 Round to nearest
28634 2 Round to +inf
28635 3 Round to -inf
28636
28637 To perform the conversion, we use a packed lookup table of the four 2-bit
28638 values that we can index by FPSR[11:10]
28639 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28640
28641 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28642 */
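// Spot check of the formula above (illustration only):
//   FPSR[11:10] = 00 -> (0x2d >> 0) & 3 = 1  (round to nearest)
//   FPSR[11:10] = 01 -> (0x2d >> 2) & 3 = 3  (round to -inf)
//   FPSR[11:10] = 10 -> (0x2d >> 4) & 3 = 2  (round to +inf)
//   FPSR[11:10] = 11 -> (0x2d >> 6) & 3 = 0  (round to 0)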
28643
28644 MachineFunction &MF = DAG.getMachineFunction();
28645 MVT VT = Op.getSimpleValueType();
28646 SDLoc DL(Op);
28647
28648 // Save FP Control Word to stack slot
28649 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28650 SDValue StackSlot =
28651 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28652
28653 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28654
28655 SDValue Chain = Op.getOperand(0);
28656 SDValue Ops[] = {Chain, StackSlot};
28657 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28658 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28659 Align(2), MachineMemOperand::MOStore);
28660
28661 // Load FP Control Word from stack slot
28662 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28663 Chain = CWD.getValue(1);
28664
28665 // Mask and turn the control bits into a shift for the lookup table.
28666 SDValue Shift =
28667 DAG.getNode(ISD::SRL, DL, MVT::i16,
28668 DAG.getNode(ISD::AND, DL, MVT::i16,
28669 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28670 DAG.getConstant(9, DL, MVT::i8));
28671 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28672
28673 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28674 SDValue RetVal =
28675 DAG.getNode(ISD::AND, DL, MVT::i32,
28676 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28677 DAG.getConstant(3, DL, MVT::i32));
28678
28679 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28680
28681 return DAG.getMergeValues({RetVal, Chain}, DL);
28682}
28683
28684SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28685 SelectionDAG &DAG) const {
28686 MachineFunction &MF = DAG.getMachineFunction();
28687 SDLoc DL(Op);
28688 SDValue Chain = Op.getNode()->getOperand(0);
28689
28690 // FP control word may be set only from data in memory. So we need to allocate
28691 // stack space to save/load FP control word.
28692 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28693 SDValue StackSlot =
28694 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28695 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28696 MachineMemOperand *MMO =
28697 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28698
28699 // Store FP control word into memory.
28700 SDValue Ops[] = {Chain, StackSlot};
28701 Chain = DAG.getMemIntrinsicNode(
28702 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28703
28704 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28705 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28706 Chain = CWD.getValue(1);
28707 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28708 DAG.getConstant(0xf3ff, DL, MVT::i16));
28709
28710 // Calculate new rounding mode.
28711 SDValue NewRM = Op.getNode()->getOperand(1);
28712 SDValue RMBits;
28713 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28714 uint64_t RM = CVal->getZExtValue();
28715 int FieldVal = X86::getRoundingModeX86(RM);
28716
28717 if (FieldVal == X86::rmInvalid) {
28718 FieldVal = X86::rmToNearest;
28719 LLVMContext &C = MF.getFunction().getContext();
28720 C.diagnose(DiagnosticInfoUnsupported(
28721 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28722 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28723 }
28724 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28725 } else {
28726 // Need to convert argument into bits of control word:
28727 // 0 Round to 0 -> 11
28728 // 1 Round to nearest -> 00
28729 // 2 Round to +inf -> 10
28730 // 3 Round to -inf -> 01
28731 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28732 // To make the conversion, put all these values into a value 0xc9 and shift
28733 // it left depending on the rounding mode:
28734 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28735 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28736 // ...
28737 // (0xc9 << (2 * NewRM + 4)) & 0xc00
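 // Worked example (illustration): NewRM = 2 (round to +inf) gives a shift
 // of 2*2+4 = 8, and (0xc9 << 8) & 0xc00 == 0x800, i.e. bits 11:10 set to
 // 10 as required.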
28738 SDValue ShiftValue =
28739 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28740 DAG.getNode(ISD::ADD, DL, MVT::i32,
28741 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28742 DAG.getConstant(1, DL, MVT::i8)),
28743 DAG.getConstant(4, DL, MVT::i32)));
28744 SDValue Shifted =
28745 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28746 ShiftValue);
28747 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28748 DAG.getConstant(0xc00, DL, MVT::i16));
28749 }
28750
28751 // Update rounding mode bits and store the new FP Control Word into stack.
28752 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28753 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28754
28755 // Load FP control word from the slot.
28756 SDValue OpsLD[] = {Chain, StackSlot};
28757 MachineMemOperand *MMOL =
28758 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28759 Chain = DAG.getMemIntrinsicNode(
28760 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28761
28762 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28763 // same way but in bits 14:13.
28764 if (Subtarget.hasSSE1()) {
28765 // Store MXCSR into memory.
28766 Chain = DAG.getNode(
28767 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28768 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28769 StackSlot);
28770
28771 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28772 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28773 Chain = CWD.getValue(1);
28774 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28775 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28776
28777 // Shift X87 RM bits from 11:10 to 14:13.
28778 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28779 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28780 DAG.getConstant(3, DL, MVT::i8));
28781
28782 // Update rounding mode bits and store the new MXCSR into the stack slot.
28783 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28784 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28785
28786 // Load MXCSR from the slot.
28787 Chain = DAG.getNode(
28788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28789 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28790 StackSlot);
28791 }
28792
28793 return Chain;
28794}
28795
28796const unsigned X87StateSize = 28;
28797const unsigned FPStateSize = 32;
28798[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28799
28800SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28801 SelectionDAG &DAG) const {
28802 MachineFunction &MF = DAG.getMachineFunction();
28803 SDLoc DL(Op);
28804 SDValue Chain = Op->getOperand(0);
28805 SDValue Ptr = Op->getOperand(1);
28806 auto *Node = cast<FPStateAccessSDNode>(Op);
28807 EVT MemVT = Node->getMemoryVT();
28808 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28809 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28810
28811 // Get the x87 state, if present.
28812 if (Subtarget.hasX87()) {
28813 Chain =
28814 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28815 {Chain, Ptr}, MemVT, MMO);
28816
28817 // FNSTENV changes the exception mask, so load back the stored environment.
28818 MachineMemOperand::Flags NewFlags =
28819 MachineMemOperand::MOLoad |
28820 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28821 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28822 Chain =
28823 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28824 {Chain, Ptr}, MemVT, MMO);
28825 }
28826
28827 // If target supports SSE, get MXCSR as well.
28828 if (Subtarget.hasSSE1()) {
28829 // Get pointer to the MXCSR location in memory.
28830 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28831 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28832 DAG.getConstant(X87StateSize, DL, PtrVT));
28833 // Store MXCSR into memory.
28834 Chain = DAG.getNode(
28835 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28836 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28837 MXCSRAddr);
28838 }
28839
28840 return Chain;
28841}
28842
28843 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28844 EVT MemVT, MachineMemOperand *MMO,
28845 SelectionDAG &DAG,
28846 const X86Subtarget &Subtarget) {
28847 // Set the x87 state, if present.
28848 if (Subtarget.hasX87())
28849 Chain =
28850 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28851 {Chain, Ptr}, MemVT, MMO);
28852 // If target supports SSE, set MXCSR as well.
28853 if (Subtarget.hasSSE1()) {
28854 // Get pointer to the MXCSR location in memory.
28855 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28856 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28857 DAG.getConstant(X87StateSize, DL, PtrVT));
28858 // Load MXCSR from memory.
28859 Chain = DAG.getNode(
28860 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28861 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28862 MXCSRAddr);
28863 }
28864 return Chain;
28865}
28866
28867SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28868 SelectionDAG &DAG) const {
28869 SDLoc DL(Op);
28870 SDValue Chain = Op->getOperand(0);
28871 SDValue Ptr = Op->getOperand(1);
28872 auto *Node = cast<FPStateAccessSDNode>(Op);
28873 EVT MemVT = Node->getMemoryVT();
28874 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28875 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28876 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28877}
28878
28879SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28880 SelectionDAG &DAG) const {
28881 MachineFunction &MF = DAG.getMachineFunction();
28882 SDLoc DL(Op);
28883 SDValue Chain = Op.getNode()->getOperand(0);
28884
28885 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28886 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28887 SmallVector<Constant *, 8> FPEnvVals;
28888
28889 // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
28890 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28891 // for compatibility with glibc.
28892 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28893 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28894 Constant *Zero = ConstantInt::get(ItemTy, 0);
28895 for (unsigned I = 0; I < 6; ++I)
28896 FPEnvVals.push_back(Zero);
28897
28898 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
28899 // all exception flags, and set DAZ and FTZ to 0.
28900 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28901 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28902 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28903 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28904 MachinePointerInfo MPI =
28905 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28906 MachineMemOperand *MMO = MF.getMachineMemOperand(
28907 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
28908
28909 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28910}
28911
28912// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
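// For example (illustration added for clarity): getGFNICtrlImm(ISD::SHL, 1)
// evaluates to (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F), i.e.
// 0x0001020408102040, the Amt = 0 mask with each byte's shifted-out bit
// cleared.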
28913uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28914 assert((Amt < 8) && "Shift/Rotation amount out of range");
28915 switch (Opcode) {
28916 case ISD::BITREVERSE:
28917 return 0x8040201008040201ULL;
28918 case ISD::SHL:
28919 return ((0x0102040810204080ULL >> (Amt)) &
28920 (0x0101010101010101ULL * (0xFF >> (Amt))));
28921 case ISD::SRL:
28922 return ((0x0102040810204080ULL << (Amt)) &
28923 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28924 case ISD::SRA:
28925 return (getGFNICtrlImm(ISD::SRL, Amt) |
28926 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28927 case ISD::ROTL:
28928 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28929 case ISD::ROTR:
28930 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28931 }
28932 llvm_unreachable("Unsupported GFNI opcode");
28933}
28934
28935// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28936SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28937 MVT VT, unsigned Amt = 0) {
28938 assert(VT.getVectorElementType() == MVT::i8 &&
28939 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28940 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28941 SmallVector<SDValue> MaskBits;
28942 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28943 uint64_t Bits = (Imm >> (I % 64)) & 255;
28944 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28945 }
28946 return DAG.getBuildVector(VT, DL, MaskBits);
28947}
28948
28949/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28950//
28951// i8/i16 vector implemented using dword LZCNT vector instruction
28952// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28953 // split the vector, perform the operation on its Lo and Hi parts and
28954 // concatenate the results.
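// Worked example (illustration): for a v8i16 element 0x00F0,
// lzcnt(zext32(0x00F0)) == 24 and 24 - (32 - 16) == 8 == ctlz16(0x00F0).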
28955 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28956 const X86Subtarget &Subtarget) {
28957 assert(Op.getOpcode() == ISD::CTLZ);
28958 SDLoc dl(Op);
28959 MVT VT = Op.getSimpleValueType();
28960 MVT EltVT = VT.getVectorElementType();
28961 unsigned NumElems = VT.getVectorNumElements();
28962
28963 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28964 "Unsupported element type");
28965
28966 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28967 if (NumElems > 16 ||
28968 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28969 return splitVectorIntUnary(Op, DAG, dl);
28970
28971 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28972 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28973 "Unsupported value type for operation");
28974
28975 // Use native supported vector instruction vplzcntd.
28976 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28977 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28978 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28979 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28980
28981 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28982}
28983
28984// Lower CTLZ using a PSHUFB lookup table implementation.
28985 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28986 const X86Subtarget &Subtarget,
28987 SelectionDAG &DAG) {
28988 MVT VT = Op.getSimpleValueType();
28989 int NumElts = VT.getVectorNumElements();
28990 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28991 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28992
28993 // Per-nibble leading zero PSHUFB lookup table.
28994 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28995 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28996 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28997 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28998
28999 SmallVector<SDValue, 64> LUTVec;
29000 for (int i = 0; i < NumBytes; ++i)
29001 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29002 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29003
29004 // Begin by bitcasting the input to byte vector, then split those bytes
29005 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29006 // If the hi input nibble is zero then we add both results together, otherwise
29007 // we just take the hi result (by masking the lo result to zero before the
29008 // add).
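 // Worked example (illustration): for the byte 0x05 the hi nibble is zero, so
 // the result is LUT[0] + LUT[5] == 4 + 1 == 5 == ctlz8(0x05); for 0x1A the hi
 // nibble is non-zero, so only LUT[1] == 3 is used.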
29009 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29010 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29011
29012 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29013 SDValue Lo = Op0;
29014 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29015 SDValue HiZ;
29016 if (CurrVT.is512BitVector()) {
29017 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29018 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29019 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29020 } else {
29021 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29022 }
29023
29024 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29025 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29026 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29027 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29028
29029 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29030 // of the current vector width in the same way we did for the nibbles.
29031 // If the upper half of the input element is zero then add the halves'
29032 // leading zero counts together, otherwise just use the upper half's.
29033 // Double the width of the result until we are at target width.
29034 while (CurrVT != VT) {
29035 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29036 int CurrNumElts = CurrVT.getVectorNumElements();
29037 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29038 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29039 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29040
29041 // Check if the upper half of the input element is zero.
29042 if (CurrVT.is512BitVector()) {
29043 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29044 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29045 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29046 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29047 } else {
29048 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29049 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29050 }
29051 HiZ = DAG.getBitcast(NextVT, HiZ);
29052
29053 // Move the upper/lower halves to the lower bits as we'll be extending to
29054 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29055 // together.
29056 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29057 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29058 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29059 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29060 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29061 CurrVT = NextVT;
29062 }
29063
29064 return Res;
29065}
29066
29067 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29068 const X86Subtarget &Subtarget,
29069 SelectionDAG &DAG) {
29070 MVT VT = Op.getSimpleValueType();
29071
29072 if (Subtarget.hasCDI() &&
29073 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29074 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29075 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29076
29077 // Decompose 256-bit ops into smaller 128-bit ops.
29078 if (VT.is256BitVector() && !Subtarget.hasInt256())
29079 return splitVectorIntUnary(Op, DAG, DL);
29080
29081 // Decompose 512-bit ops into smaller 256-bit ops.
29082 if (VT.is512BitVector() && !Subtarget.hasBWI())
29083 return splitVectorIntUnary(Op, DAG, DL);
29084
29085 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29086 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29087}
29088
29089 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29090 SelectionDAG &DAG,
29091 const X86Subtarget &Subtarget) {
29092 MVT VT = Op.getSimpleValueType();
29093 SDValue Input = Op.getOperand(0);
29094
29095 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29096 "Expected vXi8 input for GFNI-based CTLZ lowering");
29097
29098 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29099
29100 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29101 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29102
29103 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29104 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29105 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29106
29107 SDValue LZCNT =
29108 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29109 DAG.getTargetConstant(8, DL, MVT::i8));
29110 return LZCNT;
29111}
29112
29113static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29114 SelectionDAG &DAG) {
29115 MVT VT = Op.getSimpleValueType();
29116 MVT OpVT = VT;
29117 unsigned NumBits = VT.getSizeInBits();
29118 SDLoc dl(Op);
29119 unsigned Opc = Op.getOpcode();
29120
29121 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29122 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29123
29124 if (VT.isVector())
29125 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29126
29127 Op = Op.getOperand(0);
29128 if (VT == MVT::i8) {
29129 // Zero extend to i32 since there is not an i8 bsr.
29130 OpVT = MVT::i32;
29131 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29132 }
29133
29134 // Check if we can safely pass a result through BSR for zero sources.
29135 SDValue PassThru = DAG.getUNDEF(OpVT);
29136 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29137 !DAG.isKnownNeverZero(Op))
29138 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29139
29140 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29141 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29142 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29143
29144 // Skip CMOV if we're using a pass through value.
29145 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29146 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29147 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29148 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29149 Op.getValue(1)};
29150 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29151 }
29152
29153 // Finally xor with NumBits-1.
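 // E.g. (illustration) for i32 x = 0x00010000, BSR yields 16 and
 // 16 ^ 31 == 15 == ctlz(x).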
29154 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29155 DAG.getConstant(NumBits - 1, dl, OpVT));
29156
29157 if (VT == MVT::i8)
29158 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29159 return Op;
29160}
29161
29162static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29163 SelectionDAG &DAG) {
29164 MVT VT = Op.getSimpleValueType();
29165 unsigned NumBits = VT.getScalarSizeInBits();
29166 SDValue N0 = Op.getOperand(0);
29167 SDLoc dl(Op);
29168 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29169
29170 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29171 "Only scalar CTTZ requires custom lowering");
29172
29173 // Check if we can safely pass a result through BSF for zero sources.
29174 SDValue PassThru = DAG.getUNDEF(VT);
29175 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29176 PassThru = DAG.getConstant(NumBits, dl, VT);
29177
29178 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29179 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29180 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29181
29182 // Skip CMOV if src is never zero or we're using a pass through value.
29183 if (NonZeroSrc || !PassThru.isUndef())
29184 return Op;
29185
29186 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29187 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29188 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29189 Op.getValue(1)};
29190 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29191}
29192
29193 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Op.getSimpleValueType();
29196 SDLoc DL(Op);
29197
29198 if (VT == MVT::i16 || VT == MVT::i32)
29199 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29200
29201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29202 return splitVectorIntBinary(Op, DAG, DL);
29203
29204 assert(Op.getSimpleValueType().is256BitVector() &&
29205 Op.getSimpleValueType().isInteger() &&
29206 "Only handle AVX 256-bit vector integer operation");
29207 return splitVectorIntBinary(Op, DAG, DL);
29208}
29209
29210 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29211 const X86Subtarget &Subtarget) {
29212 MVT VT = Op.getSimpleValueType();
29213 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29214 unsigned Opcode = Op.getOpcode();
29215 SDLoc DL(Op);
29216
29217 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29218 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29219 assert(Op.getSimpleValueType().isInteger() &&
29220 "Only handle AVX vector integer operation");
29221 return splitVectorIntBinary(Op, DAG, DL);
29222 }
29223
29224 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29226 EVT SetCCResultType =
29227 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29228
29229 unsigned BitWidth = VT.getScalarSizeInBits();
29230 if (Opcode == ISD::USUBSAT) {
29231 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29232 // Handle a special-case with a bit-hack instead of cmp+select:
29233 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29234 // If the target can use VPTERNLOG, DAGToDAG will match this as
29235 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29236 // "broadcast" constant load.
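 // Illustrative check (editor's addition): for i16 with SMIN = 0x8000, if
 // the sign bit of X is set then X s>> 15 is all ones and X ^ 0x8000 ==
 // X - 0x8000, the saturated difference; otherwise X s>> 15 is zero and
 // the result is 0, exactly as usubsat requires.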
29237 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29238 if (C && C->getAPIntValue().isSignMask()) {
29239 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29240 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29241 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29242 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29243 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29244 }
29245 }
29246 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29247 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29248 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29249 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29250 // TODO: Move this to DAGCombiner?
29251 if (SetCCResultType == VT &&
29252 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29253 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29254 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29255 }
29256 }
29257
29258 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29259 (!VT.isVector() || VT == MVT::v2i64)) {
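 // Illustrative example (editor's addition): saddsat(INT_MAX, 1) makes
 // SADDO report overflow with a wrapped sum of INT_MIN (negative), so
 // SatMax is selected below.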
29260 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29261 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29262 SDValue Zero = DAG.getConstant(0, DL, VT);
29263 SDValue Result =
29264 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29265 DAG.getVTList(VT, SetCCResultType), X, Y);
29266 SDValue SumDiff = Result.getValue(0);
29267 SDValue Overflow = Result.getValue(1);
29268 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29269 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29270 SDValue SumNeg =
29271 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29272 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29273 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29274 }
29275
29276 // Use default expansion.
29277 return SDValue();
29278}
29279
29280static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29281 SelectionDAG &DAG) {
29282 MVT VT = Op.getSimpleValueType();
29283 SDLoc DL(Op);
29284
29285 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29286 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29287 // 8-bit integer abs to NEG and CMOV.
29288 SDValue N0 = Op.getOperand(0);
29289 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29290 DAG.getConstant(0, DL, VT), N0);
29291 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29292 SDValue(Neg.getNode(), 1)};
29293 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29294 }
29295
29296 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29297 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29298 SDValue Src = Op.getOperand(0);
29299 SDValue Neg = DAG.getNegative(Src, DL, VT);
29300 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29301 }
29302
29303 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29304 assert(VT.isInteger() &&
29305 "Only handle AVX 256-bit vector integer operation");
29306 return splitVectorIntUnary(Op, DAG, DL);
29307 }
29308
29309 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29310 return splitVectorIntUnary(Op, DAG, DL);
29311
29312 // Default to expand.
29313 return SDValue();
29314}
29315
29316static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29317 SelectionDAG &DAG) {
29318 MVT VT = Op.getSimpleValueType();
29319 SDLoc DL(Op);
29320
29321 // For AVX1 cases, split to use legal ops.
29322 if (VT.is256BitVector() && !Subtarget.hasInt256())
29323 return splitVectorIntBinary(Op, DAG, DL);
29324
29325 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29326 return splitVectorIntBinary(Op, DAG, DL);
29327
29328 // Default to expand.
29329 return SDValue();
29330}
29331
29332static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29333 SelectionDAG &DAG) {
29334 MVT VT = Op.getSimpleValueType();
29335 SDLoc DL(Op);
29336
29337 // For AVX1 cases, split to use legal ops.
29338 if (VT.is256BitVector() && !Subtarget.hasInt256())
29339 return splitVectorIntBinary(Op, DAG, DL);
29340
29341 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29342 return splitVectorIntBinary(Op, DAG, DL);
29343
29344 // Default to expand.
29345 return SDValue();
29346}
29347
29348 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29349 SelectionDAG &DAG) {
29350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29351 EVT VT = Op.getValueType();
29352 SDValue X = Op.getOperand(0);
29353 SDValue Y = Op.getOperand(1);
29354 SDLoc DL(Op);
29355 bool IsMaxOp =
29356 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29357 bool IsNum =
29358 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29359 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29360 unsigned Opc = 0;
29361 if (VT.isVector())
29362 Opc = X86ISD::VMINMAX;
29363 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29364 Opc = X86ISD::VMINMAXS;
29365
29366 if (Opc) {
29367 SDValue Imm =
29368 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29369 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29370 }
29371 }
29372
29373 uint64_t SizeInBits = VT.getScalarSizeInBits();
29374 APInt PreferredZero = APInt::getZero(SizeInBits);
29375 APInt OppositeZero = PreferredZero;
29376 EVT IVT = VT.changeTypeToInteger();
29377 X86ISD::NodeType MinMaxOp;
29378 if (IsMaxOp) {
29379 MinMaxOp = X86ISD::FMAX;
29380 OppositeZero.setSignBit();
29381 } else {
29382 PreferredZero.setSignBit();
29383 MinMaxOp = X86ISD::FMIN;
29384 }
29385 EVT SetCCType =
29386 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29387
29388 // The tables below show the expected result of Max in cases of NaN and
29389 // signed zeros.
29390 //
29391 // Y Y
29392 // Num xNaN +0 -0
29393 // --------------- ---------------
29394 // Num | Max | Y | +0 | +0 | +0 |
29395 // X --------------- X ---------------
29396 // xNaN | X | X/Y | -0 | +0 | -0 |
29397 // --------------- ---------------
29398 //
29399 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29400 // reordering.
29401 //
29402 // We check if any of operands is NaN and return NaN. Then we check if any of
29403 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29404 // to ensure the correct zero is returned.
29405 auto MatchesZero = [](SDValue Op, APInt Zero) {
29406 Op = peekThroughBitcasts(Op);
29407 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29408 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29409 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29410 return CstOp->getAPIntValue() == Zero;
29411 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29412 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29413 for (const SDValue &OpVal : Op->op_values()) {
29414 if (OpVal.isUndef())
29415 continue;
29416 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29417 if (!CstOp)
29418 return false;
29419 if (!CstOp->getValueAPF().isZero())
29420 continue;
29421 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29422 return false;
29423 }
29424 return true;
29425 }
29426 return false;
29427 };
29428
29429 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29430 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29431 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29432 Op->getFlags().hasNoSignedZeros() ||
29433 DAG.isKnownNeverZeroFloat(X) ||
29434 DAG.isKnownNeverZeroFloat(Y);
29435 SDValue NewX, NewY;
29436 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29437 MatchesZero(X, OppositeZero)) {
29438 // Operands are already in right order or order does not matter.
29439 NewX = X;
29440 NewY = Y;
29441 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29442 NewX = Y;
29443 NewY = X;
29444 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29445 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29446 if (IsXNeverNaN)
29447 std::swap(X, Y);
29448 // VFPCLASSS consumes a vector type. So provide a minimal one corresponding
29449 // to an xmm register.
29450 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29451 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29452 // Bits of classes:
29453 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29454 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29455 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29456 DL, MVT::i32);
29457 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29458 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29459 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29460 DAG.getVectorIdxConstant(0, DL));
29461 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29462 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29463 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29464 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29465 } else {
29466 SDValue IsXSigned;
29467 if (Subtarget.is64Bit() || VT != MVT::f64) {
29468 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29469 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29470 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29471 } else {
29472 assert(VT == MVT::f64);
29473 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29474 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29475 DAG.getVectorIdxConstant(0, DL));
29476 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29477 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29478 DAG.getVectorIdxConstant(1, DL));
29479 Hi = DAG.getBitcast(MVT::i32, Hi);
29480 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29481 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29482 *DAG.getContext(), MVT::i32);
29483 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29484 }
29485 if (MinMaxOp == X86ISD::FMAX) {
29486 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29487 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29488 } else {
29489 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29490 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29491 }
29492 }
29493
29494 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29495 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29496
29497 // If we did not reorder the operands for signed-zero handling, we still need
29498 // to handle NaN, and we know that one of the operands is not NaN, then:
29499 // - For minimum/maximum, put it in the first operand,
29500 // - For minimumnum/maximumnum, put it in the second operand,
29501 // and we will not need to post-process NaN after the max/min.
29502 if (IgnoreSignedZero && !IgnoreNaN &&
29503 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29504 std::swap(NewX, NewY);
29505
29506 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29507
29508 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29509 return MinMax;
29510
29511 if (DAG.isKnownNeverNaN(NewX))
29512 NewX = NewY;
29513
29514 SDValue IsNaN =
29515 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29516
29517 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29518}
29519
29520static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29521 SelectionDAG &DAG) {
29522 MVT VT = Op.getSimpleValueType();
29523 SDLoc dl(Op);
29524
29525 // For AVX1 cases, split to use legal ops.
29526 if (VT.is256BitVector() && !Subtarget.hasInt256())
29527 return splitVectorIntBinary(Op, DAG, dl);
29528
29529 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29530 return splitVectorIntBinary(Op, DAG, dl);
29531
29532 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29534
29535 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29536 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29537 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29538
29539 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29541 if (VT.bitsGE(MVT::i32)) {
29542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29543 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29544 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29545 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29546 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29547 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29548 DAG.getTargetConstant(CC, dl, MVT::i8),
29549 Diff1.getValue(1));
29550 }
29551
29552 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29553 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29554 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29555 MVT WideVT = MVT::getIntegerVT(WideBits);
29556 if (TLI.isTypeLegal(WideVT)) {
29557 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29558 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29559 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29560 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29561 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29562 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29563 DAG.getTargetConstant(CC, dl, MVT::i8),
29564 Diff1.getValue(1));
29565 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29566 }
29567 }
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 SDLoc dl(Op);
29576 MVT VT = Op.getSimpleValueType();
29577
29578 // Decompose 256-bit ops into 128-bit ops.
29579 if (VT.is256BitVector() && !Subtarget.hasInt256())
29580 return splitVectorIntBinary(Op, DAG, dl);
29581
29582 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29583 return splitVectorIntBinary(Op, DAG, dl);
29584
29585 SDValue A = Op.getOperand(0);
29586 SDValue B = Op.getOperand(1);
29587
29588 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29589 // vector pairs, multiply and truncate.
29590 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29591 unsigned NumElts = VT.getVectorNumElements();
29592 unsigned NumLanes = VT.getSizeInBits() / 128;
29593 unsigned NumEltsPerLane = NumElts / NumLanes;
29594
29595 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29596 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29597 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29598 return DAG.getNode(
29599 ISD::TRUNCATE, dl, VT,
29600 DAG.getNode(ISD::MUL, dl, ExVT,
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29602 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29603 }
29604
29605 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29606
29607 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29608 // Don't do this if we only need to unpack one half.
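 // Editor's note (based on the documented PMADDUBSW semantics): pmaddubsw
 // multiplies unsigned bytes of its first operand by signed bytes of its
 // second and adds horizontal pairs into i16 lanes. With every other byte
 // of B zeroed (BLo keeps the even bytes, BHi the odd ones), each i16 lane
 // holds a single byte product, and its low 8 bits equal the i8 multiply
 // regardless of signedness.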
29609 if (Subtarget.hasSSSE3()) {
29610 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29611 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29612 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29613 if (BIsBuildVector) {
29614 for (auto [Idx, Val] : enumerate(B->ops())) {
29615 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29616 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29617 else
29618 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29619 }
29620 }
29621 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29622 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29623 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29624 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29625 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29626 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29627 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29628 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29629 DAG.getTargetConstant(8, dl, MVT::i8));
29630 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29631 }
29632 }
29633
29634 // Extract the lo/hi parts to any extend to i16.
29635 // We're going to mask off the low byte of each result element of the
29636 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29637 // element.
29638 SDValue Undef = DAG.getUNDEF(VT);
29639 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29640 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29641
29642 SDValue BLo, BHi;
29643 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29644 // If the RHS is a constant, manually unpackl/unpackh.
29645 SmallVector<SDValue, 16> LoOps, HiOps;
29646 for (unsigned i = 0; i != NumElts; i += 16) {
29647 for (unsigned j = 0; j != 8; ++j) {
29648 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29649 MVT::i16));
29650 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29651 MVT::i16));
29652 }
29653 }
29654
29655 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29656 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29657 } else {
29658 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29659 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29660 }
29661
29662 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29663 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29664 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29665 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29666 }
29667
29668 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29669 if (VT == MVT::v4i32) {
29670 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29671 "Should not custom lower when pmulld is available!");
29672
29673 // Extract the odd parts.
29674 static const int UnpackMask[] = {1, 1, 3, 3};
29675 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29676 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29677
29678 // Multiply the even parts.
29679 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29680 DAG.getBitcast(MVT::v2i64, A),
29681 DAG.getBitcast(MVT::v2i64, B));
29682 // Now multiply odd parts.
29683 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29684 DAG.getBitcast(MVT::v2i64, Aodds),
29685 DAG.getBitcast(MVT::v2i64, Bodds));
29686
29687 Evens = DAG.getBitcast(VT, Evens);
29688 Odds = DAG.getBitcast(VT, Odds);
29689
29690 // Merge the two vectors back together with a shuffle. This expands into 2
29691 // shuffles.
29692 static const int ShufMask[] = { 0, 4, 2, 6 };
29693 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29694 }
29695
29696 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29697 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29698 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29699
29700 // Ahi = psrlqi(a, 32);
29701 // Bhi = psrlqi(b, 32);
29702 //
29703 // AloBlo = pmuludq(a, b);
29704 // AloBhi = pmuludq(a, Bhi);
29705 // AhiBlo = pmuludq(Ahi, b);
29706 //
29707 // Hi = psllqi(AloBhi + AhiBlo, 32);
29708 // return AloBlo + Hi;
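 // Editor's note: this follows from
 //   (Alo + 2^32*Ahi) * (Blo + 2^32*Bhi)
 //     = AloBlo + 2^32*(AloBhi + AhiBlo) + 2^64*AhiBhi
 // with the last term vanishing modulo 2^64.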
29709 KnownBits AKnown = DAG.computeKnownBits(A);
29710 KnownBits BKnown = DAG.computeKnownBits(B);
29711
29712 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29713 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29714 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29715
29716 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29717 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29718 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29719
29720 SDValue Zero = DAG.getConstant(0, dl, VT);
29721
29722 // Only multiply lo/hi halves that aren't known to be zero.
29723 SDValue AloBlo = Zero;
29724 if (!ALoIsZero && !BLoIsZero)
29725 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29726
29727 SDValue AloBhi = Zero;
29728 if (!ALoIsZero && !BHiIsZero) {
29729 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29730 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29731 }
29732
29733 SDValue AhiBlo = Zero;
29734 if (!AHiIsZero && !BLoIsZero) {
29735 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29736 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29737 }
29738
29739 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29740 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29741
29742 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29743}
29744
29745 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29746 MVT VT, bool IsSigned,
29747 const X86Subtarget &Subtarget,
29748 SelectionDAG &DAG,
29749 SDValue *Low = nullptr) {
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29753 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29754 // lane results back together.
29755
29756 // We'll take different approaches for signed and unsigned.
29757 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29758 // and use pmullw to calculate the full 16-bit product.
29759 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29760 // shift them left into the upper byte of each word. This allows us to use
29761 // pmulhw to calculate the full 16-bit product. This trick means we don't
29762 // need to sign extend the bytes to use pmullw.
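 // Editor's note (why the signed trick works): a signed byte a placed in the
 // upper byte of an i16 represents the value a*256, and pmulhw computes
 // (a*256 * b*256) >> 16 == a*b exactly, since a*b always fits in 16 bits.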
29763
29764 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29765 SDValue Zero = DAG.getConstant(0, dl, VT);
29766
29767 SDValue ALo, AHi;
29768 if (IsSigned) {
29769 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29770 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29771 } else {
29772 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29773 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29774 }
29775
29776 SDValue BLo, BHi;
29777 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29778 // If the RHS is a constant, manually unpackl/unpackh and extend.
29779 SmallVector<SDValue, 16> LoOps, HiOps;
29780 for (unsigned i = 0; i != NumElts; i += 16) {
29781 for (unsigned j = 0; j != 8; ++j) {
29782 SDValue LoOp = B.getOperand(i + j);
29783 SDValue HiOp = B.getOperand(i + j + 8);
29784
29785 if (IsSigned) {
29786 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29787 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29788 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29789 DAG.getConstant(8, dl, MVT::i16));
29790 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29791 DAG.getConstant(8, dl, MVT::i16));
29792 } else {
29793 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29794 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29795 }
29796
29797 LoOps.push_back(LoOp);
29798 HiOps.push_back(HiOp);
29799 }
29800 }
29801
29802 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29803 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29804 } else if (IsSigned) {
29805 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29806 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29807 } else {
29808 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29809 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29810 }
29811
29812 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29813 // pack back to vXi8.
29814 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29815 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29816 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29817
29818 if (Low)
29819 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29820
29821 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29822}
29823
29824static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29825 SelectionDAG &DAG) {
29826 SDLoc dl(Op);
29827 MVT VT = Op.getSimpleValueType();
29828 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29829 unsigned NumElts = VT.getVectorNumElements();
29830 SDValue A = Op.getOperand(0);
29831 SDValue B = Op.getOperand(1);
29832
29833 // Decompose 256-bit ops into 128-bit ops.
29834 if (VT.is256BitVector() && !Subtarget.hasInt256())
29835 return splitVectorIntBinary(Op, DAG, dl);
29836
29837 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29838 return splitVectorIntBinary(Op, DAG, dl);
29839
29840 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29841 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29842 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29843 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29844
29845 // PMULxD operations multiply each even value (starting at 0) of LHS with
29846 // the related value of RHS and produce a widened result.
29847 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 //
29850 // In other words, to have all the results, we need to perform two PMULxD:
29851 // 1. one with the even values.
29852 // 2. one with the odd values.
29853 // To achieve #2, we need to place the odd values at an even position.
29854 //
29855 // Place the odd value at an even position (basically, shift all values 1
29856 // step to the left):
29857 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29858 9, -1, 11, -1, 13, -1, 15, -1};
29859 // <a|b|c|d> => <b|undef|d|undef>
29860 SDValue Odd0 =
29861 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29862 // <e|f|g|h> => <f|undef|h|undef>
29863 SDValue Odd1 =
29864 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29865
29866 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29867 // ints.
29868 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29869 unsigned Opcode =
29870 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29871 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29872 // => <2 x i64> <ae|cg>
29873 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29874 DAG.getBitcast(MulVT, A),
29875 DAG.getBitcast(MulVT, B)));
29876 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29877 // => <2 x i64> <bf|dh>
29878 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29879 DAG.getBitcast(MulVT, Odd0),
29880 DAG.getBitcast(MulVT, Odd1)));
29881
29882 // Shuffle it back into the right order.
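 // E.g. (illustration) for v4i32 this mask is <1, 5, 3, 7>, interleaving
 // the high halves of Mul1 (<ae|cg>) and Mul2 (<bf|dh>) to give
 // <hi(ae), hi(bf), hi(cg), hi(dh)>.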
29883 SmallVector<int, 16> ShufMask(NumElts);
29884 for (int i = 0; i != (int)NumElts; ++i)
29885 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29886
29887 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29888
29889 // If we have a signed multiply but no PMULDQ, fix up the result of an
29890 // unsigned multiply.
29891 if (IsSigned && !Subtarget.hasSSE41()) {
29892 SDValue Zero = DAG.getConstant(0, dl, VT);
29893 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29894 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29895 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29896 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29897
29898 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29899 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29900 }
29901
29902 return Res;
29903 }
29904
29905 // Only i8 vectors should need custom lowering after this.
29906 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29907 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29908 "Unsupported vector type");
29909
29910 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29911 // logical shift down the upper half and pack back to i8.
29912
29913 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29914 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29915
29916 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29917 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29918 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29919 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29920 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29921 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29922 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29923 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29924 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29925 }
29926
29927 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29928}
29929
29930// Custom lowering for SMULO/UMULO.
29931static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29932 SelectionDAG &DAG) {
29933 MVT VT = Op.getSimpleValueType();
29934
29935 // Scalars defer to LowerXALUO.
29936 if (!VT.isVector())
29937 return LowerXALUO(Op, DAG);
29938
29939 SDLoc dl(Op);
29940 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29941 SDValue A = Op.getOperand(0);
29942 SDValue B = Op.getOperand(1);
29943 EVT OvfVT = Op->getValueType(1);
29944
29945 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29946 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29947 // Extract the LHS Lo/Hi vectors
29948 SDValue LHSLo, LHSHi;
29949 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29950
29951 // Extract the RHS Lo/Hi vectors
29952 SDValue RHSLo, RHSHi;
29953 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29954
29955 EVT LoOvfVT, HiOvfVT;
29956 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29957 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29958 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29959
29960 // Issue the split operations.
29961 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29962 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29963
29964 // Join the separate data results and the overflow results.
29965 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29966 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29967 Hi.getValue(1));
29968
29969 return DAG.getMergeValues({Res, Ovf}, dl);
29970 }
29971
29972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29973 EVT SetccVT =
29974 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29975
29976 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29977 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29978 unsigned NumElts = VT.getVectorNumElements();
29979 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29980 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29981 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29982 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29983 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29984
29985 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29986
29987 SDValue Ovf;
29988 if (IsSigned) {
29989 SDValue High, LowSign;
29990 if (OvfVT.getVectorElementType() == MVT::i1 &&
29991 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29992 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29993 // Shift the high down filling with sign bits.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29995 // Fill all 16 bits with the sign bit from the low.
29996 LowSign =
29997 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29998 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29999 15, DAG);
30000 SetccVT = OvfVT;
30001 if (!Subtarget.hasBWI()) {
30002 // We can't do a vXi16 compare so sign extend to v16i32.
30003 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30004 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30005 }
30006 } else {
30007 // Otherwise do the compare at vXi8.
30008 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30009 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30010 LowSign =
30011 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30012 }
30013
30014 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30015 } else {
30016 SDValue High =
30017 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30018 if (OvfVT.getVectorElementType() == MVT::i1 &&
30019 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30020 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30021 SetccVT = OvfVT;
30022 if (!Subtarget.hasBWI()) {
30023 // We can't do a vXi16 compare so sign extend to v16i32.
30024 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30025 }
30026 } else {
30027 // Otherwise do the compare at vXi8.
30028 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30029 }
30030
30031 Ovf =
30032 DAG.getSetCC(dl, SetccVT, High,
30033 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30034 }
30035
30036 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30037
30038 return DAG.getMergeValues({Low, Ovf}, dl);
30039 }
30040
30041 SDValue Low;
30042 SDValue High =
30043 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30044
30045 SDValue Ovf;
30046 if (IsSigned) {
30047 // SMULO overflows if the high bits don't match the sign of the low.
30048 SDValue LowSign =
30049 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30050 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30051 } else {
30052 // UMULO overflows if the high bits are non-zero.
30053 Ovf =
30054 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30055 }
30056
30057 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30058
30059 return DAG.getMergeValues({Low, Ovf}, dl);
30060}
30061
30062SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30063 assert(Subtarget.isTargetWin64() && "Unexpected target");
30064 EVT VT = Op.getValueType();
30065 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30066 "Unexpected return type for lowering");
30067
30068 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30069 SmallVector<SDValue> Result;
30070 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30071 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30072 }
30073
30074 RTLIB::Libcall LC;
30075 bool isSigned;
30076 switch (Op->getOpcode()) {
30077 // clang-format off
30078 default: llvm_unreachable("Unexpected request for libcall!");
30079 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30080 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30081 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30082 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30083 // clang-format on
30084 }
30085
30086 SDLoc dl(Op);
30087 SDValue InChain = DAG.getEntryNode();
30088
30089 TargetLowering::ArgListTy Args;
30090 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30091 EVT ArgVT = Op->getOperand(i).getValueType();
30092 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30093 "Unexpected argument type for lowering");
30094 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30095 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30096 MachinePointerInfo MPI =
30097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30098 InChain =
30099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30100 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30101 }
30102
30103 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30104 getPointerTy(DAG.getDataLayout()));
30105
30106 TargetLowering::CallLoweringInfo CLI(DAG);
30107 CLI.setDebugLoc(dl)
30108 .setChain(InChain)
30109 .setLibCallee(
30110 getLibcallCallingConv(LC),
30111 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30112 std::move(Args))
30113 .setInRegister()
30114 .setSExtResult(isSigned)
30115 .setZExtResult(!isSigned);
30116
30117 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30118 return DAG.getBitcast(VT, CallInfo.first);
30119}
30120
30121SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30122 SelectionDAG &DAG,
30123 SDValue &Chain) const {
30124 assert(Subtarget.isTargetWin64() && "Unexpected target");
30125 EVT VT = Op.getValueType();
30126 bool IsStrict = Op->isStrictFPOpcode();
30127
30128 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30129 EVT ArgVT = Arg.getValueType();
30130
30131 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30132 "Unexpected return type for lowering");
30133
30134 RTLIB::Libcall LC;
30135 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30136 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30137 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30138 else
30139 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30140 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30141
30142 SDLoc dl(Op);
30143 MakeLibCallOptions CallOptions;
30144 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30145
30146 SDValue Result;
30147 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30148 // expected VT (i128).
30149 std::tie(Result, Chain) =
30150 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30151 Result = DAG.getBitcast(VT, Result);
30152 return Result;
30153}
30154
30155SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30156 SelectionDAG &DAG) const {
30157 assert(Subtarget.isTargetWin64() && "Unexpected target");
30158 EVT VT = Op.getValueType();
30159 bool IsStrict = Op->isStrictFPOpcode();
30160
30161 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30162 EVT ArgVT = Arg.getValueType();
30163
30164 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30165 "Unexpected argument type for lowering");
30166
30167 RTLIB::Libcall LC;
30168 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30169 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30170 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30171 else
30172 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30173 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30174
30175 SDLoc dl(Op);
30176 MakeLibCallOptions CallOptions;
30177 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30178
30179 // Pass the i128 argument as an indirect argument on the stack.
30180 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30181 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30182 MachinePointerInfo MPI =
30183 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30184 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30185
30186 SDValue Result;
30187 std::tie(Result, Chain) =
30188 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30189 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30190}
30191
30192// Return true if the required (according to Opcode) shift-imm form is natively
30193// supported by the Subtarget
30194static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30195 unsigned Opcode) {
30196 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30197 "Unexpected shift opcode");
30198
30199 if (!VT.isSimple())
30200 return false;
30201
30202 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30203 return false;
30204
30205 if (VT.getScalarSizeInBits() < 16)
30206 return false;
30207
30208 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30209 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30210 return true;
30211
30212 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30213 (VT.is256BitVector() && Subtarget.hasInt256());
30214
30215 bool AShift = LShift && (Subtarget.hasAVX512() ||
30216 (VT != MVT::v2i64 && VT != MVT::v4i64));
30217 return (Opcode == ISD::SRA) ? AShift : LShift;
30218}
30219
30220// The shift amount is a variable, but it is the same for all vector lanes.
30221// These instructions are defined together with shift-immediate.
30222static
30223 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30224 unsigned Opcode) {
30225 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30226}
30227
30228// Return true if the required (according to Opcode) variable-shift form is
30229// natively supported by the Subtarget
30230static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30231 unsigned Opcode) {
30232 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30233 "Unexpected shift opcode");
30234
30235 if (!VT.isSimple())
30236 return false;
30237
30238 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30239 return false;
30240
30241 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30242 return false;
30243
30244 // vXi16 supported only on AVX-512, BWI
30245 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30246 return false;
30247
30248 if (Subtarget.hasAVX512() &&
30249 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30250 return true;
30251
30252 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30253 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30254 return (Opcode == ISD::SRA) ? AShift : LShift;
30255}
30256
30257 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30258 const X86Subtarget &Subtarget) {
30259 MVT VT = Op.getSimpleValueType();
30260 SDLoc dl(Op);
30261 SDValue R = Op.getOperand(0);
30262 SDValue Amt = Op.getOperand(1);
30263 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30265
30266 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30267 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30268 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30269 SDValue Ex = DAG.getBitcast(ExVT, R);
30270
30271 // ashr(R, 63) === cmp_slt(R, 0)
30272 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30273 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30274 "Unsupported PCMPGT op");
30275 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30276 }
30277
30278 if (ShiftAmt >= 32) {
30279 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30280 SDValue Upper =
30281 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30282 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30283 ShiftAmt - 32, DAG);
30284 if (VT == MVT::v2i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30286 if (VT == MVT::v4i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30288 {9, 1, 11, 3, 13, 5, 15, 7});
30289 } else {
30290 // SRA upper i32, SRL whole i64 and select lower i32.
30291 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30292 ShiftAmt, DAG);
30293 SDValue Lower =
30294 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30295 Lower = DAG.getBitcast(ExVT, Lower);
30296 if (VT == MVT::v2i64)
30297 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30298 if (VT == MVT::v4i64)
30299 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30300 {8, 1, 10, 3, 12, 5, 14, 7});
30301 }
30302 return DAG.getBitcast(VT, Ex);
30303 };
30304
30305 // Optimize shl/srl/sra with constant shift amount.
30306 APInt APIntShiftAmt;
30307 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30308 return SDValue();
30309
30310 // If the shift amount is out of range, return undef.
30311 if (APIntShiftAmt.uge(EltSizeInBits))
30312 return DAG.getUNDEF(VT);
30313
30314 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30315
30316 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30317 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30318
30319 // i64 SRA needs to be performed as partial shifts.
30320 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30321 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30322 Op.getOpcode() == ISD::SRA)
30323 return ArithmeticShiftRight64(ShiftAmt);
30324
30325 // If we're logical shifting an all-signbits value then we can just perform
30326 // this as a mask.
30327 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30328 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30329 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30330 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30331 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30332 }
30333
30334 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30335 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30336 unsigned NumElts = VT.getVectorNumElements();
30337 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30338
30339 // Simple i8 add case
30340 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30341 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30342 // must be 0). (add undef, undef) however can be any value. To make this
30343 // safe, we must freeze R to ensure that register allocation uses the same
30344 // register for an undefined value. This ensures that the result will
30345 // still be even and preserves the original semantics.
30346 R = DAG.getFreeze(R);
30347 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30348 }
30349
30350 // ashr(R, 7) === cmp_slt(R, 0)
30351 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30352 SDValue Zeros = DAG.getConstant(0, dl, VT);
30353 if (VT.is512BitVector()) {
30354 assert(VT == MVT::v64i8 && "Unexpected element type!");
30355 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30356 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30357 }
30358 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30359 }
30360
30361 // XOP can shift v16i8 directly instead of as a v8i16 shift + mask.
30362 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30363 return SDValue();
30364
30365 if (Subtarget.hasGFNI()) {
30366 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30367 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30368 DAG.getTargetConstant(0, dl, MVT::i8));
30369 }
30370
30371 if (Op.getOpcode() == ISD::SHL) {
30372 // Make a large shift.
30373 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30374 ShiftAmt, DAG);
30375 SHL = DAG.getBitcast(VT, SHL);
30376 // Zero out the rightmost bits.
30377 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30378 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30379 }
30380 if (Op.getOpcode() == ISD::SRL) {
30381 // Make a large shift.
30382 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30383 ShiftAmt, DAG);
30384 SRL = DAG.getBitcast(VT, SRL);
30385 // Zero out the leftmost bits.
30386 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30387 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30388 }
30389 if (Op.getOpcode() == ISD::SRA) {
30390 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
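// For example, with ShiftAmt == 3 an i8 lane holding 0x80 (-128) becomes
// 0x10 after the SRL; Mask is 128 >> 3 == 0x10, so the XOR gives 0x00 and
// the SUB yields 0xF0 (-16), matching the arithmetic shift -128 >> 3.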
30391 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30392
30393 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30394 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30395 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30396 return Res;
30397 }
30398 llvm_unreachable("Unknown shift opcode.");
30399 }
30400
30401 return SDValue();
30402}
30403
30404 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30405 const X86Subtarget &Subtarget) {
30406 MVT VT = Op.getSimpleValueType();
30407 SDLoc dl(Op);
30408 SDValue R = Op.getOperand(0);
30409 SDValue Amt = Op.getOperand(1);
30410 unsigned Opcode = Op.getOpcode();
30411 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30412
30413 int BaseShAmtIdx = -1;
30414 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30415 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30416 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30417 Subtarget, DAG);
30418
30419 // vXi8 shifts - shift as v8i16 + mask result.
30420 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30421 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30422 VT == MVT::v64i8) &&
30423 !Subtarget.hasXOP()) {
30424 unsigned NumElts = VT.getVectorNumElements();
30425 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30426 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30427 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30428 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30429
30430 // Create the mask using vXi16 shifts. For shift-rights we need to move
30431 // the upper byte down before splatting the vXi8 mask.
30432 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30433 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30434 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30435 if (Opcode != ISD::SHL)
30436 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30437 8, DAG);
30438 BitMask = DAG.getBitcast(VT, BitMask);
30439 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30440 SmallVector<int, 64>(NumElts, 0));
30441
30442 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30443 DAG.getBitcast(ExtVT, R), BaseShAmt,
30444 BaseShAmtIdx, Subtarget, DAG);
30445 Res = DAG.getBitcast(VT, Res);
30446 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30447
30448 if (Opcode == ISD::SRA) {
30449 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30450 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30451 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30452 SignMask =
30453 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30454 BaseShAmtIdx, Subtarget, DAG);
30455 SignMask = DAG.getBitcast(VT, SignMask);
30456 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30457 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30458 }
30459 return Res;
30460 }
30461 }
30462 }
30463
30464 return SDValue();
30465}
30466
30467// Convert a shift/rotate left amount to a multiplication scale factor.
30468 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30469 const X86Subtarget &Subtarget,
30470 SelectionDAG &DAG) {
30471 MVT VT = Amt.getSimpleValueType();
30472 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30473 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30474 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30475 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30476 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30477 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30478 return SDValue();
30479
30480 MVT SVT = VT.getVectorElementType();
30481 unsigned SVTBits = SVT.getSizeInBits();
30482 unsigned NumElems = VT.getVectorNumElements();
30483
30484 APInt UndefElts;
30485 SmallVector<APInt> EltBits;
30486 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30487 APInt One(SVTBits, 1);
30488 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30489 for (unsigned I = 0; I != NumElems; ++I) {
30490 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30491 continue;
30492 uint64_t ShAmt = EltBits[I].getZExtValue();
30493 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30494 }
30495 return DAG.getBuildVector(VT, dl, Elts);
30496 }
30497
30498 // If the target doesn't support variable shifts, use either FP conversion
30499 // or integer multiplication to avoid shifting each element individually.
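// For the v4i32 path below, (Amt << 23) + 0x3f800000 builds the IEEE-754
// encoding of the float 2^Amt directly in the exponent field (e.g. Amt == 5
// gives 0x42000000 == 32.0f), so the FP_TO_SINT recovers 1 << Amt.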
30500 if (VT == MVT::v4i32) {
30501 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30502 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30503 DAG.getConstant(0x3f800000U, dl, VT));
30504 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30505 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30506 }
30507
30508 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30509 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30510 SDValue Z = DAG.getConstant(0, dl, VT);
30511 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30512 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30513 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30514 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30515 if (Subtarget.hasSSE41())
30516 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30517 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30518 }
30519
30520 return SDValue();
30521}
30522
30523static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30524 SelectionDAG &DAG) {
30525 MVT VT = Op.getSimpleValueType();
30526 SDLoc dl(Op);
30527 SDValue R = Op.getOperand(0);
30528 SDValue Amt = Op.getOperand(1);
30529 unsigned NumElts = VT.getVectorNumElements();
30530 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30531 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30532
30533 unsigned Opc = Op.getOpcode();
30534 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30535 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30536
30537 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30538 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30539
30540 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30541 return V;
30542
30543 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30544 return V;
30545
30546 if (supportedVectorVarShift(VT, Subtarget, Opc))
30547 return Op;
30548
30549 // i64 vector arithmetic shift can be emulated with the transform:
30550 // M = lshr(SIGN_MASK, Amt)
30551 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30552 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30553 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30554 Opc == ISD::SRA) {
30555 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30556 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30557 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30558 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30559 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30560 return R;
30561 }
30562
30563 // XOP has 128-bit variable logical/arithmetic shifts.
30564 // +ve/-ve Amt = shift left/right.
30565 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30566 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30567 if (Opc == ISD::SRL || Opc == ISD::SRA)
30568 Amt = DAG.getNegative(Amt, dl, VT);
30569 if (Opc == ISD::SHL || Opc == ISD::SRL)
30570 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30571 if (Opc == ISD::SRA)
30572 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30573 }
30574
30575 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30576 // shifts per-lane and then shuffle the partial results back together.
30577 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30578 // Splat the shift amounts so the scalar shifts above will catch it.
30579 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30580 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30581 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30582 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30583 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30584 }
30585
30586 // Build a map of in-range constant amounts with an element mask of where they occur.
30587 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30588 if (ConstantAmt) {
30589 for (unsigned I = 0; I != NumElts; ++I) {
30590 SDValue A = Amt.getOperand(I);
30591 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30592 continue;
30593 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30594 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30595 if (!Inserted) {
30596 It->second.setBit(I);
30597 continue;
30598 }
30599 It->second = APInt::getOneBitSet(NumElts, I);
30600 }
30601 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30602 }
30603
30604 // If possible, lower this shift as a sequence of two shifts by
30605 // constant plus a BLENDing shuffle instead of scalarizing it.
30606 // Example:
30607 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30608 //
30609 // Could be rewritten as:
30610 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30611 //
30612 // The advantage is that the two shifts from the example would be
30613 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30614 if (UniqueCstAmt.size() == 2 &&
30615 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30616 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30617 unsigned AmtA = UniqueCstAmt.begin()->first;
30618 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30619 const APInt &MaskA = UniqueCstAmt.begin()->second;
30620 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30621 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30622 for (unsigned I = 0; I != NumElts; ++I) {
30623 if (MaskA[I])
30624 ShuffleMask[I] = I;
30625 if (MaskB[I])
30626 ShuffleMask[I] = I + NumElts;
30627 }
30628
30629 // Only perform this blend if we can perform it without loading a mask.
30630 if ((VT != MVT::v16i16 ||
30631 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30632 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30633 canWidenShuffleElements(ShuffleMask))) {
30634 SDValue Shift1 =
30635 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30636 SDValue Shift2 =
30637 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30638 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30639 }
30640 }
30641
30642 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30643 // using vYiM vector operations where X*N == Y*M and M > N.
30644 if (ConstantAmt &&
30645 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30646 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30647 !Subtarget.hasXOP()) {
30648 MVT NarrowScalarVT = VT.getScalarType();
30649 // We can do this extra fast if each pair of narrow elements is shifted by
30650 // the same amount by doing this SWAR style: use a shift to move the valid
30651 // bits to the right position, mask out any bits which crossed from one
30652 // element to the other.
30653 // This optimized lowering is only valid if the elements in a pair can
30654 // be treated identically.
30655 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30656 SmallVector<SDValue, 32> TmpAmtWideElts;
30657 int WideEltSizeInBits = EltSizeInBits;
30658 while (WideEltSizeInBits < 32) {
30659 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30660 // unprofitable.
30661 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30662 break;
30663 }
30664 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30665 bool SameShifts = true;
30666 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30667 unsigned DstI = SrcI / 2;
30668 // Both elements are undef? Make a note and keep going.
30669 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30670 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30671 continue;
30672 }
30673 // Even element is undef? We will shift it by the same shift amount as
30674 // the odd element.
30675 if (AmtWideElts[SrcI].isUndef()) {
30676 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30677 continue;
30678 }
30679 // Odd element is undef? We will shift it by the same shift amount as
30680 // the even element.
30681 if (AmtWideElts[SrcI + 1].isUndef()) {
30682 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30683 continue;
30684 }
30685 // Both elements are equal.
30686 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30687 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30688 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30689 continue;
30690 }
30691 // One of the provisional wide elements will not have the same shift
30692 // amount. Let's bail.
30693 SameShifts = false;
30694 break;
30695 }
30696 if (!SameShifts) {
30697 break;
30698 }
30699 WideEltSizeInBits *= 2;
30700 std::swap(TmpAmtWideElts, AmtWideElts);
30701 }
30702 APInt APIntShiftAmt;
30703 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30704 bool Profitable = WidenShift;
30705 // AVX512BW brings support for vpsllvw.
30706 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30707 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30708 Profitable = false;
30709 }
30710 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30711 // fairly cheaply in other ways.
30712 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30713 Profitable = false;
30714 }
30715 // Leave it up to GFNI if we have it around.
30716 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30717 // is probably a win to use other strategies in some cases.
30718 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30719 Profitable = false;
30720 }
30721
30722 // AVX1 does not have vpand which makes our masking impractical. It does
30723 // have vandps but that is an FP instruction and crossing FP<->int typically
30724 // has some cost.
30725 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30726 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30727 Profitable = false;
30728 }
30729 unsigned WideNumElts = AmtWideElts.size();
30730 // We are only dealing with identical pairs.
30731 if (Profitable && WideNumElts != NumElts) {
30732 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30733 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30734 // Cast the operand to vXiM.
30735 SDValue RWide = DAG.getBitcast(WideVT, R);
30736 // Create our new vector of shift amounts.
30737 SDValue AmtWide = DAG.getBuildVector(
30738 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30739 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30740 // Perform the actual shift.
30741 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30742 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30743 // Now we need to construct a mask which will "drop" bits that get
30744 // shifted past the LSB/MSB. For a logical shift left, it will look
30745 // like:
30746 // FullMask = (1 << EltSizeInBits) - 1
30747 // Mask = FullMask << Amt
30748 //
30749 // This masking ensures that bits cannot migrate from one narrow lane to
30750 // another. The construction of this mask will be constant folded.
30751 // The mask for a logical right shift is nearly identical, the only
30752 // difference is that the all ones mask is shifted right instead of left.
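// For example, a v16i8 SHL by 3 performed as v8i16: the splatted all-ones
// value shifted left by 3 is 0xF8 in every byte, which clears the three low
// bits of each byte that received data from the neighbouring narrow lane.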
30753 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30754 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30755 Mask = DAG.getBitcast(WideVT, Mask);
30756 // Finally, we mask the shifted vector with the SWAR mask.
30757 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30758 Masked = DAG.getBitcast(VT, Masked);
30759 if (Opc != ISD::SRA) {
30760 // Logical shifts are complete at this point.
30761 return Masked;
30762 }
30763 // At this point, we have done a *logical* shift right. We now need to
30764 // sign extend the result so that we get behavior equivalent to an
30765 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30766 // are `EltSizeInBits-AmtWide` bits wide.
30767 //
30768 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30769 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30770 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30771 // can use the following trick to accomplish this:
30772 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30773 // (Masked ^ SignBitMask) - SignBitMask
30774 //
30775 // When the sign bit is already clear, this will compute:
30776 // Masked + SignBitMask - SignBitMask
30777 //
30778 // This is equal to Masked which is what we want: the sign bit was clear
30779 // so sign extending should be a no-op.
30780 //
30781 // When the sign bit is set, this will compute:
30782 // Masked - SignBitmask - SignBitMask
30783 //
30784 // This is equal to Masked - 2*SignBitMask which will correctly sign
30785 // extend our result.
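// Worked example with EltSizeInBits == 8 and AmtWide == 3: a lane holding
// 0x90 (-112) becomes 0x12 after the masked logical shift, SignBitMask is
// 0x80 >> 3 == 0x10, and (0x12 ^ 0x10) - 0x10 == 0xF2 == -14 == -112 >> 3.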
30786 SDValue SplatHighBit =
30787 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30788 // This does not induce recursion, all operands are constants.
30789 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30790 SDValue FlippedSignBit =
30791 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30792 SDValue Subtraction =
30793 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30794 return Subtraction;
30795 }
30796 }
30797
30798 // If possible, lower this packed shift into a vector multiply instead of
30799 // expanding it into a sequence of scalar shifts.
30800 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30801 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30802 Subtarget.canExtendTo512BW())))
30803 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30804 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30805
30806 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30807 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
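// E.g. a lane shifted right by 5 uses Scale == 1 << (16 - 5) == 0x0800, and
// mulhu(x, 0x0800) == (x * 0x0800) >> 16 == x >> 5. Amt == 0 lanes need the
// select below since the required scale 1 << 16 does not fit in an i16.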
30808 if (Opc == ISD::SRL && ConstantAmt &&
30809 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30810 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30811 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30812 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30813 SDValue Zero = DAG.getConstant(0, dl, VT);
30814 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30815 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30816 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30817 }
30818 }
30819
30820 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30821 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30822 // TODO: Special case handling for shift by 0/1, really we can afford either
30823 // of these cases in pre-SSE41/XOP/AVX512 but not both.
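// E.g. an arithmetic shift right by 2 uses Scale == 1 << 14, and
// mulhs(x, 0x4000) == (x * 0x4000) >> 16 == x >> 2 with the sign preserved.
// Amt == 1 would need Scale == 1 << 15, which is negative as an i16, hence
// the extra VSRAI fallback below; Amt == 0 simply selects the original R.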
30824 if (Opc == ISD::SRA && ConstantAmt &&
30825 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30826 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30827 !Subtarget.hasAVX512()) ||
30828 DAG.isKnownNeverZero(Amt))) {
30829 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30830 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30831 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30832 SDValue Amt0 =
30833 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30834 SDValue Amt1 =
30835 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30836 SDValue Sra1 =
30837 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30838 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30839 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30840 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30841 }
30842 }
30843
30844 // v4i32 Non Uniform Shifts.
30845 // If the shift amount is constant we can shift each lane using the SSE2
30846 // immediate shifts, else we need to zero-extend each lane to the lower i64
30847 // and shift using the SSE2 variable shifts.
30848 // The separate results can then be blended together.
30849 if (VT == MVT::v4i32) {
30850 SDValue Amt0, Amt1, Amt2, Amt3;
30851 if (ConstantAmt) {
30852 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30853 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30854 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30855 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30856 } else {
30857 // The SSE2 shifts use the lower i64 as the same shift amount for
30858 // all lanes and the upper i64 is ignored. On AVX we're better off
30859 // just zero-extending, but for SSE just duplicating the top 16-bits is
30860 // cheaper and has the same effect for out of range values.
30861 if (Subtarget.hasAVX()) {
30862 SDValue Z = DAG.getConstant(0, dl, VT);
30863 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30864 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30865 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30866 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30867 } else {
30868 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30869 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30870 {4, 5, 6, 7, -1, -1, -1, -1});
30871 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30872 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30873 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30874 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30875 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30876 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30877 }
30878 }
30879
30880 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30881 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30882 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30883 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30884 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30885
30886 // Merge the shifted lane results optimally with/without PBLENDW.
30887 // TODO - ideally shuffle combining would handle this.
30888 if (Subtarget.hasSSE41()) {
30889 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30890 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30891 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30892 }
30893 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30894 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30895 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30896 }
30897
30898 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30899 // look up the pre-computed shift values.
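// PSHUFB uses each byte of Amt as a table index, so a per-128-bit-lane table
// holding the splatted constant pre-shifted by 0..7 turns the variable shift
// into a single byte lookup.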
30900 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30901 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30902 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30903 unsigned NumLanes = VT.getSizeInBits() / 128u;
30904 unsigned NumEltsPerLane = NumElts / NumLanes;
30905 SmallVector<APInt, 64> LUT;
30906 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30907 unsigned LoElt = Lane * NumEltsPerLane;
30908 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30909 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30910 if (!KnownLane.isConstant())
30911 break;
30912 const APInt &LaneSplat = KnownLane.getConstant();
30913 for (unsigned I = 0; I != 8; ++I) {
30914 if (Opc == ISD::SHL)
30915 LUT.push_back(LaneSplat.shl(I));
30916 else if (Opc == ISD::SRL)
30917 LUT.push_back(LaneSplat.lshr(I));
30918 else if (Opc == ISD::SRA)
30919 LUT.push_back(LaneSplat.ashr(I));
30920 }
30921 LUT.append(8, APInt::getZero(8));
30922 }
30923 if (LUT.size() == NumElts) {
30924 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30925 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30926 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30927 }
30928 }
30929
30930 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30931 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30932 // make the existing SSE solution better.
30933 // NOTE: We honor preferred vector width before promoting to 512-bits.
30934 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30935 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30936 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30937 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30938 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30939 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30940 "Unexpected vector type");
30941 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30942 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30943 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30944 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30945 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30946 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30947 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30948 }
30949
30950 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30951 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30952 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30953 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30954 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30955 !Subtarget.hasXOP()) {
30956 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30957 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30958
30959 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30960 // isn't legal).
30961 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30962 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30963 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30964 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30966 "Constant build vector expected");
30967
30968 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30969 bool IsSigned = Opc == ISD::SRA;
30970 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30971 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30972 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30973 return DAG.getZExtOrTrunc(R, dl, VT);
30974 }
30975
30976 SmallVector<SDValue, 16> LoAmt, HiAmt;
30977 for (unsigned i = 0; i != NumElts; i += 16) {
30978 for (int j = 0; j != 8; ++j) {
30979 LoAmt.push_back(Amt.getOperand(i + j));
30980 HiAmt.push_back(Amt.getOperand(i + j + 8));
30981 }
30982 }
30983
30984 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30985 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30986
30987 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30988 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30989 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30990 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30991 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30992 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30993 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30994 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30995 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30996 }
30997
30998 if (VT == MVT::v16i8 ||
30999 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31000 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31001 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31002
31003 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31004 if (VT.is512BitVector()) {
31005 // On AVX512BW targets we make use of the fact that VSELECT lowers
31006 // to a masked blend which selects bytes based just on the sign bit
31007 // extracted to a mask.
31008 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31009 V0 = DAG.getBitcast(VT, V0);
31010 V1 = DAG.getBitcast(VT, V1);
31011 Sel = DAG.getBitcast(VT, Sel);
31012 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31013 ISD::SETGT);
31014 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31015 } else if (Subtarget.hasSSE41()) {
31016 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31017 // on the sign bit.
31018 V0 = DAG.getBitcast(VT, V0);
31019 V1 = DAG.getBitcast(VT, V1);
31020 Sel = DAG.getBitcast(VT, Sel);
31021 return DAG.getBitcast(SelVT,
31022 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31023 }
31024 // On pre-SSE41 targets we test for the sign bit by comparing to
31025 // zero - a negative value will set all bits of the lanes to true
31026 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31027 SDValue Z = DAG.getConstant(0, dl, SelVT);
31028 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31029 return DAG.getSelect(dl, SelVT, C, V0, V1);
31030 };
31031
31032 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31033 // We can safely do this using i16 shifts as we're only interested in
31034 // the 3 lower bits of each byte.
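// Shifting the amount left by 5 moves bit 2 of each 3-bit amount into the
// byte's sign bit; each select below tests that bit to conditionally apply a
// shift by 4, then 2, then 1, with 'a += a' exposing the next amount bit.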
31035 Amt = DAG.getBitcast(ExtVT, Amt);
31036 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31037 Amt = DAG.getBitcast(VT, Amt);
31038
31039 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31040 // r = VSELECT(r, shift(r, 4), a);
31041 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31042 R = SignBitSelect(VT, Amt, M, R);
31043
31044 // a += a
31045 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31046
31047 // r = VSELECT(r, shift(r, 2), a);
31048 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31049 R = SignBitSelect(VT, Amt, M, R);
31050
31051 // a += a
31052 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31053
31054 // return VSELECT(r, shift(r, 1), a);
31055 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31056 R = SignBitSelect(VT, Amt, M, R);
31057 return R;
31058 }
31059
31060 if (Opc == ISD::SRA) {
31061 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31062 // so we can correctly sign extend. We don't care what happens to the
31063 // lower byte.
31064 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31065 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31066 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31067 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31068 ALo = DAG.getBitcast(ExtVT, ALo);
31069 AHi = DAG.getBitcast(ExtVT, AHi);
31070 RLo = DAG.getBitcast(ExtVT, RLo);
31071 RHi = DAG.getBitcast(ExtVT, RHi);
31072
31073 // r = VSELECT(r, shift(r, 4), a);
31074 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31075 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31076 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31077 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31078
31079 // a += a
31080 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31081 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31082
31083 // r = VSELECT(r, shift(r, 2), a);
31084 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31085 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31086 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31087 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31088
31089 // a += a
31090 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31091 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31092
31093 // r = VSELECT(r, shift(r, 1), a);
31094 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31095 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31096 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31097 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31098
31099 // Logical shift the result back to the lower byte, leaving a zero upper
31100 // byte meaning that we can safely pack with PACKUSWB.
31101 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31102 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31103 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31104 }
31105 }
31106
31107 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31108 MVT ExtVT = MVT::v8i32;
31109 SDValue Z = DAG.getConstant(0, dl, VT);
31110 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31111 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31112 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31113 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31114 ALo = DAG.getBitcast(ExtVT, ALo);
31115 AHi = DAG.getBitcast(ExtVT, AHi);
31116 RLo = DAG.getBitcast(ExtVT, RLo);
31117 RHi = DAG.getBitcast(ExtVT, RHi);
31118 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31119 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31120 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31121 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31122 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31123 }
31124
31125 if (VT == MVT::v8i16) {
31126 // If we have a constant shift amount, the non-SSE41 path is best as
31127 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
31128 bool UseSSE41 = Subtarget.hasSSE41() &&
31129 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31130
31131 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31132 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31133 // the sign bit.
31134 if (UseSSE41) {
31135 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31136 V0 = DAG.getBitcast(ExtVT, V0);
31137 V1 = DAG.getBitcast(ExtVT, V1);
31138 Sel = DAG.getBitcast(ExtVT, Sel);
31139 return DAG.getBitcast(
31140 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31141 }
31142 // On pre-SSE41 targets we splat the sign bit - a negative value will
31143 // set all bits of the lanes to true and VSELECT uses that in
31144 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31145 SDValue C =
31146 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31147 return DAG.getSelect(dl, VT, C, V0, V1);
31148 };
31149
31150 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31151 if (UseSSE41) {
31152 // On SSE41 targets we need to replicate the shift mask in both
31153 // bytes for PBLENDVB.
31154 Amt = DAG.getNode(
31155 ISD::OR, dl, VT,
31156 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31157 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31158 } else {
31159 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31160 }
31161
31162 // r = VSELECT(r, shift(r, 8), a);
31163 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31164 R = SignBitSelect(Amt, M, R);
31165
31166 // a += a
31167 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31168
31169 // r = VSELECT(r, shift(r, 4), a);
31170 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31171 R = SignBitSelect(Amt, M, R);
31172
31173 // a += a
31174 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31175
31176 // r = VSELECT(r, shift(r, 2), a);
31177 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31178 R = SignBitSelect(Amt, M, R);
31179
31180 // a += a
31181 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31182
31183 // return VSELECT(r, shift(r, 1), a);
31184 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31185 R = SignBitSelect(Amt, M, R);
31186 return R;
31187 }
31188
31189 // Decompose 256-bit shifts into 128-bit shifts.
31190 if (VT.is256BitVector())
31191 return splitVectorIntBinary(Op, DAG, dl);
31192
31193 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31194 return splitVectorIntBinary(Op, DAG, dl);
31195
31196 return SDValue();
31197}
31198
31199 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31200 SelectionDAG &DAG) {
31201 MVT VT = Op.getSimpleValueType();
31202 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31203 "Unexpected funnel shift opcode!");
31204
31205 SDLoc DL(Op);
31206 SDValue Op0 = Op.getOperand(0);
31207 SDValue Op1 = Op.getOperand(1);
31208 SDValue Amt = Op.getOperand(2);
31209 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31210 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31211
31212 if (VT.isVector()) {
31213 APInt APIntShiftAmt;
31214 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31215 unsigned NumElts = VT.getVectorNumElements();
31216
31217 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31218
31219 if (IsCstSplat) {
31220 if (IsFSHR)
31221 std::swap(Op0, Op1);
31222 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31223 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31224 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31225 {Op0, Op1, Imm}, DAG, Subtarget);
31226 }
31227 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31228 {Op0, Op1, Amt}, DAG, Subtarget);
31229 }
31230 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31231 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31232 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31233 "Unexpected funnel shift type!");
31234
31235 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31236 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
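// This works because unpack(y,x) interleaves the operands into wide lanes
// holding (x << bw) | y, so one wide shift followed by taking the high half
// (fshl) or low half (fshr) of each lane produces the funnel-shifted result.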
31237 if (IsCstSplat) {
31238 // TODO: Can't use generic expansion as UNDEF amt elements can be
31239 // converted to other values when folded to shift amounts, losing the
31240 // splat.
31241 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31242 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31243 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31244 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31245 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31246
31247 if (EltSizeInBits == 8 &&
31248 (Subtarget.hasXOP() ||
31249 (useVPTERNLOG(Subtarget, VT) &&
31250 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31251 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31252 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31253 // the original vector width to handle cases where we split.
31254 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31255 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31256 SDValue ShX =
31257 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31258 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31259 SDValue ShY =
31260 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31261 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31262 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31263 DAG.getConstant(MaskX, DL, VT));
31264 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31265 DAG.getConstant(MaskY, DL, VT));
31266 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31267 }
31268
31269 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31270 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31271 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31272 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31273 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31274 }
31275
31276 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31277 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31278 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31279
31280 // Constant vXi16 funnel shifts can be efficiently handled by default.
31281 if (IsCst && EltSizeInBits == 16)
31282 return SDValue();
31283
31284 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31285 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31286 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31287
31288 // Split 256-bit integers on XOP/pre-AVX2 targets.
31289 // Split 512-bit integers on non 512-bit BWI targets.
31290 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31291 !Subtarget.hasAVX2())) ||
31292 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31293 EltSizeInBits < 32)) {
31294 // Pre-mask the amount modulo using the wider vector.
31295 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31296 return splitVectorOp(Op, DAG, DL);
31297 }
31298
31299 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31300 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31301 int ScalarAmtIdx = -1;
31302 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31303 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31304 if (EltSizeInBits == 16)
31305 return SDValue();
31306
31307 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31308 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31309 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31310 ScalarAmtIdx, Subtarget, DAG);
31311 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31312 ScalarAmtIdx, Subtarget, DAG);
31313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31314 }
31315 }
31316
31317 MVT WideSVT = MVT::getIntegerVT(
31318 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31319 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31320
31321 // If per-element shifts are legal, fallback to generic expansion.
31322 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31323 return SDValue();
31324
31325 // Attempt to fold as:
31326 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31327 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31328 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31329 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31330 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31331 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31332 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31333 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31334 EltSizeInBits, DAG);
31335 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31336 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31337 if (!IsFSHR)
31338 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31339 EltSizeInBits, DAG);
31340 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31341 }
31342
31343 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31344 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31345 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31346 SDValue Z = DAG.getConstant(0, DL, VT);
31347 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31348 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31349 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31350 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31351 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31352 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31353 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31354 }
31355
31356 // Fallback to generic expansion.
31357 return SDValue();
31358 }
31359 assert(
31360 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31361 "Unexpected funnel shift type!");
31362
31363 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31364 bool OptForSize = DAG.shouldOptForSize();
31365 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31366
31367 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31368 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
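// E.g. fshl i8 x, y, 4 with x == 0x12 and y == 0x34: the 16-bit
// concatenation 0x1234 shifted left by 4 is 0x12340; the final >> 8 and
// truncation back to i8 leave 0x23 == (0x12 << 4) | (0x34 >> 4).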
31369 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31370 !isa<ConstantSDNode>(Amt)) {
31371 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31372 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31373 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31374 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31375 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31376 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31377 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31378 if (IsFSHR) {
31379 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31380 } else {
31381 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31382 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31383 }
31384 return DAG.getZExtOrTrunc(Res, DL, VT);
31385 }
31386
31387 if (VT == MVT::i8 || ExpandFunnel)
31388 return SDValue();
31389
31390 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31391 if (VT == MVT::i16) {
31392 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31393 DAG.getConstant(15, DL, Amt.getValueType()));
31394 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31395 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31396 }
31397
31398 return Op;
31399}
31400
31401static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31402 SelectionDAG &DAG) {
31403 MVT VT = Op.getSimpleValueType();
31404 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31405
31406 SDLoc DL(Op);
31407 SDValue R = Op.getOperand(0);
31408 SDValue Amt = Op.getOperand(1);
31409 unsigned Opcode = Op.getOpcode();
31410 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31411 int NumElts = VT.getVectorNumElements();
31412 bool IsROTL = Opcode == ISD::ROTL;
31413
31414 // Check for constant splat rotation amount.
31415 APInt CstSplatValue;
31416 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31417
31418 // Check for splat rotate by zero.
31419 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31420 return R;
31421
31422 // AVX512 implicitly uses modulo rotation amounts.
31423 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31424 // Attempt to rotate by immediate.
31425 if (IsCstSplat) {
31426 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31427 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31428 return DAG.getNode(RotOpc, DL, VT, R,
31429 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31430 }
31431
31432 // Else, fall-back on VPROLV/VPRORV.
31433 return Op;
31434 }
31435
31436 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31437 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31438 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31439 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31440 }
31441
31442 SDValue Z = DAG.getConstant(0, DL, VT);
31443
31444 if (!IsROTL) {
31445 // If the ISD::ROTR amount is constant, we're always better converting to
31446 // ISD::ROTL.
31447 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31448 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31449
31450 // XOP targets always prefer ISD::ROTL.
31451 if (Subtarget.hasXOP())
31452 return DAG.getNode(ISD::ROTL, DL, VT, R,
31453 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31454 }
31455
31456 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31457 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31458 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31459 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31460 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31461 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31462 DAG.getTargetConstant(0, DL, MVT::i8));
31463 }
31464
31465 // Split 256-bit integers on XOP/pre-AVX2 targets.
31466 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31467 return splitVectorIntBinary(Op, DAG, DL);
31468
31469 // XOP has 128-bit vector variable + immediate rotates.
31470 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31471 // XOP implicitly uses modulo rotation amounts.
31472 if (Subtarget.hasXOP()) {
31473 assert(IsROTL && "Only ROTL expected");
31474 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31475
31476 // Attempt to rotate by immediate.
31477 if (IsCstSplat) {
31478 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31479 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31480 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31481 }
31482
31483 // Use general rotate by variable (per-element).
31484 return Op;
31485 }
31486
31487 // Rotate by a uniform constant - expand back to shifts.
31488 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31489 // to other values when folded to shift amounts, losing the splat.
31490 if (IsCstSplat) {
31491 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31492 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31493 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31494 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31495 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31496 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31497 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31498 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31499 }
31500
31501 // Split 512-bit integers on non 512-bit BWI targets.
31502 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31503 return splitVectorIntBinary(Op, DAG, DL);
31504
31505 assert(
31506 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31507 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31508 Subtarget.hasAVX2()) ||
31509 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31510 "Only vXi32/vXi16/vXi8 vector rotates supported");
31511
31512 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31513 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31514
31515 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31516 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31517
31518 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31519 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31520 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
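// unpack(x,x) doubles each element to (x << bw) | x, so a single wide shift
// leaves the rotated value in one half of every widened lane.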
31521 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31522 int BaseRotAmtIdx = -1;
31523 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31524 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31525 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31526 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31527 }
31528 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31529 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31530 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31531 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31532 BaseRotAmtIdx, Subtarget, DAG);
31533 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31534 BaseRotAmtIdx, Subtarget, DAG);
31535 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31536 }
31537 }
31538
31539 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31540 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31541
31542 // Attempt to fold as unpack(x,x) << zext(y):
31543 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31544 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31545 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31546 if (!(ConstantAmt && EltSizeInBits != 8) &&
31547 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31548 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31549 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31550 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31551 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31552 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31553 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31554 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31555 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31556 }
31557
31558 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31559 // the amount bit.
31560 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31561 if (EltSizeInBits == 8) {
31562 MVT WideVT =
31563 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31564
31565 // Attempt to fold as:
31566 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31567 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31568 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31569 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31570 // If we're rotating by constant, just use default promotion.
31571 if (ConstantAmt)
31572 return SDValue();
31573 // See if we can perform this by widening to vXi16 or vXi32.
31574 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31575 R = DAG.getNode(
31576 ISD::OR, DL, WideVT, R,
31577 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31578 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31579 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31580 if (IsROTL)
31581 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31582 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31583 }
31584
31585 // We don't need ModuloAmt here as we just peek at individual bits.
31586 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31587 if (Subtarget.hasSSE41()) {
31588 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31589 // on the sign bit.
31590 V0 = DAG.getBitcast(VT, V0);
31591 V1 = DAG.getBitcast(VT, V1);
31592 Sel = DAG.getBitcast(VT, Sel);
31593 return DAG.getBitcast(SelVT,
31594 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31595 }
31596 // On pre-SSE41 targets we test for the sign bit by comparing to
31597 // zero - a negative value will set all bits of the lanes to true
31598 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31599 SDValue Z = DAG.getConstant(0, DL, SelVT);
31600 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31601 return DAG.getSelect(DL, SelVT, C, V0, V1);
31602 };
31603
31604 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31605 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31606 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31607 IsROTL = true;
31608 }
31609
31610 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31611 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31612
31613 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31614 // We can safely do this using i16 shifts as we're only interested in
31615 // the 3 lower bits of each byte.
31616 Amt = DAG.getBitcast(ExtVT, Amt);
31617 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31618 Amt = DAG.getBitcast(VT, Amt);
31619
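// After the SHL by 5, bit 2 of each byte's rotate amount sits in that byte's
// sign bit; each subsequent `a += a` promotes the next lower amount bit into
// the sign bit for the following select stage.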
31620 // r = VSELECT(r, rot(r, 4), a);
31621 SDValue M;
31622 M = DAG.getNode(
31623 ISD::OR, DL, VT,
31624 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31625 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31626 R = SignBitSelect(VT, Amt, M, R);
31627
31628 // a += a
31629 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31630
31631 // r = VSELECT(r, rot(r, 2), a);
31632 M = DAG.getNode(
31633 ISD::OR, DL, VT,
31634 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31635 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31636 R = SignBitSelect(VT, Amt, M, R);
31637
31638 // a += a
31639 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31640
31641 // return VSELECT(r, rot(r, 1), a);
31642 M = DAG.getNode(
31643 ISD::OR, DL, VT,
31644 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31645 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31646 return SignBitSelect(VT, Amt, M, R);
31647 }
31648
31649 bool IsSplatAmt = DAG.isSplatValue(Amt);
31650 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31651 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31652
31653 // Fallback for splats + all supported variable shifts.
31654 // Fallback for non-constant AVX2 vXi16 as well.
31655 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31656 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31657 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31658 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31659 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31660 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31661 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31662 }
31663
31664 // Everything below assumes ISD::ROTL.
31665 if (!IsROTL) {
31666 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31667 IsROTL = true;
31668 }
31669
31670 // ISD::ROT* uses modulo rotate amounts.
31671 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31672
31673 assert(IsROTL && "Only ROTL supported");
31674
31675 // As with shifts, attempt to convert the rotation amount to a multiplication
31676 // factor, fallback to general expansion.
31677 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31678 if (!Scale)
31679 return SDValue();
31680
31681 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
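// rotl(x,c) == (x * 2^c) | ((x *u 2^c) >> 16): ISD::MUL yields the low 16
// bits (the left-shifted part) and ISD::MULHU the wrapped-around high bits.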
31682 if (EltSizeInBits == 16) {
31683 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31684 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31685 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31686 }
31687
31688 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31689 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31690 // that can then be OR'd with the lower 32-bits.
31691 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31692 static const int OddMask[] = {1, 1, 3, 3};
31693 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31694 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31695
31696 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31697 DAG.getBitcast(MVT::v2i64, R),
31698 DAG.getBitcast(MVT::v2i64, Scale));
31699 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31700 DAG.getBitcast(MVT::v2i64, R13),
31701 DAG.getBitcast(MVT::v2i64, Scale13));
31702 Res02 = DAG.getBitcast(VT, Res02);
31703 Res13 = DAG.getBitcast(VT, Res13);
31704
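// As v4i32: Res02 = {lo0,hi0,lo2,hi2} and Res13 = {lo1,hi1,lo3,hi3}, so the
// {0,4,2,6} shuffle below gathers the low (shifted) halves and {1,5,3,7} the
// high (wrapped) halves before they are OR'd together.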
31705 return DAG.getNode(ISD::OR, DL, VT,
31706 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31707 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31708}
31709
31710/// Returns true if the operand type is exactly twice the native width, and
31711/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31712/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31713/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31714bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31715 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31716
31717 if (OpWidth == 64)
31718 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31719 if (OpWidth == 128)
31720 return Subtarget.canUseCMPXCHG16B();
31721
31722 return false;
31723}
31724
31725 TargetLowering::AtomicExpansionKind
31726 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31727 Type *MemType = SI->getValueOperand()->getType();
31728 
31729 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31730 !Subtarget.useSoftFloat()) {
31731 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31732 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31733 return AtomicExpansionKind::None;
31734 
31735 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31736 Subtarget.hasAVX())
31737 return AtomicExpansionKind::None;
31738 }
31739 
31740 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31741 : AtomicExpansionKind::None;
31742 }
31743
31744 // Note: this turns large loads into lock cmpxchg8b/16b.
31745 TargetLowering::AtomicExpansionKind
31746 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31747 Type *MemType = LI->getType();
31748 
31749 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31750 !Subtarget.useSoftFloat()) {
31751 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled,
31752 // we can use movq to do the load. If we have X87 we can load into an 80-bit
31753 // X87 register and store it to a stack temporary.
31754 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31755 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31756 return AtomicExpansionKind::None;
31757 
31758 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31759 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31760 Subtarget.hasAVX())
31761 return AtomicExpansionKind::None;
31762 }
31763 
31764 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31765 : AtomicExpansionKind::None;
31766 }
31767
31768 enum BitTestKind : unsigned {
31769 UndefBit,
31770 ConstantBit,
31771 NotConstantBit,
31772 ShiftBit,
31773 NotShiftBit
31774 };
31775 
31776static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31777 using namespace llvm::PatternMatch;
31778 BitTestKind BTK = UndefBit;
31779 if (auto *C = dyn_cast<ConstantInt>(V)) {
31780 // Check if V is a power of 2 or NOT power of 2.
31781 if (isPowerOf2_64(C->getZExtValue()))
31782 BTK = ConstantBit;
31783 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31784 BTK = NotConstantBit;
31785 return {V, BTK};
31786 }
31787
31788 // Check if V is some power of 2 pattern known to be non-zero
31789 if (auto *I = dyn_cast<Instruction>(V)) {
31790 bool Not = false;
31791 // Check if we have a NOT
31792 Value *PeekI;
31793 if (match(I, m_Not(m_Value(PeekI))) ||
31794 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31795 Not = true;
31796 I = dyn_cast<Instruction>(PeekI);
31797
31798 // If I is constant, it will fold and we can evaluate later. If it's an
31799 // argument or something of that nature, we can't analyze.
31800 if (I == nullptr)
31801 return {nullptr, UndefBit};
31802 }
31803 // We can only use 1 << X without more sophisticated analysis. C << X where
31804 // C is a power of 2 but not 1 can result in zero which cannot be translated
31805 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31806 if (I->getOpcode() == Instruction::Shl) {
31807 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31808 // -X` and some other provable power of 2 patterns that we can use CTZ on
31809 // may be profitable.
31810 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31811 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31812 // be provably a non-zero power of 2.
31813 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31814 // transformable to bittest.
31815 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31816 if (!ShiftVal)
31817 return {nullptr, UndefBit};
31818 if (ShiftVal->equalsInt(1))
31819 BTK = Not ? NotShiftBit : ShiftBit;
31820
31821 if (BTK == UndefBit)
31822 return {nullptr, UndefBit};
31823
31824 Value *BitV = I->getOperand(1);
31825
31826 // Read past a shiftmask instruction to find count
31827 Value *AndOp;
31828 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31829 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31830 BitV = AndOp;
31831
31832 return {BitV, BTK};
31833 }
31834 }
31835 return {nullptr, UndefBit};
31836}
31837
31838 TargetLowering::AtomicExpansionKind
31839 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31840 using namespace llvm::PatternMatch;
31841 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31842 // prefix to a normal instruction for these operations.
31843 if (AI->use_empty())
31844 return AtomicExpansionKind::None;
31845 
31846 if (AI->getOperation() == AtomicRMWInst::Xor) {
31847 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31848 // preferable to both `cmpxchg` and `btc`.
31849 if (match(AI->getOperand(1), m_SignMask()))
31850 return AtomicExpansionKind::None;
31851 }
31852
31853 // If the atomicrmw's result is used by a single bit AND, we may use
31854 // bts/btr/btc instruction for these operations.
31855 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31856 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31857 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31858 // detect it.
31859 Instruction *I = AI->user_back();
31860 auto BitChange = FindSingleBitChange(AI->getValOperand());
31861 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31862 I->getOpcode() != Instruction::And ||
31863 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31864 AI->getParent() != I->getParent())
31865 return AtomicExpansionKind::CmpXChg;
31866 
31867 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31868
31869 // This is a redundant AND, it should get cleaned up elsewhere.
31870 if (AI == I->getOperand(OtherIdx))
31871 return AtomicExpansionKind::CmpXChg;
31872 
31873 // The following instruction must be an AND with a single bit.
31874 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31875 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31876 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31877 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31878 return AtomicExpansionKind::CmpXChg;
31879 }
31880 if (AI->getOperation() == AtomicRMWInst::And) {
31881 return ~C1->getValue() == C2->getValue()
31882 ? AtomicExpansionKind::BitTestIntrinsic
31883 : AtomicExpansionKind::CmpXChg;
31884 }
31885 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31886 : AtomicExpansionKind::CmpXChg;
31887 }
31888
31889 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31890
31891 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31892 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31893 return AtomicExpansionKind::CmpXChg;
31894 
31895 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31896
31897 // If shift amounts are not the same we can't use BitTestIntrinsic.
31898 if (BitChange.first != BitTested.first)
31899 return AtomicExpansionKind::CmpXChg;
31900 
31901 // For atomic AND, the rmw mask must clear exactly one bit and the AND must
31902 // test the bit that is unset in that mask.
31903 if (AI->getOperation() == AtomicRMWInst::And)
31904 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31905 ? AtomicExpansionKind::BitTestIntrinsic
31906 : AtomicExpansionKind::CmpXChg;
31907 
31908 // For atomic XOR/OR, the rmw op must set and the AND must test the same bit.
31909 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31910 ? AtomicExpansionKind::BitTestIntrinsic
31911 : AtomicExpansionKind::CmpXChg;
31912 }
31913
31914void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31915 IRBuilder<> Builder(AI);
31916 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31917 Intrinsic::ID IID_C;
31918 Intrinsic::ID IID_I;
31919 switch (AI->getOperation()) {
31920 default:
31921 llvm_unreachable("Unknown atomic operation");
31922 case AtomicRMWInst::Or:
31923 IID_C = Intrinsic::x86_atomic_bts;
31924 IID_I = Intrinsic::x86_atomic_bts_rm;
31925 break;
31926 case AtomicRMWInst::Xor:
31927 IID_C = Intrinsic::x86_atomic_btc;
31928 IID_I = Intrinsic::x86_atomic_btc_rm;
31929 break;
31930 case AtomicRMWInst::And:
31931 IID_C = Intrinsic::x86_atomic_btr;
31932 IID_I = Intrinsic::x86_atomic_btr_rm;
31933 break;
31934 }
31935 Instruction *I = AI->user_back();
31936 LLVMContext &Ctx = AI->getContext();
31937 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31938 PointerType::getUnqual(Ctx));
31939 Value *Result = nullptr;
31940 auto BitTested = FindSingleBitChange(AI->getValOperand());
31941 assert(BitTested.first != nullptr);
31942
31943 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31944 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31945
31946 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31947 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31948 {Addr, Builder.getInt8(Imm)});
31949 } else {
31950 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31951
31952 Value *SI = BitTested.first;
31953 assert(SI != nullptr);
31954
31955 // BT{S|R|C} on memory operands don't modulo the bit position so we need to
31956 // mask it.
31957 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31958 Value *BitPos =
31959 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31960 // Todo(1): In many cases it may be provable that SI is less than
31961 // ShiftBits in which case this mask is unnecessary
31962 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31963 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31964 // favor of just a raw BT{S|R|C}.
31965
31966 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31967 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31968
31969 // If the result is only used for zero/non-zero status then we don't need to
31970 // shift the value back. Otherwise do so.
31971 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31972 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31973 if (ICmp->isEquality()) {
31974 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31975 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31976 if (C0 || C1) {
31977 assert(C0 == nullptr || C1 == nullptr);
31978 if ((C0 ? C0 : C1)->isZero())
31979 continue;
31980 }
31981 }
31982 }
31983 Result = Builder.CreateShl(Result, BitPos);
31984 break;
31985 }
31986 }
31987
31988 I->replaceAllUsesWith(Result);
31989 I->eraseFromParent();
31990 AI->eraseFromParent();
31991}
31992
31993 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31994 using namespace llvm::PatternMatch;
31995 if (!AI->hasOneUse())
31996 return false;
31997
31998 Value *Op = AI->getOperand(1);
31999 CmpPredicate Pred;
32000 Instruction *I = AI->user_back();
32001 AtomicRMWInst::BinOp Opc = AI->getOperation();
32002 if (Opc == AtomicRMWInst::Add) {
32003 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32004 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32005 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32006 if (match(I->user_back(),
32008 return true;
32009 if (match(I->user_back(),
32011 return true;
32012 }
32013 return false;
32014 }
32015 if (Opc == AtomicRMWInst::Sub) {
32016 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32017 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32018 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32019 if (match(I->user_back(),
32021 return true;
32022 if (match(I->user_back(),
32024 return true;
32025 }
32026 return false;
32027 }
32028 if ((Opc == AtomicRMWInst::Or &&
32029 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32030 (Opc == AtomicRMWInst::And &&
32031 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32032 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32033 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32034 Pred == CmpInst::ICMP_SLT;
32035 if (match(I->user_back(),
32037 return true;
32038 return false;
32039 }
32040 if (Opc == AtomicRMWInst::Xor) {
32041 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32042 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32043 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32044 if (match(I->user_back(),
32046 return true;
32047 if (match(I->user_back(),
32049 return true;
32050 }
32051 return false;
32052 }
32053
32054 return false;
32055}
32056
32057void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32058 AtomicRMWInst *AI) const {
32059 IRBuilder<> Builder(AI);
32060 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32061 Instruction *TempI = nullptr;
32062 LLVMContext &Ctx = AI->getContext();
32063 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32064 if (!ICI) {
32065 TempI = AI->user_back();
32066 assert(TempI->hasOneUse() && "Must have one use");
32067 ICI = cast<ICmpInst>(TempI->user_back());
32068 }
32069 X86::CondCode CC;
32070 ICmpInst::Predicate Pred = ICI->getPredicate();
32071 switch (Pred) {
32072 default:
32073 llvm_unreachable("Not supported Pred");
32074 case CmpInst::ICMP_EQ:
32075 CC = X86::COND_E;
32076 break;
32077 case CmpInst::ICMP_NE:
32078 CC = X86::COND_NE;
32079 break;
32080 case CmpInst::ICMP_SLT:
32081 CC = X86::COND_S;
32082 break;
32083 case CmpInst::ICMP_SGT:
32084 CC = X86::COND_NS;
32085 break;
32086 }
32087 Intrinsic::ID IID;
32088 switch (AI->getOperation()) {
32089 default:
32090 llvm_unreachable("Unknown atomic operation");
32091 case AtomicRMWInst::Add:
32092 IID = Intrinsic::x86_atomic_add_cc;
32093 break;
32094 case AtomicRMWInst::Sub:
32095 IID = Intrinsic::x86_atomic_sub_cc;
32096 break;
32097 case AtomicRMWInst::Or:
32098 IID = Intrinsic::x86_atomic_or_cc;
32099 break;
32100 case AtomicRMWInst::And:
32101 IID = Intrinsic::x86_atomic_and_cc;
32102 break;
32103 case AtomicRMWInst::Xor:
32104 IID = Intrinsic::x86_atomic_xor_cc;
32105 break;
32106 }
32107 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32108 PointerType::getUnqual(Ctx));
32109 Value *Call = Builder.CreateIntrinsic(
32110 IID, AI->getType(),
32111 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32112 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32113 ICI->replaceAllUsesWith(Result);
32114 ICI->eraseFromParent();
32115 if (TempI)
32116 TempI->eraseFromParent();
32117 AI->eraseFromParent();
32118}
32119
32120 TargetLowering::AtomicExpansionKind
32121 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32122 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32123 Type *MemType = AI->getType();
32124
32125 // If the operand is too big, we must see if cmpxchg8/16b is available
32126 // and default to library calls otherwise.
32127 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32128 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32129 : AtomicExpansionKind::None;
32130 }
32131 
32132 AtomicRMWInst::BinOp Op = AI->getOperation();
32133 switch (Op) {
32134 case AtomicRMWInst::Xchg:
32135 return AtomicExpansionKind::None;
32136 case AtomicRMWInst::Add:
32137 case AtomicRMWInst::Sub:
32138 if (shouldExpandCmpArithRMWInIR(AI))
32139 return AtomicExpansionKind::CmpArithIntrinsic;
32140 // It's better to use xadd, xsub or xchg for these in other cases.
32141 return AtomicExpansionKind::None;
32142 case AtomicRMWInst::Or:
32143 case AtomicRMWInst::And:
32144 case AtomicRMWInst::Xor:
32145 if (shouldExpandCmpArithRMWInIR(AI))
32146 return AtomicExpansionKind::CmpArithIntrinsic;
32147 return shouldExpandLogicAtomicRMWInIR(AI);
32148 case AtomicRMWInst::Nand:
32149 case AtomicRMWInst::Max:
32150 case AtomicRMWInst::Min:
32161 default:
32162 // These always require a non-trivial set of data operations on x86. We must
32163 // use a cmpxchg loop.
32164 return AtomicExpansionKind::CmpXChg;
32165 }
32166}
32167
32168LoadInst *
32169X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32170 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32171 Type *MemType = AI->getType();
32172 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32173 // there is no benefit in turning such RMWs into loads, and it is actually
32174 // harmful as it introduces a mfence.
32175 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32176 return nullptr;
32177
32178 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32179 // lowering available in lowerAtomicArith.
32180 // TODO: push more cases through this path.
32181 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32182 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32183 AI->use_empty())
32184 return nullptr;
32185
32186 IRBuilder<> Builder(AI);
32187 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32188 auto SSID = AI->getSyncScopeID();
32189 // We must restrict the ordering to avoid generating loads with Release or
32190 // ReleaseAcquire orderings.
32191 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32192 
32193 // Before the load we need a fence. Here is an example lifted from
32194 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32195 // is required:
32196 // Thread 0:
32197 // x.store(1, relaxed);
32198 // r1 = y.fetch_add(0, release);
32199 // Thread 1:
32200 // y.fetch_add(42, acquire);
32201 // r2 = x.load(relaxed);
32202 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32203 // lowered to just a load without a fence. A mfence flushes the store buffer,
32204 // making the optimization clearly correct.
32205 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32206 // otherwise, we might be able to be more aggressive on relaxed idempotent
32207 // rmw. In practice, they do not look useful, so we don't try to be
32208 // especially clever.
32209
32210 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32211 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32212 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32213
32214 // Finally we can emit the atomic load.
32215 LoadInst *Loaded = Builder.CreateAlignedLoad(
32216 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32217 Loaded->setAtomic(Order, SSID);
32218 AI->replaceAllUsesWith(Loaded);
32219 AI->eraseFromParent();
32220 return Loaded;
32221}
32222
32223/// Emit a locked operation on a stack location which does not change any
32224/// memory location, but does involve a lock prefix. Location is chosen to be
32225/// a) very likely accessed only by a single thread to minimize cache traffic,
32226/// and b) definitely dereferenceable. Returns the new Chain result.
32227 static SDValue emitLockedStackOp(SelectionDAG &DAG,
32228 const X86Subtarget &Subtarget, SDValue Chain,
32229 const SDLoc &DL) {
32230 // Implementation notes:
32231 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32232 // operations issued by the current processor. As such, the location
32233 // referenced is not relevant for the ordering properties of the instruction.
32234 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32235 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32236 // 2) Using an immediate operand appears to be the best encoding choice
32237 // here since it doesn't require an extra register.
32238 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32239 // is small enough it might just be measurement noise.)
32240 // 4) When choosing offsets, there are several contributing factors:
32241 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32242 // line aligned stack object to improve this case.)
32243 // b) To minimize our chances of introducing a false dependence, we prefer
32244 // to offset the stack usage from TOS slightly.
32245 // c) To minimize concerns about cross thread stack usage - in particular,
32246 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32247 // captures state in the TOS frame and accesses it from many threads -
32248 // we want to use an offset such that the offset is in a distinct cache
32249 // line from the TOS frame.
32250 //
32251 // For a general discussion of the tradeoffs and benchmark results, see:
32252 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32253
32254 auto &MF = DAG.getMachineFunction();
32255 auto &TFL = *Subtarget.getFrameLowering();
32256 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32257
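// The machine node built below encodes `lock or dword ptr [rsp + SPOffset], 0`
// (or [esp + SPOffset] in 32-bit mode): an idempotent locked RMW whose only
// purpose is its full memory-barrier semantics.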
32258 if (Subtarget.is64Bit()) {
32259 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32260 SDValue Ops[] = {
32261 DAG.getRegister(X86::RSP, MVT::i64), // Base
32262 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32263 DAG.getRegister(0, MVT::i64), // Index
32264 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32265 DAG.getRegister(0, MVT::i16), // Segment.
32266 Zero,
32267 Chain};
32268 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32269 MVT::Other, Ops);
32270 return SDValue(Res, 1);
32271 }
32272
32273 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32274 SDValue Ops[] = {
32275 DAG.getRegister(X86::ESP, MVT::i32), // Base
32276 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32277 DAG.getRegister(0, MVT::i32), // Index
32278 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32279 DAG.getRegister(0, MVT::i16), // Segment.
32280 Zero,
32281 Chain
32282 };
32283 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32284 MVT::Other, Ops);
32285 return SDValue(Res, 1);
32286}
32287
32288 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32289 SelectionDAG &DAG) {
32290 SDLoc dl(Op);
32291 AtomicOrdering FenceOrdering =
32292 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32293 SyncScope::ID FenceSSID =
32294 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32295
32296 // The only fence that needs an instruction is a sequentially-consistent
32297 // cross-thread fence.
32298 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32299 FenceSSID == SyncScope::System) {
32300 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32301 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32302
32303 SDValue Chain = Op.getOperand(0);
32304 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32305 }
32306
32307 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32308 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32309}
32310
32311 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32312 SelectionDAG &DAG) {
32313 MVT T = Op.getSimpleValueType();
32314 SDLoc DL(Op);
32315 unsigned Reg = 0;
32316 unsigned size = 0;
32317 switch(T.SimpleTy) {
32318 default: llvm_unreachable("Invalid value type!");
32319 case MVT::i8: Reg = X86::AL; size = 1; break;
32320 case MVT::i16: Reg = X86::AX; size = 2; break;
32321 case MVT::i32: Reg = X86::EAX; size = 4; break;
32322 case MVT::i64:
32323 assert(Subtarget.is64Bit() && "Node not type legal!");
32324 Reg = X86::RAX; size = 8;
32325 break;
32326 }
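// The expected value is pinned to the accumulator (AL/AX/EAX/RAX) as the
// CMPXCHG instruction requires, and success is recovered below from EFLAGS.ZF
// via a SETE-style getSETCC(X86::COND_E, ...).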
32327 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32328 Op.getOperand(2), SDValue());
32329 SDValue Ops[] = { cpIn.getValue(0),
32330 Op.getOperand(1),
32331 Op.getOperand(3),
32332 DAG.getTargetConstant(size, DL, MVT::i8),
32333 cpIn.getValue(1) };
32334 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32335 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32336 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32337 Ops, T, MMO);
32338
32339 SDValue cpOut =
32340 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32341 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32342 MVT::i32, cpOut.getValue(2));
32343 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32344
32345 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32346 cpOut, Success, EFLAGS.getValue(1));
32347}
32348
32349// Create MOVMSKB, taking into account whether we need to split for AVX1.
32350 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32351 const X86Subtarget &Subtarget) {
32352 MVT InVT = V.getSimpleValueType();
32353
32354 if (InVT == MVT::v64i8) {
32355 SDValue Lo, Hi;
32356 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32357 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32358 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32359 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32360 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32361 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32362 DAG.getConstant(32, DL, MVT::i8));
32363 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32364 }
32365 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32366 SDValue Lo, Hi;
32367 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32368 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32369 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32370 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32371 DAG.getConstant(16, DL, MVT::i8));
32372 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32373 }
32374
32375 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32376}
32377
32378static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32379 SelectionDAG &DAG) {
32380 SDValue Src = Op.getOperand(0);
32381 MVT SrcVT = Src.getSimpleValueType();
32382 MVT DstVT = Op.getSimpleValueType();
32383
32384 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32385 // half to v32i1 and concatenating the result.
32386 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32387 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32388 assert(Subtarget.hasBWI() && "Expected BWI target");
32389 SDLoc dl(Op);
32390 SDValue Lo, Hi;
32391 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32392 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32393 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32394 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32395 }
32396
32397 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32398 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32399 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32400 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32401 SDLoc DL(Op);
32402 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32403 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32404 return DAG.getZExtOrTrunc(V, DL, DstVT);
32405 }
32406
32407 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32408 SrcVT == MVT::i64) && "Unexpected VT!");
32409
32410 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32411 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32412 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32413 // This conversion needs to be expanded.
32414 return SDValue();
32415
32416 SDLoc dl(Op);
32417 if (SrcVT.isVector()) {
32418 // Widen the input vector in the case of MVT::v2i32.
32419 // Example: from MVT::v2i32 to MVT::v4i32.
32420 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32421 SrcVT.getVectorNumElements() * 2);
32422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32423 DAG.getUNDEF(SrcVT));
32424 } else {
32425 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32426 "Unexpected source type in LowerBITCAST");
32427 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32428 }
32429
32430 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32431 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32432
32433 if (DstVT == MVT::x86mmx)
32434 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32435
32436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32437 DAG.getVectorIdxConstant(0, dl));
32438}
32439
32440/// Compute the horizontal sum of bytes in V for the elements of VT.
32441///
32442/// Requires V to be a byte vector and VT to be an integer vector type with
32443/// wider elements than V's type. The width of the elements of VT determines
32444/// how many bytes of V are summed horizontally to produce each element of the
32445/// result.
32446 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32447 const X86Subtarget &Subtarget,
32448 SelectionDAG &DAG) {
32449 SDLoc DL(V);
32450 MVT ByteVecVT = V.getSimpleValueType();
32451 MVT EltVT = VT.getVectorElementType();
32452 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32453 "Expected value to have byte element type.");
32454 assert(EltVT != MVT::i8 &&
32455 "Horizontal byte sum only makes sense for wider elements!");
32456 unsigned VecSize = VT.getSizeInBits();
32457 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32458
32459 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32460 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32461 if (EltVT == MVT::i64) {
32462 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32463 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32464 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32465 return DAG.getBitcast(VT, V);
32466 }
32467
32468 if (EltVT == MVT::i32) {
32469 // We unpack the low half and high half into i32s interleaved with zeros so
32470 // that we can use PSADBW to horizontally sum them. The most useful part of
32471 // this is that it lines up the results of two PSADBW instructions to be
32472 // two v2i64 vectors which concatenated are the 4 population counts. We can
32473 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32474 SDValue Zeros = DAG.getConstant(0, DL, VT);
32475 SDValue V32 = DAG.getBitcast(VT, V);
32476 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32477 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32478
32479 // Do the horizontal sums into two v2i64s.
32480 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32481 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32482 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32483 DAG.getBitcast(ByteVecVT, Low), Zeros);
32484 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32485 DAG.getBitcast(ByteVecVT, High), Zeros);
32486
32487 // Merge them together.
32488 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32489 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32490 DAG.getBitcast(ShortVecVT, Low),
32491 DAG.getBitcast(ShortVecVT, High));
32492
32493 return DAG.getBitcast(VT, V);
32494 }
32495
32496 // The only element type left is i16.
32497 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32498
32499 // To obtain pop count for each i16 element starting from the pop count for
32500 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32501 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32502 // directly supported.
32503 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32504 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32505 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32506 DAG.getBitcast(ByteVecVT, V));
32507 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32508}
32509
32510 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32511 const X86Subtarget &Subtarget,
32512 SelectionDAG &DAG) {
32513 MVT VT = Op.getSimpleValueType();
32514 MVT EltVT = VT.getVectorElementType();
32515 int NumElts = VT.getVectorNumElements();
32516 (void)EltVT;
32517 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32518
32519 // Implement a lookup table in register by using an algorithm based on:
32520 // http://wm.ite.pl/articles/sse-popcount.html
32521 //
32522 // The general idea is that every lower byte nibble in the input vector is an
32523 // index into an in-register pre-computed pop count table. We then split up the
32524 // input vector in two new ones: (1) a vector with only the shifted-right
32525 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32526 // masked out higher ones) for each byte. PSHUFB is used separately with both
32527 // to index the in-register table. Next, both are added and the result is an
32528 // i8 vector where each element contains the pop count for its input byte.
32529 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32530 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32531 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32532 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32533
32534 SmallVector<SDValue, 64> LUTVec;
32535 for (int i = 0; i < NumElts; ++i)
32536 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32537 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32538 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32539
32540 // High nibbles
32541 SDValue FourV = DAG.getConstant(4, DL, VT);
32542 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32543
32544 // Low nibbles
32545 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32546
32547 // The input vector is used as the shuffle mask that index elements into the
32548 // LUT. After counting low and high nibbles, add the vector to obtain the
32549 // final pop count per i8 element.
32550 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32551 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32552 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32553}
32554
32555// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32556// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32557 static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32558 const X86Subtarget &Subtarget,
32559 SelectionDAG &DAG) {
32560 MVT VT = Op.getSimpleValueType();
32561 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32562 "Unknown CTPOP type to handle");
32563 SDValue Op0 = Op.getOperand(0);
32564
32565 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32566 if (Subtarget.hasVPOPCNTDQ()) {
32567 unsigned NumElems = VT.getVectorNumElements();
32568 assert((VT.getVectorElementType() == MVT::i8 ||
32569 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32570 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32571 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32572 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32573 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32574 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32575 }
32576 }
32577
32578 // Decompose 256-bit ops into smaller 128-bit ops.
32579 if (VT.is256BitVector() && !Subtarget.hasInt256())
32580 return splitVectorIntUnary(Op, DAG, DL);
32581
32582 // Decompose 512-bit ops into smaller 256-bit ops.
32583 if (VT.is512BitVector() && !Subtarget.hasBWI())
32584 return splitVectorIntUnary(Op, DAG, DL);
32585
32586 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32587 if (VT.getScalarType() != MVT::i8) {
32588 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32589 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32590 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32591 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32592 }
32593
32594 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32595 if (!Subtarget.hasSSSE3())
32596 return SDValue();
32597
32598 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32599}
32600
32601static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32602 SelectionDAG &DAG) {
32603 MVT VT = N.getSimpleValueType();
32604 SDValue Op = N.getOperand(0);
32605 SDLoc DL(N);
32606
32607 if (VT.isScalarInteger()) {
32608 // Compute the lower/upper bounds of the active bits of the value,
32609 // allowing us to shift the active bits down if necessary to fit into the
32610 // special cases below.
32611 KnownBits Known = DAG.computeKnownBits(Op);
32612 if (Known.isConstant())
32613 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32614 unsigned LZ = Known.countMinLeadingZeros();
32615 unsigned TZ = Known.countMinTrailingZeros();
32616 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32617 unsigned ActiveBits = Known.getBitWidth() - LZ;
32618 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32619
32620 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32621 if (ShiftedActiveBits <= 2) {
32622 if (ActiveBits > 2)
32623 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32624 DAG.getShiftAmountConstant(TZ, VT, DL));
32625 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32626 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32627 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32628 DAG.getShiftAmountConstant(1, VT, DL)));
32629 return DAG.getZExtOrTrunc(Op, DL, VT);
32630 }
32631
32632 // i3 CTPOP - perform LUT into i32 integer.
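// 0b1110100110010100 packs popcount(7)..popcount(0) into 2-bit fields
// (3,2,2,1,2,1,1,0); the SHL by 1 forms the field index 2*x and the final
// AND with 0x3 extracts the count.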
32633 if (ShiftedActiveBits <= 3) {
32634 if (ActiveBits > 3)
32635 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32636 DAG.getShiftAmountConstant(TZ, VT, DL));
32637 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32638 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32639 DAG.getShiftAmountConstant(1, VT, DL));
32640 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32641 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32642 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32643 DAG.getConstant(0x3, DL, MVT::i32));
32644 return DAG.getZExtOrTrunc(Op, DL, VT);
32645 }
32646
32647 // i4 CTPOP - perform LUT into i64 integer.
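// 0x4332322132212110 packs popcount(15)..popcount(0) into 4-bit nibbles; the
// MUL by 4 forms the nibble shift amount and the AND with 0x7 extracts the
// count (the maximum value 4 fits in 3 bits).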
32648 if (ShiftedActiveBits <= 4 &&
32649 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32650 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32651 if (ActiveBits > 4)
32652 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32653 DAG.getShiftAmountConstant(TZ, VT, DL));
32654 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32655 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32656 DAG.getConstant(4, DL, MVT::i32));
32657 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32658 DAG.getShiftAmountOperand(MVT::i64, Op));
32659 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32660 DAG.getConstant(0x7, DL, MVT::i64));
32661 return DAG.getZExtOrTrunc(Op, DL, VT);
32662 }
32663
32664 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
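// Multiplying by 0x08040201 places non-overlapping copies of the byte at bit
// offsets 0/9/18/27; after the SRL by 3 and the AND with 0x11111111 each
// nibble's low bit holds one source bit, and the second multiply by
// 0x11111111 sums those nibbles into bits 31:28, extracted by the SRL by 28.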
32665 if (ShiftedActiveBits <= 8) {
32666 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32667 if (ActiveBits > 8)
32668 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32669 DAG.getShiftAmountConstant(TZ, VT, DL));
32670 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32671 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32672 DAG.getConstant(0x08040201U, DL, MVT::i32));
32673 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32674 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32675 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32676 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32677 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32678 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32679 return DAG.getZExtOrTrunc(Op, DL, VT);
32680 }
32681
32682 return SDValue(); // fallback to generic expansion.
32683 }
32684
32685 assert(VT.isVector() &&
32686 "We only do custom lowering for vector population count.");
32687 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32688}
32689
32690 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32691 MVT VT = Op.getSimpleValueType();
32692 SDValue In = Op.getOperand(0);
32693 SDLoc DL(Op);
32694
32695 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32696 // perform the BITREVERSE.
32697 if (!VT.isVector()) {
32698 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32699 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32700 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32701 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32702 DAG.getVectorIdxConstant(0, DL));
32703 }
32704
32705 int NumElts = VT.getVectorNumElements();
32706 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32707
32708 // Decompose 256-bit ops into smaller 128-bit ops.
32709 if (VT.is256BitVector())
32710 return splitVectorIntUnary(Op, DAG, DL);
32711
32712 assert(VT.is128BitVector() &&
32713 "Only 128-bit vector bitreverse lowering supported.");
32714
32715 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32716 // perform the BSWAP in the shuffle.
32717 // It's best to shuffle using the second operand as this will implicitly allow
32718 // memory folding for multiple vectors.
32719 SmallVector<SDValue, 16> MaskElts;
32720 for (int i = 0; i != NumElts; ++i) {
32721 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32722 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32723 int PermuteByte = SourceByte | (2 << 5);
32724 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32725 }
32726 }
32727
32728 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32729 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32730 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32731 Res, Mask);
32732 return DAG.getBitcast(VT, Res);
32733}
32734
32735 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32736 SelectionDAG &DAG) {
32737 MVT VT = Op.getSimpleValueType();
32738
32739 if (Subtarget.hasXOP() && !VT.is512BitVector())
32740 return LowerBITREVERSE_XOP(Op, DAG);
32741
32742 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32743 "SSSE3 or GFNI required for BITREVERSE");
32744
32745 SDValue In = Op.getOperand(0);
32746 SDLoc DL(Op);
32747
32748 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32749 if (VT.is512BitVector() && !Subtarget.hasBWI())
32750 return splitVectorIntUnary(Op, DAG, DL);
32751
32752 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32753 if (VT.is256BitVector() && !Subtarget.hasInt256())
32754 return splitVectorIntUnary(Op, DAG, DL);
32755
32756 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32757 if (!VT.isVector()) {
32758 assert(
32759 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32760 "Only tested for i8/i16/i32/i64");
32761 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32762 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32763 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32764 DAG.getBitcast(MVT::v16i8, Res));
32765 Res =
32766 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32767 DAG.getVectorIdxConstant(0, DL));
32768 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32769 }
32770
32771 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32772
32773 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32774 if (VT.getScalarType() != MVT::i8) {
32775 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32776 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32777 Res = DAG.getBitcast(ByteVT, Res);
32778 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32779 return DAG.getBitcast(VT, Res);
32780 }
32781 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32782 "Only byte vector BITREVERSE supported");
32783
32784 unsigned NumElts = VT.getVectorNumElements();
32785
32786 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32787 if (Subtarget.hasGFNI()) {
32788 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32789 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32790 DAG.getTargetConstant(0, DL, MVT::i8));
32791 }
32792
32793 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32794 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32795 // 0-15 value (moved to the other nibble).
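// LoLUT[x] holds bitreverse(x) placed in the high nibble and HiLUT[x] holds
// bitreverse(x) in the low nibble, so OR'ing the two lookups yields the fully
// reversed byte.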
32796 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32797 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32798 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32799
32800 const int LoLUT[16] = {
32801 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32802 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32803 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32804 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32805 const int HiLUT[16] = {
32806 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32807 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32808 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32809 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32810
32811 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32812 for (unsigned i = 0; i < NumElts; ++i) {
32813 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32814 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32815 }
32816
32817 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32818 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32819 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32820 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32821 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32822}
32823
32824static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32825 SelectionDAG &DAG) {
32826 SDLoc DL(Op);
32827 SDValue X = Op.getOperand(0);
32828 MVT VT = Op.getSimpleValueType();
32829
32830 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32831   if (VT == MVT::i8 ||
32832       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32833     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32834 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32835 DAG.getConstant(0, DL, MVT::i8));
32836 // Copy the inverse of the parity flag into a register with setcc.
32837 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32838 // Extend to the original type.
32839 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32840 }
32841
32842 // If we have POPCNT, use the default expansion.
32843 if (Subtarget.hasPOPCNT())
32844 return SDValue();
32845
32846 if (VT == MVT::i64) {
32847     // Xor the high and low 32 bits together using a 32-bit operation.
32848 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32849 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32850 DAG.getConstant(32, DL, MVT::i8)));
32851 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32852 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32853 }
32854
32855 if (VT != MVT::i16) {
32856 // Xor the high and low 16-bits together using a 32-bit operation.
32857 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32858 DAG.getConstant(16, DL, MVT::i8));
32859 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32860 } else {
32861 // If the input is 16-bits, we need to extend to use an i32 shift below.
32862 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32863 }
32864
32865   // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
32866 // This should allow an h-reg to be used to save a shift.
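  // Sketch of the typical selection for an i32 input without POPCNT
  // (illustrative only; exact register allocation differs):
  //   movl %edi, %eax; shrl $16, %eax; xorl %edi, %eax   // fold upper half
  //   xorb %ah, %al;   setnp %al;      movzbl %al, %eax  // parity of low byte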
32867 SDValue Hi = DAG.getNode(
32868 ISD::TRUNCATE, DL, MVT::i8,
32869 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32870 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32871 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32872 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32873
32874 // Copy the inverse of the parity flag into a register with setcc.
32875 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32876 // Extend to the original type.
32877 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32878}
32879
32880 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32881                                         const X86Subtarget &Subtarget) {
32882 unsigned NewOpc = 0;
32883 switch (N->getOpcode()) {
32884 case ISD::ATOMIC_LOAD_ADD:
32885 NewOpc = X86ISD::LADD;
32886 break;
32887 case ISD::ATOMIC_LOAD_SUB:
32888 NewOpc = X86ISD::LSUB;
32889 break;
32890 case ISD::ATOMIC_LOAD_OR:
32891 NewOpc = X86ISD::LOR;
32892 break;
32893 case ISD::ATOMIC_LOAD_XOR:
32894 NewOpc = X86ISD::LXOR;
32895 break;
32896 case ISD::ATOMIC_LOAD_AND:
32897 NewOpc = X86ISD::LAND;
32898 break;
32899 default:
32900 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32901 }
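  // These memory-intrinsic nodes select to the memory-destination LOCK forms,
  // e.g. an 'atomicrmw or' whose result is unused can become something like
  // 'lock orl %esi, (%rdi)' instead of a CMPXCHG loop (illustrative).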
32902
32903 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32904
32905 return DAG.getMemIntrinsicNode(
32906 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32907 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32908 /*MemVT=*/N->getSimpleValueType(0), MMO);
32909}
32910
32911/// Lower atomic_load_ops into LOCK-prefixed operations.
32912 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32913                                 const X86Subtarget &Subtarget) {
32914 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32915 SDValue Chain = N->getOperand(0);
32916 SDValue LHS = N->getOperand(1);
32917 SDValue RHS = N->getOperand(2);
32918 unsigned Opc = N->getOpcode();
32919 MVT VT = N->getSimpleValueType(0);
32920 SDLoc DL(N);
32921
32922 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32923 // can only be lowered when the result is unused. They should have already
32924 // been transformed into a cmpxchg loop in AtomicExpand.
32925 if (N->hasAnyUseOfValue(0)) {
32926 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32927 // select LXADD if LOCK_SUB can't be selected.
32928 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32929 // can use LXADD as opposed to cmpxchg.
32930 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32931 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32932 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32933 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32934
32935 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32936 "Used AtomicRMW ops other than Add should have been expanded!");
32937 return N;
32938 }
32939
32940   // Specialized lowering for the canonical form of an idempotent atomicrmw.
32941 // The core idea here is that since the memory location isn't actually
32942 // changing, all we need is a lowering for the *ordering* impacts of the
32943   // atomicrmw. As such, we can choose a different operation and memory
32944 // location to minimize impact on other code.
32945 // The above holds unless the node is marked volatile in which
32946 // case it needs to be preserved according to the langref.
32947 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32948 // On X86, the only ordering which actually requires an instruction is
32949 // seq_cst which isn't SingleThread, everything just needs to be preserved
32950     // during codegen and then dropped. Note that we expect (but don't assume)
32951 // that orderings other than seq_cst and acq_rel have been canonicalized to
32952 // a store or load.
32953     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32954         AN->getSyncScopeID() == SyncScope::System) {
32955       // Prefer a locked operation against a stack location to minimize cache
32956 // traffic. This assumes that stack locations are very likely to be
32957 // accessed only by the owning thread.
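      // In practice this emits an idempotent 'lock or' of zero against a
      // location on the current thread's stack, purely for its fencing effect
      // (illustrative; see emitLockedStackOp below).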
32958 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32959 assert(!N->hasAnyUseOfValue(0));
32960 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32961 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32962 DAG.getUNDEF(VT), NewChain);
32963 }
32964 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32965 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32966 assert(!N->hasAnyUseOfValue(0));
32967 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32968 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32969 DAG.getUNDEF(VT), NewChain);
32970 }
32971
32972 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32973 // RAUW the chain, but don't worry about the result, as it's unused.
32974 assert(!N->hasAnyUseOfValue(0));
32975 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32976 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32977 DAG.getUNDEF(VT), LockOp.getValue(1));
32978}
32979
32980 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32981                                  const X86Subtarget &Subtarget) {
32982 auto *Node = cast<AtomicSDNode>(Op.getNode());
32983 SDLoc dl(Node);
32984 EVT VT = Node->getMemoryVT();
32985
32986 bool IsSeqCst =
32987 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32988 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32989
32990 // If this store is not sequentially consistent and the type is legal
32991 // we can just keep it.
32992 if (!IsSeqCst && IsTypeLegal)
32993 return Op;
32994
32995 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32996       !DAG.getMachineFunction().getFunction().hasFnAttribute(
32997           Attribute::NoImplicitFloat)) {
32998 SDValue Chain;
32999 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33000 // vector store.
33001 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33002 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33003 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33004 Node->getMemOperand());
33005 }
33006
33007 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33008 // is enabled.
33009 if (VT == MVT::i64) {
33010 if (Subtarget.hasSSE1()) {
33011 SDValue SclToVec =
33012 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33013 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33014 SclToVec = DAG.getBitcast(StVT, SclToVec);
33015 SDVTList Tys = DAG.getVTList(MVT::Other);
33016 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33017 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33018 MVT::i64, Node->getMemOperand());
33019 } else if (Subtarget.hasX87()) {
33020 // First load this into an 80-bit X87 register using a stack temporary.
33021 // This will put the whole integer into the significand.
33022 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33023 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33024 MachinePointerInfo MPI =
33025             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33026         Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33027                              MPI, MaybeAlign(), MachineMemOperand::MOStore);
33028         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33029         SDValue LdOps[] = {Chain, StackPtr};
33030         SDValue Value = DAG.getMemIntrinsicNode(
33031             X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33032 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33033 Chain = Value.getValue(1);
33034
33035 // Now use an FIST to do the atomic store.
33036 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33037 Chain =
33038 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33039 StoreOps, MVT::i64, Node->getMemOperand());
33040 }
33041 }
33042
33043 if (Chain) {
33044 // If this is a sequentially consistent store, also emit an appropriate
33045 // barrier.
33046 if (IsSeqCst)
33047 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33048
33049 return Chain;
33050 }
33051 }
33052
33053 // Convert seq_cst store -> xchg
33054 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33055 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
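  // E.g. a seq_cst i32 atomic store becomes an XCHG (which is implicitly
  // locked), while an i64 store on a 32-bit target without SSE/x87 support is
  // expanded from the ATOMIC_SWAP below into a CMPXCHG8B loop (illustrative).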
33056 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33057 Node->getOperand(0), Node->getOperand(2),
33058 Node->getOperand(1), Node->getMemOperand());
33059 return Swap.getValue(1);
33060}
33061
33062 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33063   SDNode *N = Op.getNode();
33064 MVT VT = N->getSimpleValueType(0);
33065 unsigned Opc = Op.getOpcode();
33066
33067 // Let legalize expand this if it isn't a legal type yet.
33068 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33069 return SDValue();
33070
33071 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33072 SDLoc DL(N);
33073
33074 // Set the carry flag.
33075 SDValue Carry = Op.getOperand(2);
33076 EVT CarryVT = Carry.getValueType();
33077 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33078 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33079
33080 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33081 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33082 Op.getOperand(0), Op.getOperand(1),
33083 Carry.getValue(1));
33084
33085 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33086 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33087 Sum.getValue(1), DL, DAG);
33088 if (N->getValueType(1) == MVT::i1)
33089 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33090
33091 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33092}
33093
33094static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33095 SelectionDAG &DAG) {
33096 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33097
33098 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33099 // which returns the values as { float, float } (in XMM0) or
33100 // { double, double } (which is returned in XMM0, XMM1).
33101 SDLoc dl(Op);
33102 SDValue Arg = Op.getOperand(0);
33103 EVT ArgVT = Arg.getValueType();
33104 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33105
33106   TargetLowering::ArgListTy Args;
33107   Args.emplace_back(Arg, ArgTy);
33108
33109 bool isF64 = ArgVT == MVT::f64;
33110 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33111 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33112 // the results are returned via SRet in memory.
33113 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33114 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33115 const char *LibcallName = TLI.getLibcallName(LC);
33116 SDValue Callee =
33117 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33118
33119 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33120 : (Type *)FixedVectorType::get(ArgTy, 4);
33121
33122   TargetLowering::CallLoweringInfo CLI(DAG);
33123   CLI.setDebugLoc(dl)
33124 .setChain(DAG.getEntryNode())
33125 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33126
33127 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33128
33129 if (isF64)
33130 // Returned in xmm0 and xmm1.
33131 return CallResult.first;
33132
33133   // Returned in bits 0:31 and 32:63 of xmm0.
33134 SDValue SinVal =
33135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33136 DAG.getVectorIdxConstant(0, dl));
33137 SDValue CosVal =
33138 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33139 DAG.getVectorIdxConstant(1, dl));
33140 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33141 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33142}
33143
33144/// Widen a vector input to a vector of NVT. The
33145/// input vector must have the same element type as NVT.
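/// E.g. widening a v2i32 value to v8i32 inserts it at index 0 of an undef (or
/// zero-filled, when FillWithZeroes is set) v8i32 vector (illustrative).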
33146 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33147                             bool FillWithZeroes = false) {
33148 // Check if InOp already has the right width.
33149 MVT InVT = InOp.getSimpleValueType();
33150 if (InVT == NVT)
33151 return InOp;
33152
33153 if (InOp.isUndef())
33154 return DAG.getUNDEF(NVT);
33155
33156   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33157          "input and widen element type must match");
33158
33159 unsigned InNumElts = InVT.getVectorNumElements();
33160 unsigned WidenNumElts = NVT.getVectorNumElements();
33161 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33162 "Unexpected request for vector widening");
33163
33164 SDLoc dl(InOp);
33165 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33166 SDValue N1 = InOp.getOperand(1);
33167 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33168 N1.isUndef()) {
33169 InOp = InOp.getOperand(0);
33170 InVT = InOp.getSimpleValueType();
33171 InNumElts = InVT.getVectorNumElements();
33172 }
33173 }
33174   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33175       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33176     EVT EltVT = InOp.getOperand(0).getValueType();
33177     SDValue FillVal =
33178         FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33179     SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33180     Ops.append(WidenNumElts - InNumElts, FillVal);
33181 return DAG.getBuildVector(NVT, dl, Ops);
33182 }
33183 SDValue FillVal =
33184 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33185 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33186 DAG.getVectorIdxConstant(0, dl));
33187}
33188
33189 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33190                              SelectionDAG &DAG) {
33191 assert(Subtarget.hasAVX512() &&
33192 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33193
33194   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33195   SDValue Src = N->getValue();
33196 MVT VT = Src.getSimpleValueType();
33197 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33198 SDLoc dl(Op);
33199
33200 SDValue Scale = N->getScale();
33201 SDValue Index = N->getIndex();
33202 SDValue Mask = N->getMask();
33203 SDValue Chain = N->getChain();
33204 SDValue BasePtr = N->getBasePtr();
33205
33206 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33207 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33208 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33209 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33210 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33211 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33212 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33213 SDVTList VTs = DAG.getVTList(MVT::Other);
33214 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33215 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33216 N->getMemoryVT(), N->getMemOperand());
33217 }
33218 return SDValue();
33219 }
33220
33221 MVT IndexVT = Index.getSimpleValueType();
33222
33223 // If the index is v2i32, we're being called by type legalization and we
33224 // should just let the default handling take care of it.
33225 if (IndexVT == MVT::v2i32)
33226 return SDValue();
33227
33228   // If we don't have VLX and neither the data nor the index is 512 bits, we
33229 // need to widen until one is.
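  // E.g. v4f32 data with a v4i64 index widens by a factor of 2 to v8f32 data,
  // a v8i64 index and a v8i1 mask, so that the index vector reaches 512 bits
  // (illustrative).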
33230 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33231 !Index.getSimpleValueType().is512BitVector()) {
33232 // Determine how much we need to widen by to get a 512-bit type.
33233 unsigned Factor = std::min(512/VT.getSizeInBits(),
33234 512/IndexVT.getSizeInBits());
33235 unsigned NumElts = VT.getVectorNumElements() * Factor;
33236
33237 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33238 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33239 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33240
33241 Src = ExtendToType(Src, VT, DAG);
33242 Index = ExtendToType(Index, IndexVT, DAG);
33243 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33244 }
33245
33246 SDVTList VTs = DAG.getVTList(MVT::Other);
33247 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33248 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33249 N->getMemoryVT(), N->getMemOperand());
33250}
33251
33252static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33253 SelectionDAG &DAG) {
33254
33255   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33256   MVT VT = Op.getSimpleValueType();
33257 MVT ScalarVT = VT.getScalarType();
33258 SDValue Mask = N->getMask();
33259 MVT MaskVT = Mask.getSimpleValueType();
33260 SDValue PassThru = N->getPassThru();
33261 SDLoc dl(Op);
33262
33263 // Handle AVX masked loads which don't support passthru other than 0.
33264 if (MaskVT.getVectorElementType() != MVT::i1) {
33265 // We also allow undef in the isel pattern.
33266 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33267 return Op;
33268
33269 SDValue NewLoad = DAG.getMaskedLoad(
33270 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33271 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33272 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33273 N->isExpandingLoad());
33274 // Emit a blend.
33275 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33276 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33277 }
33278
33279 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33280 "Expanding masked load is supported on AVX-512 target only!");
33281
33282 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33283 "Expanding masked load is supported for 32 and 64-bit types only!");
33284
33285 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33286 "Cannot lower masked load op.");
33287
33288 assert((ScalarVT.getSizeInBits() >= 32 ||
33289 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33290 ScalarVT == MVT::f16))) &&
33291 "Unsupported masked load op.");
33292
33293 // This operation is legal for targets with VLX, but without
33294   // VLX the vector should be widened to 512 bits.
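  // E.g. a v8i32 masked load on AVX-512 without VLX is widened to a v16i32
  // load with a v16i1 mask, and the low v8i32 half is extracted afterwards
  // (illustrative).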
33295 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33296 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33297 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33298
33299 // Mask element has to be i1.
33300 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33301 "Unexpected mask type");
33302
33303 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33304
33305 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33306 SDValue NewLoad = DAG.getMaskedLoad(
33307 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33308 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33309 N->getExtensionType(), N->isExpandingLoad());
33310
33311 SDValue Extract =
33312 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33313 DAG.getVectorIdxConstant(0, dl));
33314 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33315 return DAG.getMergeValues(RetOps, dl);
33316}
33317
33318static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33319 SelectionDAG &DAG) {
33320   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33321   SDValue DataToStore = N->getValue();
33322 MVT VT = DataToStore.getSimpleValueType();
33323 MVT ScalarVT = VT.getScalarType();
33324 SDValue Mask = N->getMask();
33325 SDLoc dl(Op);
33326
33327 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33328          "Compressing masked store is supported on AVX-512 target only!");
33329
33330 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33331          "Compressing masked store is supported for 32 and 64-bit types only!");
33332
33333 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33334 "Cannot lower masked store op.");
33335
33336 assert((ScalarVT.getSizeInBits() >= 32 ||
33337 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33338 ScalarVT == MVT::f16))) &&
33339 "Unsupported masked store op.");
33340
33341 // This operation is legal for targets with VLX, but without
33342   // VLX the vector should be widened to 512 bits.
33343   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33344 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33345
33346 // Mask element has to be i1.
33347 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33348 "Unexpected mask type");
33349
33350 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33351
33352 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33353 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33354 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33355 N->getOffset(), Mask, N->getMemoryVT(),
33356 N->getMemOperand(), N->getAddressingMode(),
33357 N->isTruncatingStore(), N->isCompressingStore());
33358}
33359
33360static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33361 SelectionDAG &DAG) {
33362 assert(Subtarget.hasAVX2() &&
33363 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33364
33365   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33366   SDLoc dl(Op);
33367 MVT VT = Op.getSimpleValueType();
33368 SDValue Index = N->getIndex();
33369 SDValue Mask = N->getMask();
33370 SDValue PassThru = N->getPassThru();
33371 MVT IndexVT = Index.getSimpleValueType();
33372
33373 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33374
33375 // If the index is v2i32, we're being called by type legalization.
33376 if (IndexVT == MVT::v2i32)
33377 return SDValue();
33378
33379   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33380 // need to widen until one is.
33381 MVT OrigVT = VT;
33382 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33383 !IndexVT.is512BitVector()) {
33384 // Determine how much we need to widen by to get a 512-bit type.
33385 unsigned Factor = std::min(512/VT.getSizeInBits(),
33386 512/IndexVT.getSizeInBits());
33387
33388 unsigned NumElts = VT.getVectorNumElements() * Factor;
33389
33390 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33391 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33392 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33393
33394 PassThru = ExtendToType(PassThru, VT, DAG);
33395 Index = ExtendToType(Index, IndexVT, DAG);
33396 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33397 }
33398
33399 // Break dependency on the data register.
33400 if (PassThru.isUndef())
33401 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33402
33403 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33404 N->getScale() };
33405 SDValue NewGather = DAG.getMemIntrinsicNode(
33406 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33407 N->getMemOperand());
33408 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33409 DAG.getVectorIdxConstant(0, dl));
33410 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33411}
33412
33413 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33414   SDLoc dl(Op);
33415 SDValue Src = Op.getOperand(0);
33416 MVT DstVT = Op.getSimpleValueType();
33417
33418   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33419   unsigned SrcAS = N->getSrcAddressSpace();
33420
33421 assert(SrcAS != N->getDestAddressSpace() &&
33422 "addrspacecast must be between different address spaces");
33423
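  // E.g. a cast from the unsigned 32-bit PTR32_UPTR address space to a 64-bit
  // pointer zero-extends, other 32-to-64-bit casts sign-extend, and
  // 64-to-32-bit casts truncate (illustrative summary of the cases below).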
33424 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33425 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33426 } else if (DstVT == MVT::i64) {
33427 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33428 } else if (DstVT == MVT::i32) {
33429 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33430 } else {
33431 report_fatal_error("Bad address space in addrspacecast");
33432 }
33433 return Op;
33434}
33435
33436SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33437 SelectionDAG &DAG) const {
33438 // TODO: Eventually, the lowering of these nodes should be informed by or
33439 // deferred to the GC strategy for the function in which they appear. For
33440 // now, however, they must be lowered to something. Since they are logically
33441 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33442 // require special handling for these nodes), lower them as literal NOOPs for
33443 // the time being.
33444   SmallVector<SDValue, 2> Ops;
33445   Ops.push_back(Op.getOperand(0));
33446 if (Op->getGluedNode())
33447 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33448
33449 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33450 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33451}
33452
33453// Custom split CVTPS2PH with wide types.
33454 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33455   SDLoc dl(Op);
33456 EVT VT = Op.getValueType();
33457 SDValue Lo, Hi;
33458 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33459 EVT LoVT, HiVT;
33460 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33461 SDValue RC = Op.getOperand(1);
33462 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33463 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33465}
33466
33467 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33468                              SelectionDAG &DAG) {
33469 unsigned IsData = Op.getConstantOperandVal(4);
33470
33471 // We don't support non-data prefetch without PREFETCHI.
33472 // Just preserve the chain.
33473 if (!IsData && !Subtarget.hasPREFETCHI())
33474 return Op.getOperand(0);
33475
33476 return Op;
33477}
33478
33479 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33480   SDNode *N = Op.getNode();
33481 SDValue Operand = N->getOperand(0);
33482 EVT VT = Operand.getValueType();
33483 SDLoc dl(N);
33484
33485 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33486
33487   // TODO: Fix crash for bf16 when generating strict_fmul, as it
33488   // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33489 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33490 // promote this operator's result!
33491 SDValue Chain = DAG.getEntryNode();
33492 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33493 {Chain, Operand, One});
33494 return StrictFmul;
33495}
33496
33497 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33498                                      unsigned OpNo) {
33499 const APInt Operand(32, OpNo);
33500 std::string OpNoStr = llvm::toString(Operand, 10, false);
33501 std::string Str(" $");
33502
33503 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33504 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33505
33506 auto I = StringRef::npos;
33507 for (auto &AsmStr : AsmStrs) {
33508     // Match the OpNo string. We must match exactly to avoid matching a
33509     // sub-string, e.g. "$12" contains "$1".
33510 if (AsmStr.ends_with(OpNoStr1))
33511 I = AsmStr.size() - OpNoStr1.size();
33512
33513 // Get the index of operand in AsmStr.
33514 if (I == StringRef::npos)
33515 I = AsmStr.find(OpNoStr1 + ",");
33516 if (I == StringRef::npos)
33517 I = AsmStr.find(OpNoStr2);
33518
33519 if (I == StringRef::npos)
33520 continue;
33521
33522 assert(I > 0 && "Unexpected inline asm string!");
33523     // Remove the operand string and label (if it exists).
33524 // For example:
33525 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33526 // ==>
33527 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33528 // ==>
33529 // "call dword ptr "
33530 auto TmpStr = AsmStr.substr(0, I);
33531 I = TmpStr.rfind(':');
33532 if (I != StringRef::npos)
33533 TmpStr = TmpStr.substr(I + 1);
33534 return TmpStr.take_while(llvm::isAlpha);
33535 }
33536
33537 return StringRef();
33538}
33539
33540 bool X86TargetLowering::isInlineAsmTargetBranch(
33541     const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33542 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33543 // changed from indirect TargetLowering::C_Memory to direct
33544 // TargetLowering::C_Address.
33545 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33546 // location.
33547 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33548 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33549}
33550
33551 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33552                                       SDValue Mask) {
33553 EVT Ty = MVT::i8;
33554 auto V = DAG.getBitcast(MVT::i1, Mask);
33555 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33556 auto Zero = DAG.getConstant(0, DL, Ty);
33557 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33558 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33559 return SDValue(CmpZero.getNode(), 1);
33560}
33561
33562 SDValue X86TargetLowering::visitMaskedLoad(
33563     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33564 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33565 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33566 // ->
33567 // _, flags = SUB 0, mask
33568 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33569 // bit_cast_to_vector<res>
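  // i.e. a single-element masked load becomes a conditional scalar load that
  // does not execute (and returns the passthru) when the mask bit is clear,
  // predicated on the flags produced by testing the mask (illustrative).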
33570 EVT VTy = PassThru.getValueType();
33571 EVT Ty = VTy.getVectorElementType();
33572 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33573 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33574 : DAG.getBitcast(Ty, PassThru);
33575 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33576 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33577 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33578 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33579 return DAG.getBitcast(VTy, NewLoad);
33580}
33581
33582 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33583                                             SDValue Chain,
33584                                             MachineMemOperand *MMO, SDValue Ptr,
33585                                             SDValue Val, SDValue Mask) const {
33586 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33587 // ->
33588 // _, flags = SUB 0, mask
33589 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33590   EVT Ty = Val.getValueType().getVectorElementType();
33591   SDVTList Tys = DAG.getVTList(MVT::Other);
33592 auto ScalarVal = DAG.getBitcast(Ty, Val);
33593 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33594 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33595 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33596 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33597}
33598
33599/// Provide custom lowering hooks for some operations.
33600 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33601   switch (Op.getOpcode()) {
33602 // clang-format off
33603 default: llvm_unreachable("Should not custom lower this!");
33604 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33605 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33606 return LowerCMP_SWAP(Op, Subtarget, DAG);
33607 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33608 case ISD::ATOMIC_LOAD_ADD:
33609 case ISD::ATOMIC_LOAD_SUB:
33610 case ISD::ATOMIC_LOAD_OR:
33611 case ISD::ATOMIC_LOAD_XOR:
33612 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33613 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33614 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33615 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33616 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33617 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33618 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33619 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33620 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33621 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33622 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33623 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33624 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33625 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33626 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33627 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33628 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33629 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33630 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33631 case ISD::SHL_PARTS:
33632 case ISD::SRA_PARTS:
33633 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33634 case ISD::FSHL:
33635 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33636 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33637   case ISD::STRICT_SINT_TO_FP:
33638   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33639   case ISD::STRICT_UINT_TO_FP:
33640   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33641 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33642 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33643 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33644 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33645   case ISD::ZERO_EXTEND_VECTOR_INREG:
33646   case ISD::SIGN_EXTEND_VECTOR_INREG:
33647     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33648 case ISD::FP_TO_SINT:
33649   case ISD::STRICT_FP_TO_SINT:
33650   case ISD::FP_TO_UINT:
33651 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33652   case ISD::FP_TO_SINT_SAT:
33653   case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33654 case ISD::FP_EXTEND:
33655 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33656 case ISD::FP_ROUND:
33657 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33658 case ISD::FP16_TO_FP:
33659 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33660 case ISD::FP_TO_FP16:
33661 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33662 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33663 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33664 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33665 case ISD::FADD:
33666 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33667 case ISD::FROUND: return LowerFROUND(Op, DAG);
33668 case ISD::FABS:
33669 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33670 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33671 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33672 case ISD::LRINT:
33673 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33674 case ISD::SETCC:
33675 case ISD::STRICT_FSETCC:
33676 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33677 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33678 case ISD::SELECT: return LowerSELECT(Op, DAG);
33679 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33680 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33681 case ISD::VASTART: return LowerVASTART(Op, DAG);
33682 case ISD::VAARG: return LowerVAARG(Op, DAG);
33683 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33684 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33685   case ISD::INTRINSIC_VOID:
33686   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33687 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33688 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33689 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33690   case ISD::FRAME_TO_ARGS_OFFSET:
33691     return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33692 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33693 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33694 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33695 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33696   case ISD::EH_SJLJ_SETUP_DISPATCH:
33697     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33698 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33699 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33700 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33701 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33702 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33703 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33704 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33705 case ISD::CTLZ:
33706 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33707 case ISD::CTTZ:
33708 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33709 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33710 case ISD::MULHS:
33711 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33712 case ISD::ROTL:
33713 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33714 case ISD::SRA:
33715 case ISD::SRL:
33716 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33717 case ISD::SADDO:
33718 case ISD::UADDO:
33719 case ISD::SSUBO:
33720 case ISD::USUBO: return LowerXALUO(Op, DAG);
33721 case ISD::SMULO:
33722 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33723 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33724 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33725 case ISD::SADDO_CARRY:
33726 case ISD::SSUBO_CARRY:
33727 case ISD::UADDO_CARRY:
33728 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33729 case ISD::ADD:
33730 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33731 case ISD::UADDSAT:
33732 case ISD::SADDSAT:
33733 case ISD::USUBSAT:
33734 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33735 case ISD::SMAX:
33736 case ISD::SMIN:
33737 case ISD::UMAX:
33738 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33739 case ISD::FMINIMUM:
33740 case ISD::FMAXIMUM:
33741 case ISD::FMINIMUMNUM:
33742 case ISD::FMAXIMUMNUM:
33743 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33744 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33745 case ISD::ABDS:
33746 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33747 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33748 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33749 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33750 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33751 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33752 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33753 case ISD::GC_TRANSITION_START:
33754 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33755 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33756 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33757 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33758 // clang-format on
33759 }
33760}
33761
33762/// Replace a node with an illegal result type with a new node built out of
33763/// custom code.
33764 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33765                                            SmallVectorImpl<SDValue> &Results,
33766                                            SelectionDAG &DAG) const {
33767 SDLoc dl(N);
33768 unsigned Opc = N->getOpcode();
33769 switch (Opc) {
33770 default:
33771#ifndef NDEBUG
33772 dbgs() << "ReplaceNodeResults: ";
33773 N->dump(&DAG);
33774#endif
33775 llvm_unreachable("Do not know how to custom type legalize this operation!");
33776 case X86ISD::CVTPH2PS: {
33777 EVT VT = N->getValueType(0);
33778 SDValue Lo, Hi;
33779 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33780 EVT LoVT, HiVT;
33781 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33782 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33783 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33784 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33785 Results.push_back(Res);
33786 return;
33787 }
33788   case X86ISD::STRICT_CVTPH2PS: {
33789     EVT VT = N->getValueType(0);
33790 SDValue Lo, Hi;
33791 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33792 EVT LoVT, HiVT;
33793 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33794 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33795 {N->getOperand(0), Lo});
33796 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33797 {N->getOperand(0), Hi});
33798 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33799 Lo.getValue(1), Hi.getValue(1));
33800 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33801 Results.push_back(Res);
33802 Results.push_back(Chain);
33803 return;
33804 }
33805 case X86ISD::CVTPS2PH:
33806 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33807 return;
33808 case ISD::CTPOP: {
33809 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33810 // If we have at most 32 active bits, then perform as i32 CTPOP.
33811 // TODO: Perform this in generic legalizer?
33812 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33813 unsigned LZ = Known.countMinLeadingZeros();
33814 unsigned TZ = Known.countMinTrailingZeros();
33815 if ((LZ + TZ) >= 32) {
33816 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33817 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33818 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33819 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33820 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33821 Results.push_back(Op);
33822 return;
33823 }
33824 // Use a v2i64 if possible.
33825 bool NoImplicitFloatOps =
33826         DAG.getMachineFunction().getFunction().hasFnAttribute(
33827             Attribute::NoImplicitFloat);
33828 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33829 SDValue Wide =
33830 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33831 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33832 // Bit count should fit in 32-bits, extract it as that and then zero
33833 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33834 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33835 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33836 DAG.getVectorIdxConstant(0, dl));
33837 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33838 Results.push_back(Wide);
33839 }
33840 return;
33841 }
33842 case ISD::MUL: {
33843 EVT VT = N->getValueType(0);
33844     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33845            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33846 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33847 // elements are needed.
33848 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33849 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33850 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33851 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33852 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33853 unsigned NumConcats = 16 / VT.getVectorNumElements();
33854 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33855 ConcatOps[0] = Res;
33856 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33857 Results.push_back(Res);
33858 return;
33859 }
33860 case ISD::SMULO:
33861 case ISD::UMULO: {
33862 EVT VT = N->getValueType(0);
33863     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33864            VT == MVT::v2i32 && "Unexpected VT!");
33865 bool IsSigned = Opc == ISD::SMULO;
33866 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33867 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33868 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33869 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33870 // Extract the high 32 bits from each result using PSHUFD.
33871 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33872 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33873 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33874 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33875 DAG.getVectorIdxConstant(0, dl));
33876
33877 // Truncate the low bits of the result. This will become PSHUFD.
33878 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33879
33880 SDValue HiCmp;
33881 if (IsSigned) {
33882 // SMULO overflows if the high bits don't match the sign of the low.
33883 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33884 } else {
33885 // UMULO overflows if the high bits are non-zero.
33886 HiCmp = DAG.getConstant(0, dl, VT);
33887 }
33888 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33889
33890     // Widen the result by padding with undef.
33891 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33892 DAG.getUNDEF(VT));
33893 Results.push_back(Res);
33894 Results.push_back(Ovf);
33895 return;
33896 }
33897 case X86ISD::VPMADDWD: {
33898 // Legalize types for X86ISD::VPMADDWD by widening.
33899 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33900
33901 EVT VT = N->getValueType(0);
33902 EVT InVT = N->getOperand(0).getValueType();
33903 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33904 "Expected a VT that divides into 128 bits.");
33905     assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33906            "Unexpected type action!");
33907 unsigned NumConcat = 128 / InVT.getSizeInBits();
33908
33909 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33910 InVT.getVectorElementType(),
33911 NumConcat * InVT.getVectorNumElements());
33912 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33913                                   VT.getVectorElementType(),
33914                                   NumConcat * VT.getVectorNumElements());
33915
33916 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33917 Ops[0] = N->getOperand(0);
33918 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33919 Ops[0] = N->getOperand(1);
33920 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33921
33922 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33923 Results.push_back(Res);
33924 return;
33925 }
33926 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33927 case X86ISD::FMINC:
33928 case X86ISD::FMIN:
33929 case X86ISD::FMAXC:
33930 case X86ISD::FMAX:
33931   case X86ISD::STRICT_FMIN:
33932   case X86ISD::STRICT_FMAX: {
33933 EVT VT = N->getValueType(0);
33934 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33935 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33936 SDValue UNDEF = DAG.getUNDEF(VT);
33937 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33938 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33939 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33940 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33941 SDValue Res;
33942 if (IsStrict)
33943 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33944 {N->getOperand(0), LHS, RHS});
33945 else
33946 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33947 Results.push_back(Res);
33948 if (IsStrict)
33949 Results.push_back(Res.getValue(1));
33950 return;
33951 }
33952 case ISD::SDIV:
33953 case ISD::UDIV:
33954 case ISD::SREM:
33955 case ISD::UREM: {
33956 EVT VT = N->getValueType(0);
33957 if (VT.isVector()) {
33958       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33959              "Unexpected type action!");
33960       // If the RHS is a constant splat vector, we can widen this and let the
33961       // division/remainder-by-constant optimization handle it.
33962 // TODO: Can we do something for non-splat?
33963 APInt SplatVal;
33964 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33965 unsigned NumConcats = 128 / VT.getSizeInBits();
33966 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33967 Ops0[0] = N->getOperand(0);
33968 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33969 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33970 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33971 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33972 Results.push_back(Res);
33973 }
33974 return;
33975 }
33976
33977 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33978 Results.push_back(V);
33979 return;
33980 }
33981 case ISD::TRUNCATE: {
33982 MVT VT = N->getSimpleValueType(0);
33983 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33984 return;
33985
33986 // The generic legalizer will try to widen the input type to the same
33987     // number of elements as the widened result type. But that isn't always
33988     // the best choice, so do some custom legalization here to avoid some cases.
33989 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33990 SDValue In = N->getOperand(0);
33991 EVT InVT = In.getValueType();
33992 EVT InEltVT = InVT.getVectorElementType();
33993 EVT EltVT = VT.getVectorElementType();
33994 unsigned MinElts = VT.getVectorNumElements();
33995 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33996 unsigned InBits = InVT.getSizeInBits();
33997
33998 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
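    // E.g. a truncation whose source lanes are already known to fit in the
    // narrower element type (enough sign or leading-zero bits) can be lowered
    // with a PACKSS/PACKUS sequence rather than a generic shuffle
    // (illustrative).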
33999 unsigned PackOpcode;
34000 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34001 Subtarget, N->getFlags())) {
34002 if (SDValue Res =
34003 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34004 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34005 Results.push_back(Res);
34006 return;
34007 }
34008 }
34009
34010 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34011       // 128-bit and smaller inputs should avoid the truncate altogether and
34012       // use a shuffle instead.
34013 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34014 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34015 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34016 for (unsigned I = 0; I < MinElts; ++I)
34017 TruncMask[I] = Scale * I;
34018 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34019 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34020 "Illegal vector type in truncation");
34021 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34022 Results.push_back(
34023 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34024 return;
34025 }
34026 }
34027
34028 // With AVX512 there are some cases that can use a target specific
34029 // truncate node to go from 256/512 to less than 128 with zeros in the
34030 // upper elements of the 128 bit result.
34031 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34032 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34033 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34034 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34035 return;
34036 }
34037 // There's one case we can widen to 512 bits and use VTRUNC.
34038 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34039 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34040 DAG.getUNDEF(MVT::v4i64));
34041 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34042 return;
34043 }
34044 }
34045 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34046 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34047 isTypeLegal(MVT::v4i64)) {
34048       // Input needs to be split and output needs to be widened. Let's use two
34049 // VTRUNCs, and shuffle their results together into the wider type.
34050 SDValue Lo, Hi;
34051 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34052
34053 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34054 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34055 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34056 { 0, 1, 2, 3, 16, 17, 18, 19,
34057 -1, -1, -1, -1, -1, -1, -1, -1 });
34058 Results.push_back(Res);
34059 return;
34060 }
34061
34062 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34063 // this via type legalization.
34064 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34065 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34066 (!Subtarget.hasSSSE3() ||
34067 (!isTypeLegal(InVT) &&
34068 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34069 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34070 InEltVT.getSizeInBits() * WidenNumElts);
34071 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34072 return;
34073 }
34074
34075 return;
34076 }
34077 case ISD::ANY_EXTEND:
34078 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34079 // It's intended to custom handle the input type.
34080 assert(N->getValueType(0) == MVT::v8i8 &&
34081 "Do not know how to legalize this Node");
34082 return;
34083 case ISD::SIGN_EXTEND:
34084 case ISD::ZERO_EXTEND: {
34085 EVT VT = N->getValueType(0);
34086 SDValue In = N->getOperand(0);
34087 EVT InVT = In.getValueType();
34088 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34089 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34090       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34091              "Unexpected type action!");
34092 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34093       // Custom split this so we can extend i8/i16->i32 invec. This is better
34094       // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34095       // sra, and then an extend from i32 to i64 using pcmpgt. By custom
34096       // splitting we let the sra from the extend to i32 be shared by the split.
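      // E.g. v4i16 -> v4i64: sign-extend to v4i32, materialize the sign mask
      // with a pcmpgt against zero, then interleave value and sign words with
      // unpacklo/unpackhi shuffles to form the two v2i64 halves (illustrative).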
34097 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34098
34099 // Fill a vector with sign bits for each element.
34100 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34101 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34102
34103 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34104 // to v2i64.
34105 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34106 {0, 4, 1, 5});
34107 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34108 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34109 {2, 6, 3, 7});
34110 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34111
34112 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34113 Results.push_back(Res);
34114 return;
34115 }
34116
34117 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34118 if (!InVT.is128BitVector()) {
34119 // Not a 128 bit vector, but maybe type legalization will promote
34120 // it to 128 bits.
34121 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34122 return;
34123 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34124 if (!InVT.is128BitVector())
34125 return;
34126
34127 // Promote the input to 128 bits. Type legalization will turn this into
34128 // zext_inreg/sext_inreg.
34129 In = DAG.getNode(Opc, dl, InVT, In);
34130 }
34131
34132 // Perform custom splitting instead of the two stage extend we would get
34133 // by default.
34134 EVT LoVT, HiVT;
34135 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34136 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34137
34138 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34139
34140 // We need to shift the input over by half the number of elements.
34141 unsigned NumElts = InVT.getVectorNumElements();
34142 unsigned HalfNumElts = NumElts / 2;
34143 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34144 for (unsigned i = 0; i != HalfNumElts; ++i)
34145 ShufMask[i] = i + HalfNumElts;
34146
34147 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34148 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34149
34150 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34151 Results.push_back(Res);
34152 }
34153 return;
34154 }
34155   case ISD::FP_TO_SINT_SAT:
34156   case ISD::FP_TO_UINT_SAT: {
34157 if (!Subtarget.hasAVX10_2())
34158 return;
34159
34160 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34161 EVT VT = N->getValueType(0);
34162 SDValue Op = N->getOperand(0);
34163 EVT OpVT = Op.getValueType();
34164 SDValue Res;
34165
34166 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34167 if (IsSigned)
34168 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34169 else
34170 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34171 Results.push_back(Res);
34172 }
34173 return;
34174 }
34175 case ISD::FP_TO_SINT:
34176   case ISD::STRICT_FP_TO_SINT:
34177   case ISD::FP_TO_UINT:
34178   case ISD::STRICT_FP_TO_UINT: {
34179     bool IsStrict = N->isStrictFPOpcode();
34180 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34181 EVT VT = N->getValueType(0);
34182 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34183 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34184 EVT SrcVT = Src.getValueType();
34185
34186 SDValue Res;
34187 if (isSoftF16(SrcVT, Subtarget)) {
34188 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34189 if (IsStrict) {
34190 Res =
34191 DAG.getNode(Opc, dl, {VT, MVT::Other},
34192 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34193 {NVT, MVT::Other}, {Chain, Src})});
34194 Chain = Res.getValue(1);
34195 } else {
34196 Res =
34197 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34198 }
34199 Results.push_back(Res);
34200 if (IsStrict)
34201 Results.push_back(Chain);
34202
34203 return;
34204 }
34205
34206 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34207 SrcVT.getVectorElementType() == MVT::f16) {
34208 EVT EleVT = VT.getVectorElementType();
34209 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34210
34211 if (SrcVT != MVT::v8f16) {
34212 SDValue Tmp =
34213 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34214 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34215 Ops[0] = Src;
34216 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34217 }
34218
34219 if (IsStrict) {
34220         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34221         Res =
34222 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34223 Chain = Res.getValue(1);
34224 } else {
34225 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34226 Res = DAG.getNode(Opc, dl, ResVT, Src);
34227 }
34228
34229 // TODO: Need to add exception check code for strict FP.
34230 if (EleVT.getSizeInBits() < 16) {
34231 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34232 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34233
34234 // Now widen to 128 bits.
34235 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34236 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34237 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34238 ConcatOps[0] = Res;
34239 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34240 }
34241
34242 Results.push_back(Res);
34243 if (IsStrict)
34244 Results.push_back(Chain);
34245
34246 return;
34247 }
34248
34249 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34250      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34251             "Unexpected type action!");
34252
34253 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34254 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34255      MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34256                                       VT.getVectorNumElements());
34257 SDValue Res;
34258 SDValue Chain;
34259 if (IsStrict) {
34260 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34261 {N->getOperand(0), Src});
34262 Chain = Res.getValue(1);
34263 } else
34264 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34265
34266 // Preserve what we know about the size of the original result. If the
34267 // result is v2i32, we have to manually widen the assert.
34268 if (PromoteVT == MVT::v2i32)
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(MVT::v2i32));
34271
34272 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34273                        Res.getValueType(), Res,
34274                        DAG.getValueType(VT.getVectorElementType()));
34275
34276 if (PromoteVT == MVT::v2i32)
34277 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34278 DAG.getVectorIdxConstant(0, dl));
34279
34280 // Truncate back to the original width.
34281 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34282
34283 // Now widen to 128 bits.
34284 unsigned NumConcats = 128 / VT.getSizeInBits();
34285      MVT ConcatVT = MVT::getVectorVT(VT.getVectorElementType(),
34286                                      VT.getVectorNumElements() * NumConcats);
34287 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34288 ConcatOps[0] = Res;
34289 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34290 Results.push_back(Res);
34291 if (IsStrict)
34292 Results.push_back(Chain);
34293 return;
34294 }
34295
34296
34297 if (VT == MVT::v2i32) {
34298 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34299 "Strict unsigned conversion requires AVX512");
34300 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34301      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34302             "Unexpected type action!");
34303 if (Src.getValueType() == MVT::v2f64) {
34304 if (!IsSigned && !Subtarget.hasAVX512()) {
34305 SDValue Res =
34306 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34307 Results.push_back(Res);
34308 return;
34309 }
34310
34311        if (IsStrict)
34312          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34313 else
34314 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34315
34316        // If we have VLX we can emit a target specific FP_TO_UINT node.
34317 if (!IsSigned && !Subtarget.hasVLX()) {
34318 // Otherwise we can defer to the generic legalizer which will widen
34319 // the input as well. This will be further widened during op
34320 // legalization to v8i32<-v8f64.
34321 // For strict nodes we'll need to widen ourselves.
34322 // FIXME: Fix the type legalizer to safely widen strict nodes?
34323 if (!IsStrict)
34324 return;
34325 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34326 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34327 Opc = N->getOpcode();
34328 }
34329 SDValue Res;
34330 SDValue Chain;
34331 if (IsStrict) {
34332 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34333 {N->getOperand(0), Src});
34334 Chain = Res.getValue(1);
34335 } else {
34336 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34337 }
34338 Results.push_back(Res);
34339 if (IsStrict)
34340 Results.push_back(Chain);
34341 return;
34342 }
34343
34344 // Custom widen strict v2f32->v2i32 by padding with zeros.
34345 // FIXME: Should generic type legalizer do this?
34346 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34347 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34348 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34349 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34350 {N->getOperand(0), Src});
34351 Results.push_back(Res);
34352 Results.push_back(Res.getValue(1));
34353 return;
34354 }
34355
34356 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34357 // so early out here.
34358 return;
34359 }
34360
34361 assert(!VT.isVector() && "Vectors should have been handled above!");
34362
34363 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34364 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34365 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34366 assert(!Subtarget.is64Bit() && "i64 should be legal");
34367 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34368 // If we use a 128-bit result we might need to use a target specific node.
34369 unsigned SrcElts =
34370 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34371 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34372 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34373 if (NumElts != SrcElts) {
34374      if (IsStrict)
34375        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34376 else
34377 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34378 }
34379
34380 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34381 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34382 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34383 ZeroIdx);
34384 SDValue Chain;
34385 if (IsStrict) {
34386 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34387 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34388 Chain = Res.getValue(1);
34389 } else
34390 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34391 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34392 Results.push_back(Res);
34393 if (IsStrict)
34394 Results.push_back(Chain);
34395 return;
34396 }
34397
34398 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34399 SDValue Chain;
34400 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34401 Results.push_back(V);
34402 if (IsStrict)
34403 Results.push_back(Chain);
34404 return;
34405 }
34406
34407 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34408 Results.push_back(V);
34409 if (IsStrict)
34410 Results.push_back(Chain);
34411 }
34412 return;
34413 }
34414 case ISD::LRINT:
34415 if (N->getValueType(0) == MVT::v2i32) {
34416 SDValue Src = N->getOperand(0);
34417 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34419 DAG.getUNDEF(MVT::v2f16));
34420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34421 DAG.getUNDEF(MVT::v4f16));
34422 } else if (Src.getValueType() != MVT::v2f64) {
34423 return;
34424 }
34425 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34426 return;
34427 }
34428 [[fallthrough]];
34429 case ISD::LLRINT: {
34430 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34431 Results.push_back(V);
34432 return;
34433 }
34434
34435 case ISD::SINT_TO_FP:
34436  case ISD::STRICT_SINT_TO_FP:
34437  case ISD::UINT_TO_FP:
34438  case ISD::STRICT_UINT_TO_FP: {
34439 bool IsStrict = N->isStrictFPOpcode();
34440 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34441 EVT VT = N->getValueType(0);
34442 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34443 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34444 Subtarget.hasVLX()) {
34445 if (Src.getValueType().getVectorElementType() == MVT::i16)
34446 return;
34447
34448 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34449 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34450 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34451 : DAG.getUNDEF(MVT::v2i32));
34452 if (IsStrict) {
34453        unsigned Opc =
34454            IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34455 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34456 {N->getOperand(0), Src});
34457 Results.push_back(Res);
34458 Results.push_back(Res.getValue(1));
34459 } else {
34460 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34461 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34462 }
34463 return;
34464 }
34465 if (VT != MVT::v2f32)
34466 return;
34467 EVT SrcVT = Src.getValueType();
34468 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34469 if (IsStrict) {
34470        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34471                                : X86ISD::STRICT_CVTUI2P;
34472 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34473 {N->getOperand(0), Src});
34474 Results.push_back(Res);
34475 Results.push_back(Res.getValue(1));
34476 } else {
34477 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34478 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34479 }
34480 return;
34481 }
34482 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34483 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
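      // SSE4.1 path for unsigned v2i64 -> v2f32 without AVX512: inputs with the
      // top bit set are halved with a sticky low bit ((x >> 1) | (x & 1)),
      // converted as signed i64, and then doubled with an FADD; smaller inputs
      // are converted directly and selected below.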
34484 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34485 SDValue One = DAG.getConstant(1, dl, SrcVT);
34486 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34487 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34488 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34489 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34490 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34491 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34492 for (int i = 0; i != 2; ++i) {
34493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34494 SignSrc, DAG.getVectorIdxConstant(i, dl));
34495 if (IsStrict)
34496 SignCvts[i] =
34497 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34498 {N->getOperand(0), Elt});
34499 else
34500 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34501 };
34502 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34503 SDValue Slow, Chain;
34504 if (IsStrict) {
34505 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34506 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34507 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34508 {Chain, SignCvt, SignCvt});
34509 Chain = Slow.getValue(1);
34510 } else {
34511 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34512 }
34513 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34514 IsNeg =
34515 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34516 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34517 Results.push_back(Cvt);
34518 if (IsStrict)
34519 Results.push_back(Chain);
34520 return;
34521 }
34522
34523 if (SrcVT != MVT::v2i32)
34524 return;
34525
34526 if (IsSigned || Subtarget.hasAVX512()) {
34527 if (!IsStrict)
34528 return;
34529
34530 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34531 // FIXME: Should generic type legalizer do this?
34532 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34533 DAG.getConstant(0, dl, MVT::v2i32));
34534 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34535 {N->getOperand(0), Src});
34536 Results.push_back(Res);
34537 Results.push_back(Res.getValue(1));
34538 return;
34539 }
34540
34541 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
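    // Classic unsigned i32 -> f64 trick: OR the zero-extended value into the
    // mantissa of 2^52 (0x4330000000000000 is the bit pattern of 2^52), subtract
    // 2^52.0 to recover the exact integer value, then round the v2f64 result
    // down to f32.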
34542 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34543 SDValue VBias = DAG.getConstantFP(
34544 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34545 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34546 DAG.getBitcast(MVT::v2i64, VBias));
34547 Or = DAG.getBitcast(MVT::v2f64, Or);
34548 if (IsStrict) {
34549 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34550 {N->getOperand(0), Or, VBias});
34551      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34552                                {MVT::v4f32, MVT::Other},
34553 {Sub.getValue(1), Sub});
34554 Results.push_back(Res);
34555 Results.push_back(Res.getValue(1));
34556 } else {
34557 // TODO: Are there any fast-math-flags to propagate here?
34558 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34559 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34560 }
34561 return;
34562 }
34563  case ISD::STRICT_FP_ROUND:
34564  case ISD::FP_ROUND: {
34565 bool IsStrict = N->isStrictFPOpcode();
34566 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34567 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34568 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34569 EVT SrcVT = Src.getValueType();
34570 EVT VT = N->getValueType(0);
34571 SDValue V;
34572 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34573 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34574 : DAG.getUNDEF(MVT::v2f32);
34575 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34576 }
34577 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34578 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34579 if (SrcVT.getVectorElementType() != MVT::f32)
34580 return;
34581
34582 if (IsStrict)
34583 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34584 {Chain, Src, Rnd});
34585 else
34586 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34587
34588 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34589 if (IsStrict)
34590 Results.push_back(V.getValue(1));
34591 return;
34592 }
34593 if (!isTypeLegal(Src.getValueType()))
34594 return;
34595 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34596 if (IsStrict)
34597 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34598 {Chain, Src});
34599 else
34600 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34601 Results.push_back(V);
34602 if (IsStrict)
34603 Results.push_back(V.getValue(1));
34604 return;
34605 }
34606 case ISD::FP_EXTEND:
34607 case ISD::STRICT_FP_EXTEND: {
34608 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34609 // No other ValueType for FP_EXTEND should reach this point.
34610 assert(N->getValueType(0) == MVT::v2f32 &&
34611 "Do not know how to legalize this Node");
34612 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34613 return;
34614 bool IsStrict = N->isStrictFPOpcode();
34615 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34616 if (Src.getValueType().getVectorElementType() != MVT::f16)
34617 return;
34618 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34619 : DAG.getUNDEF(MVT::v2f16);
34620 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34621 if (IsStrict)
34622 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34623 {N->getOperand(0), V});
34624 else
34625 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34626 Results.push_back(V);
34627 if (IsStrict)
34628 Results.push_back(V.getValue(1));
34629 return;
34630 }
34631  case ISD::INTRINSIC_W_CHAIN: {
34632    unsigned IntNo = N->getConstantOperandVal(1);
34633 switch (IntNo) {
34634 default : llvm_unreachable("Do not know how to custom type "
34635 "legalize this intrinsic operation!");
34636 case Intrinsic::x86_rdtsc:
34637 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34638 Results);
34639 case Intrinsic::x86_rdtscp:
34640 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34641 Results);
34642 case Intrinsic::x86_rdpmc:
34643 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34644 Results);
34645 return;
34646 case Intrinsic::x86_rdpru:
34647 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34648 Results);
34649 return;
34650 case Intrinsic::x86_xgetbv:
34651 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34652 Results);
34653 return;
34654 }
34655 }
34656 case ISD::READCYCLECOUNTER: {
34657 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34658 }
34659 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34660 EVT T = N->getValueType(0);
34661 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34662 bool Regs64bit = T == MVT::i128;
34663 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34664 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34665 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34666 SDValue cpInL, cpInH;
34667 std::tie(cpInL, cpInH) =
34668 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34669 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34670 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34671 cpInH =
34672 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34673 cpInH, cpInL.getValue(1));
34674 SDValue swapInL, swapInH;
34675 std::tie(swapInL, swapInH) =
34676 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34677 swapInH =
34678 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34679 swapInH, cpInH.getValue(1));
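    // CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) with memory and, on a
    // match, store ECX:EBX (RCX:RBX). The EBX/RBX half of the new value is
    // handled separately below because RBX may also be the base pointer.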
34680
34681 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34682 // until later. So we keep the RBX input in a vreg and use a custom
34683 // inserter.
34684 // Since RBX will be a reserved register the register allocator will not
34685 // make sure its value will be properly saved and restored around this
34686 // live-range.
34687 SDValue Result;
34688 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34689 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34690 if (Regs64bit) {
34691 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34692 swapInH.getValue(1)};
34693 Result =
34694 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34695 } else {
34696 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34697 swapInH.getValue(1));
34698 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34699 swapInL.getValue(1)};
34700 Result =
34701 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34702 }
34703
34704 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34705 Regs64bit ? X86::RAX : X86::EAX,
34706 HalfT, Result.getValue(1));
34707 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34708 Regs64bit ? X86::RDX : X86::EDX,
34709 HalfT, cpOutL.getValue(2));
34710 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34711
34712 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34713 MVT::i32, cpOutH.getValue(2));
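    // CMPXCHG sets ZF when the comparison matched and the store was performed,
    // so COND_E on EFLAGS is the success result of the cmpxchg pair.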
34714 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34715 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34716
34717 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34718 Results.push_back(Success);
34719 Results.push_back(EFLAGS.getValue(1));
34720 return;
34721 }
34722 case ISD::ATOMIC_LOAD: {
34723 assert(
34724 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34725 "Unexpected VT!");
34726 bool NoImplicitFloatOps =
34727        DAG.getMachineFunction().getFunction().hasFnAttribute(
34728            Attribute::NoImplicitFloat);
34729 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34730 auto *Node = cast<AtomicSDNode>(N);
34731
34732 if (N->getValueType(0) == MVT::i128) {
34733 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34734 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34735 Node->getBasePtr(), Node->getMemOperand());
34736 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34737 DAG.getVectorIdxConstant(0, dl));
34738 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34739 DAG.getVectorIdxConstant(1, dl));
34740 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34741 {ResL, ResH}));
34742 Results.push_back(Ld.getValue(1));
34743 return;
34744 }
34745 break;
34746 }
34747 if (Subtarget.hasSSE1()) {
34748 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34749 // Then extract the lower 64-bits.
34750 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34751 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34752 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34753        SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34754                                             MVT::i64, Node->getMemOperand());
34755 if (Subtarget.hasSSE2()) {
34756 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34757 DAG.getVectorIdxConstant(0, dl));
34758 Results.push_back(Res);
34759 Results.push_back(Ld.getValue(1));
34760 return;
34761 }
34762 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34763 // then casts to i64. This avoids a 128-bit stack temporary being
34764 // created by type legalization if we were to cast v4f32->v2i64.
34765 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34766 DAG.getVectorIdxConstant(0, dl));
34767 Res = DAG.getBitcast(MVT::i64, Res);
34768 Results.push_back(Res);
34769 Results.push_back(Ld.getValue(1));
34770 return;
34771 }
34772 if (Subtarget.hasX87()) {
34773 // First load this into an 80-bit X87 register. This will put the whole
34774 // integer into the significand.
34775 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34776 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34777        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34778                                                 dl, Tys, Ops, MVT::i64,
34779 Node->getMemOperand());
34780 SDValue Chain = Result.getValue(1);
34781
34782 // Now store the X87 register to a stack temporary and convert to i64.
34783 // This store is not atomic and doesn't need to be.
34784 // FIXME: We don't need a stack temporary if the result of the load
34785 // is already being stored. We could just directly store there.
34786 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34787 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34788        MachinePointerInfo MPI =
34789            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34790 SDValue StoreOps[] = { Chain, Result, StackPtr };
34791 Chain = DAG.getMemIntrinsicNode(
34792 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34793 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34794
34795 // Finally load the value back from the stack temporary and return it.
34796 // This load is not atomic and doesn't need to be.
34797 // This load will be further type legalized.
34798 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34799 Results.push_back(Result);
34800 Results.push_back(Result.getValue(1));
34801 return;
34802 }
34803 }
34804 // TODO: Use MOVLPS when SSE1 is available?
34805 // Delegate to generic TypeLegalization. Situations we can really handle
34806 // should have already been dealt with by AtomicExpandPass.cpp.
34807 break;
34808 }
34809 case ISD::ATOMIC_SWAP:
34810 case ISD::ATOMIC_LOAD_ADD:
34811 case ISD::ATOMIC_LOAD_SUB:
34812 case ISD::ATOMIC_LOAD_AND:
34813 case ISD::ATOMIC_LOAD_OR:
34814 case ISD::ATOMIC_LOAD_XOR:
34815 case ISD::ATOMIC_LOAD_NAND:
34816 case ISD::ATOMIC_LOAD_MIN:
34817 case ISD::ATOMIC_LOAD_MAX:
34818 case ISD::ATOMIC_LOAD_UMIN:
34819 case ISD::ATOMIC_LOAD_UMAX:
34820 // Delegate to generic TypeLegalization. Situations we can really handle
34821 // should have already been dealt with by AtomicExpandPass.cpp.
34822 break;
34823
34824 case ISD::BITCAST: {
34825 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34826 EVT DstVT = N->getValueType(0);
34827 EVT SrcVT = N->getOperand(0).getValueType();
34828
34829 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34830 // we can split using the k-register rather than memory.
34831 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34833 SDValue Lo, Hi;
34834 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34835 Lo = DAG.getBitcast(MVT::i32, Lo);
34836 Hi = DAG.getBitcast(MVT::i32, Hi);
34837 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34838 Results.push_back(Res);
34839 return;
34840 }
34841
34842 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34843 // FIXME: Use v4f32 for SSE1?
34844 assert(Subtarget.hasSSE2() && "Requires SSE2");
34845 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34846 "Unexpected type action!");
34847 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34848 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34849 N->getOperand(0));
34850 Res = DAG.getBitcast(WideVT, Res);
34851 Results.push_back(Res);
34852 return;
34853 }
34854
34855 return;
34856 }
34857 case ISD::MGATHER: {
34858 EVT VT = N->getValueType(0);
34859 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34860 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34861 auto *Gather = cast<MaskedGatherSDNode>(N);
34862 SDValue Index = Gather->getIndex();
34863 if (Index.getValueType() != MVT::v2i64)
34864 return;
34865      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34866             "Unexpected type action!");
34867 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34868 SDValue Mask = Gather->getMask();
34869 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34870 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34871 Gather->getPassThru(),
34872 DAG.getUNDEF(VT));
34873 if (!Subtarget.hasVLX()) {
34874 // We need to widen the mask, but the instruction will only use 2
34875 // of its elements. So we can use undef.
34876 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34877 DAG.getUNDEF(MVT::v2i1));
34878 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34879 }
34880 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34881 Gather->getBasePtr(), Index, Gather->getScale() };
34882 SDValue Res = DAG.getMemIntrinsicNode(
34883 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34884 Gather->getMemoryVT(), Gather->getMemOperand());
34885 Results.push_back(Res);
34886 Results.push_back(Res.getValue(1));
34887 return;
34888 }
34889 return;
34890 }
34891 case ISD::LOAD: {
34892 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34893 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34894 // cast since type legalization will try to use an i64 load.
34895 MVT VT = N->getSimpleValueType(0);
34896 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34897    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34898           "Unexpected type action!");
34899 if (!ISD::isNON_EXTLoad(N))
34900 return;
34901 auto *Ld = cast<LoadSDNode>(N);
34902 if (Subtarget.hasSSE2()) {
34903 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34904 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34905 Ld->getPointerInfo(), Ld->getBaseAlign(),
34906 Ld->getMemOperand()->getFlags());
34907 SDValue Chain = Res.getValue(1);
34908 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34909 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34910 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34911 Res = DAG.getBitcast(WideVT, Res);
34912 Results.push_back(Res);
34913 Results.push_back(Chain);
34914 return;
34915 }
34916 assert(Subtarget.hasSSE1() && "Expected SSE");
34917 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34918 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34919    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34920                                          MVT::i64, Ld->getMemOperand());
34921 Results.push_back(Res);
34922 Results.push_back(Res.getValue(1));
34923 return;
34924 }
34925 case ISD::ADDRSPACECAST: {
34926 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34927 Results.push_back(V);
34928 return;
34929 }
34930 case ISD::BITREVERSE: {
34931 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34932 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34933 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34934 // We'll need to move the scalar in two i32 pieces.
34935 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34936 return;
34937 }
34938  case ISD::EXTRACT_VECTOR_ELT: {
34939    // f16 = extract vXf16 %vec, i64 %idx
34940 assert(N->getSimpleValueType(0) == MVT::f16 &&
34941 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34942 assert(Subtarget.hasFP16() && "Expected FP16");
34943 SDValue VecOp = N->getOperand(0);
34944    EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34945    SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34946 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34947 N->getOperand(1));
34948 Split = DAG.getBitcast(MVT::f16, Split);
34949 Results.push_back(Split);
34950 return;
34951 }
34952 }
34953}
34954
34955const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34956 switch ((X86ISD::NodeType)Opcode) {
34957 case X86ISD::FIRST_NUMBER: break;
34958#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34959 NODE_NAME_CASE(BSF)
34960 NODE_NAME_CASE(BSR)
34961 NODE_NAME_CASE(FSHL)
34962 NODE_NAME_CASE(FSHR)
34963 NODE_NAME_CASE(FAND)
34964 NODE_NAME_CASE(FANDN)
34965 NODE_NAME_CASE(FOR)
34966 NODE_NAME_CASE(FXOR)
34967 NODE_NAME_CASE(FILD)
34968 NODE_NAME_CASE(FIST)
34969 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34970 NODE_NAME_CASE(FLD)
34971 NODE_NAME_CASE(FST)
34972 NODE_NAME_CASE(CALL)
34973 NODE_NAME_CASE(CALL_RVMARKER)
34974 NODE_NAME_CASE(IMP_CALL)
34975  NODE_NAME_CASE(BT)
34976  NODE_NAME_CASE(CMP)
34977 NODE_NAME_CASE(FCMP)
34978 NODE_NAME_CASE(STRICT_FCMP)
34979 NODE_NAME_CASE(STRICT_FCMPS)
34980  NODE_NAME_CASE(COMI)
34981  NODE_NAME_CASE(UCOMI)
34982 NODE_NAME_CASE(COMX)
34983 NODE_NAME_CASE(UCOMX)
34984 NODE_NAME_CASE(CMPM)
34985 NODE_NAME_CASE(CMPMM)
34986 NODE_NAME_CASE(STRICT_CMPM)
34987 NODE_NAME_CASE(CMPMM_SAE)
34988 NODE_NAME_CASE(SETCC)
34989 NODE_NAME_CASE(SETCC_CARRY)
34990 NODE_NAME_CASE(FSETCC)
34991 NODE_NAME_CASE(FSETCCM)
34992 NODE_NAME_CASE(FSETCCM_SAE)
34993 NODE_NAME_CASE(CMOV)
34994 NODE_NAME_CASE(BRCOND)
34995 NODE_NAME_CASE(RET_GLUE)
34996 NODE_NAME_CASE(IRET)
34997 NODE_NAME_CASE(REP_STOS)
34998 NODE_NAME_CASE(REP_MOVS)
34999 NODE_NAME_CASE(GlobalBaseReg)
35000  NODE_NAME_CASE(Wrapper)
35001  NODE_NAME_CASE(WrapperRIP)
35002 NODE_NAME_CASE(MOVQ2DQ)
35003 NODE_NAME_CASE(MOVDQ2Q)
35004 NODE_NAME_CASE(MMX_MOVD2W)
35005 NODE_NAME_CASE(MMX_MOVW2D)
35006 NODE_NAME_CASE(PEXTRB)
35007 NODE_NAME_CASE(PEXTRW)
35008 NODE_NAME_CASE(INSERTPS)
35009 NODE_NAME_CASE(PINSRB)
35010 NODE_NAME_CASE(PINSRW)
35011 NODE_NAME_CASE(PSHUFB)
35012 NODE_NAME_CASE(ANDNP)
35013 NODE_NAME_CASE(BLENDI)
35014  NODE_NAME_CASE(BLENDV)
35015  NODE_NAME_CASE(HADD)
35016 NODE_NAME_CASE(HSUB)
35017 NODE_NAME_CASE(FHADD)
35018 NODE_NAME_CASE(FHSUB)
35019 NODE_NAME_CASE(CONFLICT)
35020 NODE_NAME_CASE(FMAX)
35021 NODE_NAME_CASE(FMAXS)
35022 NODE_NAME_CASE(FMAX_SAE)
35023 NODE_NAME_CASE(FMAXS_SAE)
35024 NODE_NAME_CASE(STRICT_FMAX)
35025 NODE_NAME_CASE(FMIN)
35026 NODE_NAME_CASE(FMINS)
35027 NODE_NAME_CASE(FMIN_SAE)
35028 NODE_NAME_CASE(FMINS_SAE)
35029 NODE_NAME_CASE(STRICT_FMIN)
35030 NODE_NAME_CASE(FMAXC)
35031 NODE_NAME_CASE(FMINC)
35032 NODE_NAME_CASE(FRSQRT)
35033 NODE_NAME_CASE(FRCP)
35034 NODE_NAME_CASE(EXTRQI)
35035 NODE_NAME_CASE(INSERTQI)
35036 NODE_NAME_CASE(TLSADDR)
35037 NODE_NAME_CASE(TLSBASEADDR)
35038 NODE_NAME_CASE(TLSCALL)
35039 NODE_NAME_CASE(TLSDESC)
35040 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35041 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35042 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35043 NODE_NAME_CASE(EH_RETURN)
35044 NODE_NAME_CASE(TC_RETURN)
35045 NODE_NAME_CASE(FNSTCW16m)
35046 NODE_NAME_CASE(FLDCW16m)
35047 NODE_NAME_CASE(FNSTENVm)
35048 NODE_NAME_CASE(FLDENVm)
35049 NODE_NAME_CASE(LCMPXCHG_DAG)
35050 NODE_NAME_CASE(LCMPXCHG8_DAG)
35051 NODE_NAME_CASE(LCMPXCHG16_DAG)
35052 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35053 NODE_NAME_CASE(LADD)
35054 NODE_NAME_CASE(LSUB)
35055 NODE_NAME_CASE(LOR)
35056 NODE_NAME_CASE(LXOR)
35057 NODE_NAME_CASE(LAND)
35058 NODE_NAME_CASE(LBTS)
35059 NODE_NAME_CASE(LBTC)
35060 NODE_NAME_CASE(LBTR)
35061 NODE_NAME_CASE(LBTS_RM)
35062 NODE_NAME_CASE(LBTC_RM)
35063 NODE_NAME_CASE(LBTR_RM)
35064 NODE_NAME_CASE(AADD)
35065 NODE_NAME_CASE(AOR)
35066 NODE_NAME_CASE(AXOR)
35067 NODE_NAME_CASE(AAND)
35068 NODE_NAME_CASE(VZEXT_MOVL)
35069 NODE_NAME_CASE(VZEXT_LOAD)
35070 NODE_NAME_CASE(VEXTRACT_STORE)
35071 NODE_NAME_CASE(VTRUNC)
35072 NODE_NAME_CASE(VTRUNCS)
35073 NODE_NAME_CASE(VTRUNCUS)
35074 NODE_NAME_CASE(VMTRUNC)
35075 NODE_NAME_CASE(VMTRUNCS)
35076 NODE_NAME_CASE(VMTRUNCUS)
35077 NODE_NAME_CASE(VTRUNCSTORES)
35078 NODE_NAME_CASE(VTRUNCSTOREUS)
35079 NODE_NAME_CASE(VMTRUNCSTORES)
35080 NODE_NAME_CASE(VMTRUNCSTOREUS)
35081 NODE_NAME_CASE(VFPEXT)
35082 NODE_NAME_CASE(STRICT_VFPEXT)
35083 NODE_NAME_CASE(VFPEXT_SAE)
35084 NODE_NAME_CASE(VFPEXTS)
35085 NODE_NAME_CASE(VFPEXTS_SAE)
35086 NODE_NAME_CASE(VFPROUND)
35087 NODE_NAME_CASE(VFPROUND2)
35088 NODE_NAME_CASE(VFPROUND2_RND)
35089 NODE_NAME_CASE(STRICT_VFPROUND)
35090 NODE_NAME_CASE(VMFPROUND)
35091 NODE_NAME_CASE(VFPROUND_RND)
35092 NODE_NAME_CASE(VFPROUNDS)
35093 NODE_NAME_CASE(VFPROUNDS_RND)
35094 NODE_NAME_CASE(VSHLDQ)
35095 NODE_NAME_CASE(VSRLDQ)
35096 NODE_NAME_CASE(VSHL)
35097 NODE_NAME_CASE(VSRL)
35098 NODE_NAME_CASE(VSRA)
35099 NODE_NAME_CASE(VSHLI)
35100 NODE_NAME_CASE(VSRLI)
35101 NODE_NAME_CASE(VSRAI)
35102 NODE_NAME_CASE(VSHLV)
35103 NODE_NAME_CASE(VSRLV)
35104 NODE_NAME_CASE(VSRAV)
35105 NODE_NAME_CASE(VROTLI)
35106 NODE_NAME_CASE(VROTRI)
35107 NODE_NAME_CASE(VPPERM)
35108 NODE_NAME_CASE(CMPP)
35109 NODE_NAME_CASE(STRICT_CMPP)
35110 NODE_NAME_CASE(PCMPEQ)
35111 NODE_NAME_CASE(PCMPGT)
35112 NODE_NAME_CASE(PHMINPOS)
35113 NODE_NAME_CASE(ADD)
35114 NODE_NAME_CASE(SUB)
35115 NODE_NAME_CASE(ADC)
35116 NODE_NAME_CASE(SBB)
35117 NODE_NAME_CASE(SMUL)
35118 NODE_NAME_CASE(UMUL)
35119 NODE_NAME_CASE(OR)
35120 NODE_NAME_CASE(XOR)
35121 NODE_NAME_CASE(AND)
35122 NODE_NAME_CASE(BEXTR)
35123  NODE_NAME_CASE(BEXTRI)
35124  NODE_NAME_CASE(BZHI)
35125 NODE_NAME_CASE(PDEP)
35126 NODE_NAME_CASE(PEXT)
35127 NODE_NAME_CASE(MUL_IMM)
35128 NODE_NAME_CASE(MOVMSK)
35129 NODE_NAME_CASE(PTEST)
35130 NODE_NAME_CASE(TESTP)
35131 NODE_NAME_CASE(KORTEST)
35132 NODE_NAME_CASE(KTEST)
35133 NODE_NAME_CASE(KADD)
35134 NODE_NAME_CASE(KSHIFTL)
35135 NODE_NAME_CASE(KSHIFTR)
35136 NODE_NAME_CASE(PACKSS)
35137 NODE_NAME_CASE(PACKUS)
35138 NODE_NAME_CASE(PALIGNR)
35139 NODE_NAME_CASE(VALIGN)
35140 NODE_NAME_CASE(VSHLD)
35141 NODE_NAME_CASE(VSHRD)
35142 NODE_NAME_CASE(PSHUFD)
35143 NODE_NAME_CASE(PSHUFHW)
35144 NODE_NAME_CASE(PSHUFLW)
35145 NODE_NAME_CASE(SHUFP)
35146 NODE_NAME_CASE(SHUF128)
35147 NODE_NAME_CASE(MOVLHPS)
35148 NODE_NAME_CASE(MOVHLPS)
35149 NODE_NAME_CASE(MOVDDUP)
35150 NODE_NAME_CASE(MOVSHDUP)
35151 NODE_NAME_CASE(MOVSLDUP)
35152 NODE_NAME_CASE(MOVSD)
35153 NODE_NAME_CASE(MOVSS)
35154 NODE_NAME_CASE(MOVSH)
35155 NODE_NAME_CASE(UNPCKL)
35156 NODE_NAME_CASE(UNPCKH)
35157 NODE_NAME_CASE(VBROADCAST)
35158 NODE_NAME_CASE(VBROADCAST_LOAD)
35159 NODE_NAME_CASE(VBROADCASTM)
35160 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35161 NODE_NAME_CASE(VPERMILPV)
35162 NODE_NAME_CASE(VPERMILPI)
35163 NODE_NAME_CASE(VPERM2X128)
35164 NODE_NAME_CASE(VPERMV)
35165 NODE_NAME_CASE(VPERMV3)
35166 NODE_NAME_CASE(VPERMI)
35167 NODE_NAME_CASE(VPTERNLOG)
35168 NODE_NAME_CASE(FP_TO_SINT_SAT)
35169 NODE_NAME_CASE(FP_TO_UINT_SAT)
35170 NODE_NAME_CASE(VFIXUPIMM)
35171 NODE_NAME_CASE(VFIXUPIMM_SAE)
35172 NODE_NAME_CASE(VFIXUPIMMS)
35173 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35174 NODE_NAME_CASE(VRANGE)
35175 NODE_NAME_CASE(VRANGE_SAE)
35176 NODE_NAME_CASE(VRANGES)
35177 NODE_NAME_CASE(VRANGES_SAE)
35178 NODE_NAME_CASE(PMULUDQ)
35179 NODE_NAME_CASE(PMULDQ)
35180 NODE_NAME_CASE(PSADBW)
35181 NODE_NAME_CASE(DBPSADBW)
35182 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35183 NODE_NAME_CASE(VAARG_64)
35184 NODE_NAME_CASE(VAARG_X32)
35185 NODE_NAME_CASE(DYN_ALLOCA)
35186 NODE_NAME_CASE(MFENCE)
35187 NODE_NAME_CASE(SEG_ALLOCA)
35188 NODE_NAME_CASE(PROBED_ALLOCA)
35189  NODE_NAME_CASE(RDRAND)
35190  NODE_NAME_CASE(RDSEED)
35191  NODE_NAME_CASE(RDPKRU)
35192 NODE_NAME_CASE(WRPKRU)
35193 NODE_NAME_CASE(VPMADDUBSW)
35194 NODE_NAME_CASE(VPMADDWD)
35195 NODE_NAME_CASE(VPSHA)
35196 NODE_NAME_CASE(VPSHL)
35197 NODE_NAME_CASE(VPCOM)
35198 NODE_NAME_CASE(VPCOMU)
35199 NODE_NAME_CASE(VPERMIL2)
35200  NODE_NAME_CASE(FMSUB)
35201  NODE_NAME_CASE(STRICT_FMSUB)
35202  NODE_NAME_CASE(FNMADD)
35203  NODE_NAME_CASE(STRICT_FNMADD)
35204  NODE_NAME_CASE(FNMSUB)
35205  NODE_NAME_CASE(STRICT_FNMSUB)
35206 NODE_NAME_CASE(FMADDSUB)
35207 NODE_NAME_CASE(FMSUBADD)
35208 NODE_NAME_CASE(FMADD_RND)
35209 NODE_NAME_CASE(FNMADD_RND)
35210 NODE_NAME_CASE(FMSUB_RND)
35211 NODE_NAME_CASE(FNMSUB_RND)
35212 NODE_NAME_CASE(FMADDSUB_RND)
35213 NODE_NAME_CASE(FMSUBADD_RND)
35214 NODE_NAME_CASE(VFMADDC)
35215 NODE_NAME_CASE(VFMADDC_RND)
35216 NODE_NAME_CASE(VFCMADDC)
35217 NODE_NAME_CASE(VFCMADDC_RND)
35218 NODE_NAME_CASE(VFMULC)
35219 NODE_NAME_CASE(VFMULC_RND)
35220 NODE_NAME_CASE(VFCMULC)
35221 NODE_NAME_CASE(VFCMULC_RND)
35222 NODE_NAME_CASE(VFMULCSH)
35223 NODE_NAME_CASE(VFMULCSH_RND)
35224 NODE_NAME_CASE(VFCMULCSH)
35225 NODE_NAME_CASE(VFCMULCSH_RND)
35226 NODE_NAME_CASE(VFMADDCSH)
35227 NODE_NAME_CASE(VFMADDCSH_RND)
35228 NODE_NAME_CASE(VFCMADDCSH)
35229 NODE_NAME_CASE(VFCMADDCSH_RND)
35230 NODE_NAME_CASE(VPMADD52H)
35231 NODE_NAME_CASE(VPMADD52L)
35232 NODE_NAME_CASE(VRNDSCALE)
35233 NODE_NAME_CASE(STRICT_VRNDSCALE)
35234 NODE_NAME_CASE(VRNDSCALE_SAE)
35235 NODE_NAME_CASE(VRNDSCALES)
35236 NODE_NAME_CASE(VRNDSCALES_SAE)
35237 NODE_NAME_CASE(VREDUCE)
35238 NODE_NAME_CASE(VREDUCE_SAE)
35239 NODE_NAME_CASE(VREDUCES)
35240 NODE_NAME_CASE(VREDUCES_SAE)
35241 NODE_NAME_CASE(VGETMANT)
35242 NODE_NAME_CASE(VGETMANT_SAE)
35243 NODE_NAME_CASE(VGETMANTS)
35244 NODE_NAME_CASE(VGETMANTS_SAE)
35245 NODE_NAME_CASE(PCMPESTR)
35246 NODE_NAME_CASE(PCMPISTR)
35247  NODE_NAME_CASE(XTEST)
35248  NODE_NAME_CASE(COMPRESS)
35249  NODE_NAME_CASE(EXPAND)
35250  NODE_NAME_CASE(SELECTS)
35251 NODE_NAME_CASE(ADDSUB)
35252 NODE_NAME_CASE(RCP14)
35253 NODE_NAME_CASE(RCP14S)
35254 NODE_NAME_CASE(RSQRT14)
35255 NODE_NAME_CASE(RSQRT14S)
35256 NODE_NAME_CASE(FADD_RND)
35257 NODE_NAME_CASE(FADDS)
35258 NODE_NAME_CASE(FADDS_RND)
35259 NODE_NAME_CASE(FSUB_RND)
35260 NODE_NAME_CASE(FSUBS)
35261 NODE_NAME_CASE(FSUBS_RND)
35262 NODE_NAME_CASE(FMUL_RND)
35263 NODE_NAME_CASE(FMULS)
35264 NODE_NAME_CASE(FMULS_RND)
35265 NODE_NAME_CASE(FDIV_RND)
35266 NODE_NAME_CASE(FDIVS)
35267 NODE_NAME_CASE(FDIVS_RND)
35268 NODE_NAME_CASE(FSQRT_RND)
35269 NODE_NAME_CASE(FSQRTS)
35270 NODE_NAME_CASE(FSQRTS_RND)
35271 NODE_NAME_CASE(FGETEXP)
35272 NODE_NAME_CASE(FGETEXP_SAE)
35273 NODE_NAME_CASE(FGETEXPS)
35274 NODE_NAME_CASE(FGETEXPS_SAE)
35275 NODE_NAME_CASE(SCALEF)
35276 NODE_NAME_CASE(SCALEF_RND)
35277 NODE_NAME_CASE(SCALEFS)
35278 NODE_NAME_CASE(SCALEFS_RND)
35279 NODE_NAME_CASE(MULHRS)
35280 NODE_NAME_CASE(SINT_TO_FP_RND)
35281 NODE_NAME_CASE(UINT_TO_FP_RND)
35282 NODE_NAME_CASE(CVTTP2SI)
35283 NODE_NAME_CASE(CVTTP2UI)
35284 NODE_NAME_CASE(STRICT_CVTTP2SI)
35285 NODE_NAME_CASE(STRICT_CVTTP2UI)
35286 NODE_NAME_CASE(MCVTTP2SI)
35287 NODE_NAME_CASE(MCVTTP2UI)
35288 NODE_NAME_CASE(CVTTP2SI_SAE)
35289 NODE_NAME_CASE(CVTTP2UI_SAE)
35290 NODE_NAME_CASE(CVTTS2SI)
35291 NODE_NAME_CASE(CVTTS2UI)
35292 NODE_NAME_CASE(CVTTS2SI_SAE)
35293 NODE_NAME_CASE(CVTTS2UI_SAE)
35294 NODE_NAME_CASE(CVTSI2P)
35295 NODE_NAME_CASE(CVTUI2P)
35296 NODE_NAME_CASE(STRICT_CVTSI2P)
35297 NODE_NAME_CASE(STRICT_CVTUI2P)
35298 NODE_NAME_CASE(MCVTSI2P)
35299 NODE_NAME_CASE(MCVTUI2P)
35300 NODE_NAME_CASE(VFPCLASS)
35301 NODE_NAME_CASE(VFPCLASSS)
35302 NODE_NAME_CASE(MULTISHIFT)
35303 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35304 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35305 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35306 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35307 NODE_NAME_CASE(CVTPS2PH)
35308 NODE_NAME_CASE(STRICT_CVTPS2PH)
35309 NODE_NAME_CASE(CVTPS2PH_SAE)
35310 NODE_NAME_CASE(MCVTPS2PH)
35311 NODE_NAME_CASE(MCVTPS2PH_SAE)
35312 NODE_NAME_CASE(CVTPH2PS)
35313 NODE_NAME_CASE(STRICT_CVTPH2PS)
35314 NODE_NAME_CASE(CVTPH2PS_SAE)
35315 NODE_NAME_CASE(CVTP2SI)
35316 NODE_NAME_CASE(CVTP2UI)
35317 NODE_NAME_CASE(MCVTP2SI)
35318 NODE_NAME_CASE(MCVTP2UI)
35319 NODE_NAME_CASE(CVTP2SI_RND)
35320 NODE_NAME_CASE(CVTP2UI_RND)
35321 NODE_NAME_CASE(CVTS2SI)
35322 NODE_NAME_CASE(CVTS2UI)
35323 NODE_NAME_CASE(CVTS2SI_RND)
35324 NODE_NAME_CASE(CVTS2UI_RND)
35325 NODE_NAME_CASE(CVTNEPS2BF16)
35326 NODE_NAME_CASE(MCVTNEPS2BF16)
35327 NODE_NAME_CASE(DPBF16PS)
35328 NODE_NAME_CASE(DPFP16PS)
35329 NODE_NAME_CASE(MPSADBW)
35330 NODE_NAME_CASE(LWPINS)
35331 NODE_NAME_CASE(MGATHER)
35332 NODE_NAME_CASE(MSCATTER)
35333 NODE_NAME_CASE(VPDPBUSD)
35334 NODE_NAME_CASE(VPDPBUSDS)
35335 NODE_NAME_CASE(VPDPWSSD)
35336 NODE_NAME_CASE(VPDPWSSDS)
35337 NODE_NAME_CASE(VPSHUFBITQMB)
35338 NODE_NAME_CASE(GF2P8MULB)
35339 NODE_NAME_CASE(GF2P8AFFINEQB)
35340 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35341 NODE_NAME_CASE(NT_CALL)
35342 NODE_NAME_CASE(NT_BRIND)
35343 NODE_NAME_CASE(UMWAIT)
35344 NODE_NAME_CASE(TPAUSE)
35345 NODE_NAME_CASE(ENQCMD)
35346 NODE_NAME_CASE(ENQCMDS)
35347 NODE_NAME_CASE(VP2INTERSECT)
35348 NODE_NAME_CASE(VPDPBSUD)
35349 NODE_NAME_CASE(VPDPBSUDS)
35350 NODE_NAME_CASE(VPDPBUUD)
35351 NODE_NAME_CASE(VPDPBUUDS)
35352 NODE_NAME_CASE(VPDPBSSD)
35353 NODE_NAME_CASE(VPDPBSSDS)
35354 NODE_NAME_CASE(VPDPWSUD)
35355 NODE_NAME_CASE(VPDPWSUDS)
35356 NODE_NAME_CASE(VPDPWUSD)
35357 NODE_NAME_CASE(VPDPWUSDS)
35358 NODE_NAME_CASE(VPDPWUUD)
35359 NODE_NAME_CASE(VPDPWUUDS)
35360 NODE_NAME_CASE(VMINMAX)
35361 NODE_NAME_CASE(VMINMAX_SAE)
35362 NODE_NAME_CASE(VMINMAXS)
35363 NODE_NAME_CASE(VMINMAXS_SAE)
35364 NODE_NAME_CASE(CVTP2IBS)
35365 NODE_NAME_CASE(CVTP2IUBS)
35366 NODE_NAME_CASE(CVTP2IBS_RND)
35367 NODE_NAME_CASE(CVTP2IUBS_RND)
35368 NODE_NAME_CASE(CVTTP2IBS)
35369 NODE_NAME_CASE(CVTTP2IUBS)
35370 NODE_NAME_CASE(CVTTP2IBS_SAE)
35371 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35372 NODE_NAME_CASE(VCVT2PH2BF8)
35373 NODE_NAME_CASE(VCVT2PH2BF8S)
35374 NODE_NAME_CASE(VCVT2PH2HF8)
35375 NODE_NAME_CASE(VCVT2PH2HF8S)
35376 NODE_NAME_CASE(VCVTBIASPH2BF8)
35377 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35378 NODE_NAME_CASE(VCVTBIASPH2HF8)
35379 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35380 NODE_NAME_CASE(VCVTPH2BF8)
35381 NODE_NAME_CASE(VCVTPH2BF8S)
35382 NODE_NAME_CASE(VCVTPH2HF8)
35383 NODE_NAME_CASE(VCVTPH2HF8S)
35384 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35385 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35386 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35387 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35388 NODE_NAME_CASE(VMCVTPH2BF8)
35389 NODE_NAME_CASE(VMCVTPH2BF8S)
35390 NODE_NAME_CASE(VMCVTPH2HF8)
35391 NODE_NAME_CASE(VMCVTPH2HF8S)
35392 NODE_NAME_CASE(VCVTHF82PH)
35393 NODE_NAME_CASE(AESENC128KL)
35394 NODE_NAME_CASE(AESDEC128KL)
35395 NODE_NAME_CASE(AESENC256KL)
35396 NODE_NAME_CASE(AESDEC256KL)
35397 NODE_NAME_CASE(AESENCWIDE128KL)
35398 NODE_NAME_CASE(AESDECWIDE128KL)
35399 NODE_NAME_CASE(AESENCWIDE256KL)
35400 NODE_NAME_CASE(AESDECWIDE256KL)
35401 NODE_NAME_CASE(CMPCCXADD)
35402 NODE_NAME_CASE(TESTUI)
35403 NODE_NAME_CASE(FP80_ADD)
35404 NODE_NAME_CASE(STRICT_FP80_ADD)
35405 NODE_NAME_CASE(CCMP)
35406 NODE_NAME_CASE(CTEST)
35407 NODE_NAME_CASE(CLOAD)
35408 NODE_NAME_CASE(CSTORE)
35409 NODE_NAME_CASE(CVTTS2SIS)
35410 NODE_NAME_CASE(CVTTS2UIS)
35411 NODE_NAME_CASE(CVTTS2SIS_SAE)
35412 NODE_NAME_CASE(CVTTS2UIS_SAE)
35413 NODE_NAME_CASE(CVTTP2SIS)
35414 NODE_NAME_CASE(MCVTTP2SIS)
35415 NODE_NAME_CASE(CVTTP2UIS_SAE)
35416 NODE_NAME_CASE(CVTTP2SIS_SAE)
35417 NODE_NAME_CASE(CVTTP2UIS)
35418 NODE_NAME_CASE(MCVTTP2UIS)
35419 NODE_NAME_CASE(POP_FROM_X87_REG)
35420 }
35421 return nullptr;
35422#undef NODE_NAME_CASE
35423}
35424
35425/// Return true if the addressing mode represented by AM is legal for this
35426/// target, for a load/store of the specified type.
35427bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35428                                              const AddrMode &AM, Type *Ty,
35429 unsigned AS,
35430 Instruction *I) const {
35431 // X86 supports extremely general addressing modes.
35432  CodeModel::Model M = getTargetMachine().getCodeModel();
35433
35434 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35435 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35436 return false;
35437
35438 if (AM.BaseGV) {
35439 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35440
35441 // If a reference to this global requires an extra load, we can't fold it.
35442 if (isGlobalStubReference(GVFlags))
35443 return false;
35444
35445 // If BaseGV requires a register for the PIC base, we cannot also have a
35446 // BaseReg specified.
35447 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35448 return false;
35449
35450 // If lower 4G is not available, then we must use rip-relative addressing.
35451 if ((M != CodeModel::Small || isPositionIndependent()) &&
35452 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35453 return false;
35454 }
35455
35456 switch (AM.Scale) {
35457 case 0:
35458 case 1:
35459 case 2:
35460 case 4:
35461 case 8:
35462 // These scales always work.
35463 break;
35464 case 3:
35465 case 5:
35466 case 9:
35467 // These scales are formed with basereg+scalereg. Only accept if there is
35468 // no basereg yet.
35469 if (AM.HasBaseReg)
35470 return false;
35471 break;
35472 default: // Other stuff never works.
35473 return false;
35474 }
35475
35476 return true;
35477}
35478
35479bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35480 switch (Opcode) {
35481 // These are non-commutative binops.
35482 // TODO: Add more X86ISD opcodes once we have test coverage.
35483 case X86ISD::ANDNP:
35484 case X86ISD::PCMPGT:
35485 case X86ISD::FMAX:
35486 case X86ISD::FMIN:
35487 case X86ISD::FANDN:
35488 case X86ISD::VPSHA:
35489 case X86ISD::VPSHL:
35490 case X86ISD::VSHLV:
35491 case X86ISD::VSRLV:
35492 case X86ISD::VSRAV:
35493 return true;
35494 }
35495
35496 return TargetLoweringBase::isBinOp(Opcode);
35497}
35498
35499bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35500 switch (Opcode) {
35501 // TODO: Add more X86ISD opcodes once we have test coverage.
35502 case X86ISD::PCMPEQ:
35503 case X86ISD::PMULDQ:
35504 case X86ISD::PMULUDQ:
35505 case X86ISD::FMAXC:
35506 case X86ISD::FMINC:
35507 case X86ISD::FAND:
35508 case X86ISD::FOR:
35509 case X86ISD::FXOR:
35510 return true;
35511 }
35512
35513  return TargetLoweringBase::isCommutativeBinOp(Opcode);
35514}
35515
35516bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35517  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35518 return false;
35519 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35520 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35521 return NumBits1 > NumBits2;
35522}
35523
35524bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35525  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35526 return false;
35527
35528 if (!isTypeLegal(EVT::getEVT(Ty1)))
35529 return false;
35530
35531 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35532
35533 // Assuming the caller doesn't have a zeroext or signext return parameter,
35534 // truncation all the way down to i1 is valid.
35535 return true;
35536}
35537
35538bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35539  return isInt<32>(Imm);
35540}
35541
35542bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35543  // Can also use sub to handle negated immediates.
35544 return isInt<32>(Imm);
35545}
35546
35547bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35548  return isInt<32>(Imm);
35549}
35550
35551bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35552  if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35553 return false;
35554 unsigned NumBits1 = VT1.getSizeInBits();
35555 unsigned NumBits2 = VT2.getSizeInBits();
35556 return NumBits1 > NumBits2;
35557}
35558
35559bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35560  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
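  // E.g. a 32-bit write to EAX already clears the upper 32 bits of RAX, so the
  // i32 -> i64 zext costs no extra instruction.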
35561 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35562}
35563
35564bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35565  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35566 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35567}
35568
35569bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35570  EVT VT1 = Val.getValueType();
35571 if (isZExtFree(VT1, VT2))
35572 return true;
35573
35574 if (Val.getOpcode() != ISD::LOAD)
35575 return false;
35576
35577 if (!VT1.isSimple() || !VT1.isInteger() ||
35578 !VT2.isSimple() || !VT2.isInteger())
35579 return false;
35580
35581 switch (VT1.getSimpleVT().SimpleTy) {
35582 default: break;
35583 case MVT::i8:
35584 case MVT::i16:
35585 case MVT::i32:
35586 // X86 has 8, 16, and 32-bit zero-extending loads.
35587 return true;
35588 }
35589
35590 return false;
35591}
35592
35593bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35594  if (!Subtarget.is64Bit())
35595 return false;
35596 return TargetLowering::shouldConvertPhiType(From, To);
35597}
35598
35599bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35600  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35601 return false;
35602
35603 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35604
35605 // There is no extending load for vXi1.
35606 if (SrcVT.getScalarType() == MVT::i1)
35607 return false;
35608
35609 return true;
35610}
35611
35612bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35613                                                   EVT VT) const {
35614 if (Subtarget.useSoftFloat())
35615 return false;
35616
35617 if (!Subtarget.hasAnyFMA())
35618 return false;
35619
35620 VT = VT.getScalarType();
35621
35622 if (!VT.isSimple())
35623 return false;
35624
35625 switch (VT.getSimpleVT().SimpleTy) {
35626 case MVT::f16:
35627 return Subtarget.hasFP16();
35628 case MVT::f32:
35629 case MVT::f64:
35630 return true;
35631 default:
35632 break;
35633 }
35634
35635 return false;
35636}
35637
35638bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35639                                              EVT DestVT) const {
35640 // i16 instructions are longer (0x66 prefix) and potentially slower.
35641 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35642}
35643
35644bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35645    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35646 SDValue Y) const {
35647 if (SelectOpcode == ISD::SELECT) {
35648 if (VT.isVector())
35649 return false;
35650 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35651 return false;
35652 using namespace llvm::SDPatternMatch;
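    // The patterns below match the BMI1 bit-manipulation idioms:
    //   BLSI:   x & -x        (isolate lowest set bit)
    //   BLSR:   x & (x - 1)   (clear lowest set bit)
    //   BLSMSK: x ^ (x - 1)   (mask up to and including the lowest set bit)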
35653 // BLSI
35654    if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35655                                  sd_match(X, m_Neg(m_Specific(Y)))))
35656 return true;
35657 // BLSR
35658 if (BinOpcode == ISD::AND &&
35659        (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35660         sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35661      return true;
35662 // BLSMSK
35663 if (BinOpcode == ISD::XOR &&
35664        (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35665         sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35666      return true;
35667
35668 return false;
35669 }
35670 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35671 // benefit. The transform may also be profitable for scalar code.
35672 if (!Subtarget.hasAVX512())
35673 return false;
35674 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35675 return false;
35676 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35677 return false;
35678
35679 return true;
35680}
35681
35682/// Targets can use this to indicate that they only support *some*
35683/// VECTOR_SHUFFLE operations, those with specific masks.
35684/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35685/// are assumed to be legal.
35686bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35687  if (!VT.isSimple())
35688 return false;
35689
35690 // Not for i1 vectors
35691 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35692 return false;
35693
35694 // Very little shuffling can be done for 64-bit vectors right now.
35695 if (VT.getSimpleVT().getSizeInBits() == 64)
35696 return false;
35697
35698 // We only care that the types being shuffled are legal. The lowering can
35699 // handle any possible shuffle mask that results.
35700 return isTypeLegal(VT.getSimpleVT());
35701}
35702
35703bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35704                                               EVT VT) const {
35705 // Don't convert an 'and' into a shuffle that we don't directly support.
35706 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35707 if (!Subtarget.hasAVX2())
35708 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35709 return false;
35710
35711 // Just delegate to the generic legality, clear masks aren't special.
35712 return isShuffleMaskLegal(Mask, VT);
35713}
35714
35715bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35716  // If the subtarget is using thunks, we need to not generate jump tables.
35717 if (Subtarget.useIndirectThunkBranches())
35718 return false;
35719
35720 // Otherwise, fallback on the generic logic.
35721  return TargetLowering::areJTsAllowed(Fn);
35722}
35723
35724MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35725                                                       EVT ConditionVT) const {
35726 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35727 // zero-extensions.
35728 if (ConditionVT.getSizeInBits() < 32)
35729 return MVT::i32;
35730  return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35731                                                             ConditionVT);
35732}
35733
35734//===----------------------------------------------------------------------===//
35735// X86 Scheduler Hooks
35736//===----------------------------------------------------------------------===//
35737
35738/// Utility function to emit xbegin specifying the start of an RTM region.
35739static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35740                                     const TargetInstrInfo *TII) {
35741 const MIMetadata MIMD(MI);
35742
35743 const BasicBlock *BB = MBB->getBasicBlock();
35744 MachineFunction::iterator I = ++MBB->getIterator();
35745
35746 // For the v = xbegin(), we generate
35747 //
35748 // thisMBB:
35749 // xbegin sinkMBB
35750 //
35751 // mainMBB:
35752 // s0 = -1
35753 //
35754 // fallBB:
35755 // eax = # XABORT_DEF
35756 // s1 = eax
35757 //
35758 // sinkMBB:
35759 // v = phi(s0/mainBB, s1/fallBB)
35760
35761 MachineBasicBlock *thisMBB = MBB;
35762 MachineFunction *MF = MBB->getParent();
35763 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35764 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35765 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35766 MF->insert(I, mainMBB);
35767 MF->insert(I, fallMBB);
35768 MF->insert(I, sinkMBB);
35769
35770 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35771 mainMBB->addLiveIn(X86::EFLAGS);
35772 fallMBB->addLiveIn(X86::EFLAGS);
35773 sinkMBB->addLiveIn(X86::EFLAGS);
35774 }
35775
35776 // Transfer the remainder of BB and its successor edges to sinkMBB.
35777 sinkMBB->splice(sinkMBB->begin(), MBB,
35778 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35779  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35780
35781  MachineRegisterInfo &MRI = MF->getRegInfo();
35782  Register DstReg = MI.getOperand(0).getReg();
35783 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35784 Register mainDstReg = MRI.createVirtualRegister(RC);
35785 Register fallDstReg = MRI.createVirtualRegister(RC);
35786
35787 // thisMBB:
35788 // xbegin fallMBB
35789 // # fallthrough to mainMBB
35790 // # abortion to fallMBB
35791 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35792 thisMBB->addSuccessor(mainMBB);
35793 thisMBB->addSuccessor(fallMBB);
35794
35795 // mainMBB:
35796 // mainDstReg := -1
35797 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35798 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35799 mainMBB->addSuccessor(sinkMBB);
35800
35801 // fallMBB:
35802 // ; pseudo instruction to model hardware's definition from XABORT
35803 // EAX := XABORT_DEF
35804 // fallDstReg := EAX
35805 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35806 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35807 .addReg(X86::EAX);
35808 fallMBB->addSuccessor(sinkMBB);
35809
35810 // sinkMBB:
35811 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35812 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35813 .addReg(mainDstReg).addMBB(mainMBB)
35814 .addReg(fallDstReg).addMBB(fallMBB);
35815
35816 MI.eraseFromParent();
35817 return sinkMBB;
35818}
35819
35820MachineBasicBlock *
35821X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35822 MachineBasicBlock *MBB) const {
35823 // Emit va_arg instruction on X86-64.
35824
35825 // Operands to this pseudo-instruction:
35826 // 0 ) Output : destination address (reg)
35827 // 1-5) Input : va_list address (addr, i64mem)
35828 // 6 ) ArgSize : Size (in bytes) of vararg type
35829 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35830 // 8 ) Align : Alignment of type
35831 // 9 ) EFLAGS (implicit-def)
35832
35833 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35834 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35835
35836 Register DestReg = MI.getOperand(0).getReg();
35837 MachineOperand &Base = MI.getOperand(1);
35838 MachineOperand &Scale = MI.getOperand(2);
35839 MachineOperand &Index = MI.getOperand(3);
35840 MachineOperand &Disp = MI.getOperand(4);
35841 MachineOperand &Segment = MI.getOperand(5);
35842 unsigned ArgSize = MI.getOperand(6).getImm();
35843 unsigned ArgMode = MI.getOperand(7).getImm();
35844 Align Alignment = Align(MI.getOperand(8).getImm());
35845
35846 MachineFunction *MF = MBB->getParent();
35847
35848 // Memory Reference
35849 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35850
35851 MachineMemOperand *OldMMO = MI.memoperands().front();
35852
35853 // Clone the MMO into two separate MMOs for loading and storing
35854 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35855 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35856 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35857 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35858
35859 // Machine Information
35860 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35861 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35862  const TargetRegisterClass *AddrRegClass =
35863      getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35864 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35865 const MIMetadata MIMD(MI);
35866
35867 // struct va_list {
35868 // i32 gp_offset
35869 // i32 fp_offset
35870 // i64 overflow_area (address)
35871 // i64 reg_save_area (address)
35872 // }
35873 // sizeof(va_list) = 24
35874 // alignment(va_list) = 8
35875
35876 unsigned TotalNumIntRegs = 6;
35877 unsigned TotalNumXMMRegs = 8;
35878 bool UseGPOffset = (ArgMode == 1);
35879 bool UseFPOffset = (ArgMode == 2);
35880 unsigned MaxOffset = TotalNumIntRegs * 8 +
35881 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
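  // SysV x86-64 reg_save_area layout: 6 GP registers * 8 bytes = 48, followed by
  // 8 XMM registers * 16 bytes = 128, so MaxOffset is 48 when using gp_offset
  // and 176 when using fp_offset.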
35882
35883 /* Align ArgSize to a multiple of 8 */
35884 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35885 bool NeedsAlign = (Alignment > 8);
35886
35887 MachineBasicBlock *thisMBB = MBB;
35888 MachineBasicBlock *overflowMBB;
35889 MachineBasicBlock *offsetMBB;
35890 MachineBasicBlock *endMBB;
35891
35892 Register OffsetDestReg; // Argument address computed by offsetMBB
35893 Register OverflowDestReg; // Argument address computed by overflowMBB
35894 Register OffsetReg;
35895
35896 if (!UseGPOffset && !UseFPOffset) {
35897 // If we only pull from the overflow region, we don't create a branch.
35898 // We don't need to alter control flow.
35899 OffsetDestReg = Register(); // unused
35900 OverflowDestReg = DestReg;
35901
35902 offsetMBB = nullptr;
35903 overflowMBB = thisMBB;
35904 endMBB = thisMBB;
35905 } else {
35906 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35907 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35908 // If not, pull from overflow_area. (branch to overflowMBB)
35909 //
35910 // thisMBB
35911 // | .
35912 // | .
35913 // offsetMBB overflowMBB
35914 // | .
35915 // | .
35916 // endMBB
35917
35918 // Registers for the PHI in endMBB
35919 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35920 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35921
35922 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35923 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35925 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35926
35927 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35928
35929 // Insert the new basic blocks
35930 MF->insert(MBBIter, offsetMBB);
35931 MF->insert(MBBIter, overflowMBB);
35932 MF->insert(MBBIter, endMBB);
35933
35934 // Transfer the remainder of MBB and its successor edges to endMBB.
35935 endMBB->splice(endMBB->begin(), thisMBB,
35936 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35937 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35938
35939 // Make offsetMBB and overflowMBB successors of thisMBB
35940 thisMBB->addSuccessor(offsetMBB);
35941 thisMBB->addSuccessor(overflowMBB);
35942
35943 // endMBB is a successor of both offsetMBB and overflowMBB
35944 offsetMBB->addSuccessor(endMBB);
35945 overflowMBB->addSuccessor(endMBB);
35946
35947 // Load the offset value into a register
35948 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35949 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35950 .add(Base)
35951 .add(Scale)
35952 .add(Index)
35953 .addDisp(Disp, UseFPOffset ? 4 : 0)
35954 .add(Segment)
35955 .setMemRefs(LoadOnlyMMO);
35956
35957 // Check if there is enough room left to pull this argument.
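 // For example, for a GPR-class argument (UseGPOffset, MaxOffset = 48,
 // ArgSizeA8 = 8) this compares gp_offset against 48, so offsets 0..40 still
 // fit in the reg_save_area while 48 and above take the overflow path.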
35958 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35959 .addReg(OffsetReg)
35960 .addImm(MaxOffset + 8 - ArgSizeA8);
35961
35962 // Branch to "overflowMBB" if offset >= max
35963 // Fall through to "offsetMBB" otherwise
35964 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35965 .addMBB(overflowMBB).addImm(X86::COND_AE);
35966 }
35967
35968 // In offsetMBB, emit code to use the reg_save_area.
35969 if (offsetMBB) {
35970 assert(OffsetReg != 0);
35971
35972 // Read the reg_save_area address.
35973 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35974 BuildMI(
35975 offsetMBB, MIMD,
35976 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35977 RegSaveReg)
35978 .add(Base)
35979 .add(Scale)
35980 .add(Index)
35981 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35982 .add(Segment)
35983 .setMemRefs(LoadOnlyMMO);
35984
35985 if (Subtarget.isTarget64BitLP64()) {
35986 // Zero-extend the offset
35987 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35988 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35989 .addImm(0)
35990 .addReg(OffsetReg)
35991 .addImm(X86::sub_32bit);
35992
35993 // Add the offset to the reg_save_area to get the final address.
35994 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35995 .addReg(OffsetReg64)
35996 .addReg(RegSaveReg);
35997 } else {
35998 // Add the offset to the reg_save_area to get the final address.
35999 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36000 .addReg(OffsetReg)
36001 .addReg(RegSaveReg);
36002 }
36003
36004 // Compute the offset for the next argument
36005 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36006 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36007 .addReg(OffsetReg)
36008 .addImm(UseFPOffset ? 16 : 8);
36009
36010 // Store it back into the va_list.
36011 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36012 .add(Base)
36013 .add(Scale)
36014 .add(Index)
36015 .addDisp(Disp, UseFPOffset ? 4 : 0)
36016 .add(Segment)
36017 .addReg(NextOffsetReg)
36018 .setMemRefs(StoreOnlyMMO);
36019
36020 // Jump to endMBB
36021 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36022 .addMBB(endMBB);
36023 }
36024
36025 //
36026 // Emit code to use overflow area
36027 //
36028
36029 // Load the overflow_area address into a register.
36030 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36031 BuildMI(overflowMBB, MIMD,
36032 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36033 OverflowAddrReg)
36034 .add(Base)
36035 .add(Scale)
36036 .add(Index)
36037 .addDisp(Disp, 8)
36038 .add(Segment)
36039 .setMemRefs(LoadOnlyMMO);
36040
36041 // If we need to align it, do so. Otherwise, just copy the address
36042 // to OverflowDestReg.
36043 if (NeedsAlign) {
36044 // Align the overflow address
36045 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36046
36047 // aligned_addr = (addr + (align-1)) & ~(align-1)
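 // For example, for a 16-byte-aligned type with addr == 0x1008:
 // (0x1008 + 15) & ~15 == 0x1010.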
36048 BuildMI(
36049 overflowMBB, MIMD,
36050 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36051 TmpReg)
36052 .addReg(OverflowAddrReg)
36053 .addImm(Alignment.value() - 1);
36054
36055 BuildMI(
36056 overflowMBB, MIMD,
36057 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36058 OverflowDestReg)
36059 .addReg(TmpReg)
36060 .addImm(~(uint64_t)(Alignment.value() - 1));
36061 } else {
36062 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36063 .addReg(OverflowAddrReg);
36064 }
36065
36066 // Compute the next overflow address after this argument.
36067 // (the overflow address should be kept 8-byte aligned)
36068 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36069 BuildMI(
36070 overflowMBB, MIMD,
36071 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36072 NextAddrReg)
36073 .addReg(OverflowDestReg)
36074 .addImm(ArgSizeA8);
36075
36076 // Store the new overflow address.
36077 BuildMI(overflowMBB, MIMD,
36078 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36079 .add(Base)
36080 .add(Scale)
36081 .add(Index)
36082 .addDisp(Disp, 8)
36083 .add(Segment)
36084 .addReg(NextAddrReg)
36085 .setMemRefs(StoreOnlyMMO);
36086
36087 // If we branched, emit the PHI to the front of endMBB.
36088 if (offsetMBB) {
36089 BuildMI(*endMBB, endMBB->begin(), MIMD,
36090 TII->get(X86::PHI), DestReg)
36091 .addReg(OffsetDestReg).addMBB(offsetMBB)
36092 .addReg(OverflowDestReg).addMBB(overflowMBB);
36093 }
36094
36095 // Erase the pseudo instruction
36096 MI.eraseFromParent();
36097
36098 return endMBB;
36099}
36100
36101// The EFLAGS operand of SelectItr might be missing a kill marker
36102// because there were multiple uses of EFLAGS, and ISel didn't know
36103// which to mark. Figure out whether SelectItr should have had a
36104// kill marker, and set it if it should. Returns the correct kill
36105// marker value.
36106static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36107 MachineBasicBlock *BB,
36108 const TargetRegisterInfo* TRI) {
36109 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36110 return false;
36111
36112 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36113 // out. SelectMI should have a kill flag on EFLAGS.
36114 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36115 return true;
36116}
36117
36118// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36119// together with other CMOV pseudo-opcodes into a single basic-block with
36120// conditional jump around it.
36121static bool isCMOVPseudo(MachineInstr &MI) {
36122 switch (MI.getOpcode()) {
36123 case X86::CMOV_FR16:
36124 case X86::CMOV_FR16X:
36125 case X86::CMOV_FR32:
36126 case X86::CMOV_FR32X:
36127 case X86::CMOV_FR64:
36128 case X86::CMOV_FR64X:
36129 case X86::CMOV_GR8:
36130 case X86::CMOV_GR16:
36131 case X86::CMOV_GR32:
36132 case X86::CMOV_RFP32:
36133 case X86::CMOV_RFP64:
36134 case X86::CMOV_RFP80:
36135 case X86::CMOV_VR64:
36136 case X86::CMOV_VR128:
36137 case X86::CMOV_VR128X:
36138 case X86::CMOV_VR256:
36139 case X86::CMOV_VR256X:
36140 case X86::CMOV_VR512:
36141 case X86::CMOV_VK1:
36142 case X86::CMOV_VK2:
36143 case X86::CMOV_VK4:
36144 case X86::CMOV_VK8:
36145 case X86::CMOV_VK16:
36146 case X86::CMOV_VK32:
36147 case X86::CMOV_VK64:
36148 return true;
36149
36150 default:
36151 return false;
36152 }
36153}
36154
36155// Helper function, which inserts PHI functions into SinkMBB:
36156// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36157// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36158// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36159// the last PHI function inserted.
36160static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36161 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36162 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36163 MachineBasicBlock *SinkMBB) {
36164 MachineFunction *MF = TrueMBB->getParent();
36165 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36166 const MIMetadata MIMD(*MIItBegin);
36167
36168 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36169 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36170
36171 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36172
36173 // As we are creating the PHIs, we have to be careful if there is more than
36174 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36175 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36176 // That also means that PHI construction must work forward from earlier to
36177 // later, and that the code must maintain a mapping from earlier PHI's
36178 // destination registers, and the registers that went into the PHI.
36179 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36180 MachineInstrBuilder MIB;
36181
36182 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36183 Register DestReg = MIIt->getOperand(0).getReg();
36184 Register Op1Reg = MIIt->getOperand(1).getReg();
36185 Register Op2Reg = MIIt->getOperand(2).getReg();
36186
36187 // If this CMOV we are generating is the opposite condition from
36188 // the jump we generated, then we have to swap the operands for the
36189 // PHI that is going to be generated.
36190 if (MIIt->getOperand(3).getImm() == OppCC)
36191 std::swap(Op1Reg, Op2Reg);
36192
36193 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36194 Op1Reg = It->second.first;
36195
36196 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36197 Op2Reg = It->second.second;
36198
36199 MIB =
36200 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36201 .addReg(Op1Reg)
36202 .addMBB(FalseMBB)
36203 .addReg(Op2Reg)
36204 .addMBB(TrueMBB);
36205
36206 // Add this PHI to the rewrite table.
36207 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36208 }
36209
36210 return MIB;
36211}
36212
36213 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36214MachineBasicBlock *
36215X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36216 MachineInstr &SecondCascadedCMOV,
36217 MachineBasicBlock *ThisMBB) const {
36218 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36219 const MIMetadata MIMD(FirstCMOV);
36220
36221 // We lower cascaded CMOVs such as
36222 //
36223 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36224 //
36225 // to two successive branches.
36226 //
36227 // Without this, we would add a PHI between the two jumps, which ends up
36228 // creating a few copies all around. For instance, for
36229 //
36230 // (sitofp (zext (fcmp une)))
36231 //
36232 // we would generate:
36233 //
36234 // ucomiss %xmm1, %xmm0
36235 // movss <1.0f>, %xmm0
36236 // movaps %xmm0, %xmm1
36237 // jne .LBB5_2
36238 // xorps %xmm1, %xmm1
36239 // .LBB5_2:
36240 // jp .LBB5_4
36241 // movaps %xmm1, %xmm0
36242 // .LBB5_4:
36243 // retq
36244 //
36245 // because this custom-inserter would have generated:
36246 //
36247 // A
36248 // | \
36249 // | B
36250 // | /
36251 // C
36252 // | \
36253 // | D
36254 // | /
36255 // E
36256 //
36257 // A: X = ...; Y = ...
36258 // B: empty
36259 // C: Z = PHI [X, A], [Y, B]
36260 // D: empty
36261 // E: PHI [X, C], [Z, D]
36262 //
36263 // If we lower both CMOVs in a single step, we can instead generate:
36264 //
36265 // A
36266 // | \
36267 // | C
36268 // | /|
36269 // |/ |
36270 // | |
36271 // | D
36272 // | /
36273 // E
36274 //
36275 // A: X = ...; Y = ...
36276 // D: empty
36277 // E: PHI [X, A], [X, C], [Y, D]
36278 //
36279 // Which, in our sitofp/fcmp example, gives us something like:
36280 //
36281 // ucomiss %xmm1, %xmm0
36282 // movss <1.0f>, %xmm0
36283 // jne .LBB5_4
36284 // jp .LBB5_4
36285 // xorps %xmm0, %xmm0
36286 // .LBB5_4:
36287 // retq
36288 //
36289
36290 // We lower cascaded CMOV into two successive branches to the same block.
36291 // EFLAGS is used by both, so mark it as live in the second.
36292 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36293 MachineFunction *F = ThisMBB->getParent();
36294 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36296 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36297
36298 MachineFunction::iterator It = ++ThisMBB->getIterator();
36299 F->insert(It, FirstInsertedMBB);
36300 F->insert(It, SecondInsertedMBB);
36301 F->insert(It, SinkMBB);
36302
36303 // For a cascaded CMOV, we lower it to two successive branches to
36304 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36305 // the FirstInsertedMBB.
36306 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36307
36308 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36309 // live into the sink and copy blocks.
36310 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36311 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36312 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36313 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36314 SinkMBB->addLiveIn(X86::EFLAGS);
36315 }
36316
36317 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36318 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36319 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36320 ThisMBB->end());
36321 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36322
36323 // Fallthrough block for ThisMBB.
36324 ThisMBB->addSuccessor(FirstInsertedMBB);
36325 // The true block target of the first branch is always SinkMBB.
36326 ThisMBB->addSuccessor(SinkMBB);
36327 // Fallthrough block for FirstInsertedMBB.
36328 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36329 // The true block for the branch of FirstInsertedMBB.
36330 FirstInsertedMBB->addSuccessor(SinkMBB);
36331 // This is fallthrough.
36332 SecondInsertedMBB->addSuccessor(SinkMBB);
36333
36334 // Create the conditional branch instructions.
36335 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36336 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36337
36338 X86::CondCode SecondCC =
36339 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36340 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36341 .addMBB(SinkMBB)
36342 .addImm(SecondCC);
36343
36344 // SinkMBB:
36345 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36346 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36347 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36348 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36349 MachineInstrBuilder MIB =
36350 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36351 .addReg(Op1Reg)
36352 .addMBB(SecondInsertedMBB)
36353 .addReg(Op2Reg)
36354 .addMBB(ThisMBB);
36355
36356 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36357 // (the True operand of the SELECT_CC/CMOV nodes).
36358 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36359
36360 // Now remove the CMOVs.
36361 FirstCMOV.eraseFromParent();
36362 SecondCascadedCMOV.eraseFromParent();
36363
36364 return SinkMBB;
36365}
36366
36367MachineBasicBlock *
36368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36369 MachineBasicBlock *ThisMBB) const {
36370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36371 const MIMetadata MIMD(MI);
36372
36373 // To "insert" a SELECT_CC instruction, we actually have to insert the
36374 // diamond control-flow pattern. The incoming instruction knows the
36375 // destination vreg to set, the condition code register to branch on, the
36376 // true/false values to select between and a branch opcode to use.
36377
36378 // ThisMBB:
36379 // ...
36380 // TrueVal = ...
36381 // cmpTY ccX, r1, r2
36382 // bCC copy1MBB
36383 // fallthrough --> FalseMBB
36384
36385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36386 // as described above, by inserting a BB, and then making a PHI at the join
36387 // point to select the true and false operands of the CMOV in the PHI.
36388 //
36389 // The code also handles two different cases of multiple CMOV opcodes
36390 // in a row.
36391 //
36392 // Case 1:
36393 // In this case, there are multiple CMOVs in a row, all of which are based on
36394 // the same condition setting (or the exact opposite condition setting).
36395 // In this case we can lower all the CMOVs using a single inserted BB, and
36396 // then make a number of PHIs at the join point to model the CMOVs. The only
36397 // trickiness here is that in a case like:
36398 //
36399 // t2 = CMOV cond1 t1, f1
36400 // t3 = CMOV cond1 t2, f2
36401 //
36402 // when rewriting this into PHIs, we have to perform some renaming on the
36403 // temps since you cannot have a PHI operand refer to a PHI result earlier
36404 // in the same block. The "simple" but wrong lowering would be:
36405 //
36406 // t2 = PHI t1(BB1), f1(BB2)
36407 // t3 = PHI t2(BB1), f2(BB2)
36408 //
36409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36410 // renaming is to note that on the path through BB1, t2 is really just a
36411 // copy of t1, and do that renaming, properly generating:
36412 //
36413 // t2 = PHI t1(BB1), f1(BB2)
36414 // t3 = PHI t1(BB1), f2(BB2)
36415 //
36416 // Case 2:
36417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36418 // function - EmitLoweredCascadedSelect.
36419
36420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36422 MachineInstr *LastCMOV = &MI;
36423 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36424
36425 // Check for case 1, where there are multiple CMOVs with the same condition
36426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36427 // number of jumps the most.
36428
36429 if (isCMOVPseudo(MI)) {
36430 // See if we have a string of CMOVS with the same condition. Skip over
36431 // intervening debug insts.
36432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36433 (NextMIIt->getOperand(3).getImm() == CC ||
36434 NextMIIt->getOperand(3).getImm() == OppCC)) {
36435 LastCMOV = &*NextMIIt;
36436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36437 }
36438 }
36439
36440 // This checks for case 2, but only if we didn't already find case 1, as
36441 // indicated by LastCMOV == &MI.
36442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36443 NextMIIt->getOpcode() == MI.getOpcode() &&
36444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36446 NextMIIt->getOperand(1).isKill()) {
36447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36448 }
36449
36450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36451 MachineFunction *F = ThisMBB->getParent();
36452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36454
36455 MachineFunction::iterator It = ++ThisMBB->getIterator();
36456 F->insert(It, FalseMBB);
36457 F->insert(It, SinkMBB);
36458
36459 // Set the call frame size on entry to the new basic blocks.
36460 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36461 FalseMBB->setCallFrameSize(CallFrameSize);
36462 SinkMBB->setCallFrameSize(CallFrameSize);
36463
36464 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36465 // live into the sink and copy blocks.
36466 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36467 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36468 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36469 FalseMBB->addLiveIn(X86::EFLAGS);
36470 SinkMBB->addLiveIn(X86::EFLAGS);
36471 }
36472
36473 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36474 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36475 MachineBasicBlock::iterator(LastCMOV));
36476 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36477 if (MI.isDebugInstr())
36478 SinkMBB->push_back(MI.removeFromParent());
36479
36480 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36481 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36482 std::next(MachineBasicBlock::iterator(LastCMOV)),
36483 ThisMBB->end());
36484 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36485
36486 // Fallthrough block for ThisMBB.
36487 ThisMBB->addSuccessor(FalseMBB);
36488 // The true block target of the first (or only) branch is always SinkMBB.
36489 ThisMBB->addSuccessor(SinkMBB);
36490 // Fallthrough block for FalseMBB.
36491 FalseMBB->addSuccessor(SinkMBB);
36492
36493 // Create the conditional branch instruction.
36494 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36495
36496 // SinkMBB:
36497 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36498 // ...
36499 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36500 MachineBasicBlock::iterator MIItEnd =
36501 std::next(MachineBasicBlock::iterator(LastCMOV));
36502 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36503
36504 // Now remove the CMOV(s).
36505 ThisMBB->erase(MIItBegin, MIItEnd);
36506
36507 return SinkMBB;
36508}
36509
36510static unsigned getSUBriOpcode(bool IsLP64) {
36511 if (IsLP64)
36512 return X86::SUB64ri32;
36513 else
36514 return X86::SUB32ri;
36515}
36516
36517MachineBasicBlock *
36518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36519 MachineBasicBlock *MBB) const {
36520 MachineFunction *MF = MBB->getParent();
36521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36523 const MIMetadata MIMD(MI);
36524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36525
36526 const unsigned ProbeSize = getStackProbeSize(*MF);
36527
36528 MachineRegisterInfo &MRI = MF->getRegInfo();
36529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36532
36533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36534 MF->insert(MBBIter, testMBB);
36535 MF->insert(MBBIter, blockMBB);
36536 MF->insert(MBBIter, tailMBB);
36537
36538 Register sizeVReg = MI.getOperand(1).getReg();
36539
36540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36541
36542 Register TmpStackPtr = MRI.createVirtualRegister(
36543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36544 Register FinalStackPtr = MRI.createVirtualRegister(
36545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36546
36547 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36548 .addReg(physSPReg);
36549 {
36550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36551 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36552 .addReg(TmpStackPtr)
36553 .addReg(sizeVReg);
36554 }
36555
36556 // test rsp size
36557
36558 BuildMI(testMBB, MIMD,
36559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36560 .addReg(FinalStackPtr)
36561 .addReg(physSPReg);
36562
36563 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36564 .addMBB(tailMBB)
36565 .addImm(X86::COND_GE);
36566 testMBB->addSuccessor(blockMBB);
36567 testMBB->addSuccessor(tailMBB);
36568
36569 // Touch the block then extend it. This is done on the opposite side of
36570 // static probe where we allocate then touch, to avoid the need of probing the
36571 // tail of the static alloca. Possible scenarios are:
36572 //
36573 // + ---- <- ------------ <- ------------- <- ------------ +
36574 // | |
36575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36576 // | |
36577 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36578 //
36579 // The property we want to enforce is to never have more than [page alloc] between two probes.
36580
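 // For example, with ProbeSize == 4096 and a 10000-byte dynamic allocation,
 // blockMBB touches the current top of stack and drops physSPReg one page at
 // a time (three trips through the test/block loop here) until it passes
 // FinalStackPtr; tailMBB then returns FinalStackPtr as the result.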
36581 const unsigned XORMIOpc =
36582 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36583 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36584 .addImm(0);
36585
36586 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36587 physSPReg)
36588 .addReg(physSPReg)
36589 .addImm(ProbeSize);
36590
36591 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36592 blockMBB->addSuccessor(testMBB);
36593
36594 // Replace original instruction by the expected stack ptr
36595 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36596 MI.getOperand(0).getReg())
36597 .addReg(FinalStackPtr);
36598
36599 tailMBB->splice(tailMBB->end(), MBB,
36600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36602 MBB->addSuccessor(testMBB);
36603
36604 // Delete the original pseudo instruction.
36605 MI.eraseFromParent();
36606
36607 // And we're done.
36608 return tailMBB;
36609}
36610
36611MachineBasicBlock *
36612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36613 MachineBasicBlock *BB) const {
36614 MachineFunction *MF = BB->getParent();
36615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36616 const MIMetadata MIMD(MI);
36617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36618
36619 assert(MF->shouldSplitStack());
36620
36621 const bool Is64Bit = Subtarget.is64Bit();
36622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36623
36624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
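 // For reference, the stacklet limit is read from thread-local storage at the
 // offsets above (%fs:0x70 on LP64, %fs:0x40 on x32, %gs:0x30 on 32-bit),
 // the slot the split-stack (__morestack) runtime is expected to maintain.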
36626
36627 // BB:
36628 // ... [Till the alloca]
36629 // If stacklet is not large enough, jump to mallocMBB
36630 //
36631 // bumpMBB:
36632 // Allocate by subtracting from RSP
36633 // Jump to continueMBB
36634 //
36635 // mallocMBB:
36636 // Allocate by call to runtime
36637 //
36638 // continueMBB:
36639 // ...
36640 // [rest of original BB]
36641 //
36642
36643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36646
36647 MachineRegisterInfo &MRI = MF->getRegInfo();
36648 const TargetRegisterClass *AddrRegClass =
36649 getRegClassFor(getPointerTy(MF->getDataLayout()));
36650
36651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36655 sizeVReg = MI.getOperand(1).getReg(),
36656 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36657
36658 MachineFunction::iterator MBBIter = ++BB->getIterator();
36659
36660 MF->insert(MBBIter, bumpMBB);
36661 MF->insert(MBBIter, mallocMBB);
36662 MF->insert(MBBIter, continueMBB);
36663
36664 continueMBB->splice(continueMBB->begin(), BB,
36665 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36666 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36667
36668 // Add code to the main basic block to check if the stack limit has been hit,
36669 // and if so, jump to mallocMBB otherwise to bumpMBB.
36670 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36671 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36672 .addReg(tmpSPVReg).addReg(sizeVReg);
36673 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36674 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36675 .addReg(SPLimitVReg);
36676 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36677
36678 // bumpMBB simply decreases the stack pointer, since we know the current
36679 // stacklet has enough space.
36680 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36681 .addReg(SPLimitVReg);
36682 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36683 .addReg(SPLimitVReg);
36684 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36685
36686 // Calls into a routine in libgcc to allocate more space from the heap.
36687 const uint32_t *RegMask =
36688 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36689 if (IsLP64) {
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36691 .addReg(sizeVReg);
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36693 .addExternalSymbol("__morestack_allocate_stack_space")
36694 .addRegMask(RegMask)
36695 .addReg(X86::RDI, RegState::Implicit)
36696 .addReg(X86::RAX, RegState::ImplicitDefine);
36697 } else if (Is64Bit) {
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36699 .addReg(sizeVReg);
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36701 .addExternalSymbol("__morestack_allocate_stack_space")
36702 .addRegMask(RegMask)
36703 .addReg(X86::EDI, RegState::Implicit)
36704 .addReg(X86::EAX, RegState::ImplicitDefine);
36705 } else {
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36707 .addImm(12);
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36709 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36710 .addExternalSymbol("__morestack_allocate_stack_space")
36711 .addRegMask(RegMask)
36712 .addReg(X86::EAX, RegState::ImplicitDefine);
36713 }
36714
36715 if (!Is64Bit)
36716 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36717 .addImm(16);
36718
36719 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36720 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36721 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36722
36723 // Set up the CFG correctly.
36724 BB->addSuccessor(bumpMBB);
36725 BB->addSuccessor(mallocMBB);
36726 mallocMBB->addSuccessor(continueMBB);
36727 bumpMBB->addSuccessor(continueMBB);
36728
36729 // Take care of the PHI nodes.
36730 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36731 MI.getOperand(0).getReg())
36732 .addReg(mallocPtrVReg)
36733 .addMBB(mallocMBB)
36734 .addReg(bumpSPPtrVReg)
36735 .addMBB(bumpMBB);
36736
36737 // Delete the original pseudo instruction.
36738 MI.eraseFromParent();
36739
36740 // And we're done.
36741 return continueMBB;
36742}
36743
36744MachineBasicBlock *
36745X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36746 MachineBasicBlock *BB) const {
36747 MachineFunction *MF = BB->getParent();
36748 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36749 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36750 const MIMetadata MIMD(MI);
36751
36754 "SEH does not use catchret!");
36755
36756 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36757 if (!Subtarget.is32Bit())
36758 return BB;
36759
36760 // C++ EH creates a new target block to hold the restore code, and wires up
36761 // the new block to the return destination with a normal JMP_4.
36762 MachineBasicBlock *RestoreMBB =
36763 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36764 assert(BB->succ_size() == 1);
36765 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36766 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36767 BB->addSuccessor(RestoreMBB);
36768 MI.getOperand(0).setMBB(RestoreMBB);
36769
36770 // Marking this as an EH pad but not a funclet entry block causes PEI to
36771 // restore stack pointers in the block.
36772 RestoreMBB->setIsEHPad(true);
36773
36774 auto RestoreMBBI = RestoreMBB->begin();
36775 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36776 return BB;
36777}
36778
36779MachineBasicBlock *
36780X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36781 MachineBasicBlock *BB) const {
36782 // This is pretty easy. We're taking the value that we received from
36783 // our load from the relocation, sticking it in either RDI (x86-64)
36784 // or EAX and doing an indirect call. The return value will then
36785 // be in the normal return register.
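 // For the 64-bit case below, the emitted sequence is roughly the usual
 // Darwin TLV access pattern (illustrative; the 32-bit paths only differ in
 // how the address is materialized):
 //   movq  _var@TLVP(%rip), %rdi
 //   callq *(%rdi)        ## pointer to the TLS variable is returned in %rax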
36786 MachineFunction *F = BB->getParent();
36787 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36788 const MIMetadata MIMD(MI);
36789
36790 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36791 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36792
36793 // Get a register mask for the lowered call.
36794 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36795 // proper register mask.
36796 const uint32_t *RegMask =
36797 Subtarget.is64Bit() ?
36798 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36799 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36800 if (Subtarget.is64Bit()) {
36801 MachineInstrBuilder MIB =
36802 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36803 .addReg(X86::RIP)
36804 .addImm(0)
36805 .addReg(0)
36806 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36807 MI.getOperand(3).getTargetFlags())
36808 .addReg(0);
36809 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36810 addDirectMem(MIB, X86::RDI);
36811 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36812 } else if (!isPositionIndependent()) {
36813 MachineInstrBuilder MIB =
36814 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36815 .addReg(0)
36816 .addImm(0)
36817 .addReg(0)
36818 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36819 MI.getOperand(3).getTargetFlags())
36820 .addReg(0);
36821 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36822 addDirectMem(MIB, X86::EAX);
36823 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36824 } else {
36825 MachineInstrBuilder MIB =
36826 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36827 .addReg(TII->getGlobalBaseReg(F))
36828 .addImm(0)
36829 .addReg(0)
36830 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36831 MI.getOperand(3).getTargetFlags())
36832 .addReg(0);
36833 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36834 addDirectMem(MIB, X86::EAX);
36835 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36836 }
36837
36838 MI.eraseFromParent(); // The pseudo instruction is gone now.
36839 return BB;
36840}
36841
36842static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36843 switch (RPOpc) {
36844 case X86::INDIRECT_THUNK_CALL32:
36845 return X86::CALLpcrel32;
36846 case X86::INDIRECT_THUNK_CALL64:
36847 return X86::CALL64pcrel32;
36848 case X86::INDIRECT_THUNK_TCRETURN32:
36849 return X86::TCRETURNdi;
36850 case X86::INDIRECT_THUNK_TCRETURN64:
36851 return X86::TCRETURNdi64;
36852 }
36853 llvm_unreachable("not indirect thunk opcode");
36854}
36855
36856static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36857 Register Reg) {
36858 if (Subtarget.useRetpolineExternalThunk()) {
36859 // When using an external thunk for retpolines, we pick names that match the
36860 // names GCC happens to use as well. This helps simplify the implementation
36861 // of the thunks for kernels where they have no easy ability to create
36862 // aliases and are doing non-trivial configuration of the thunk's body. For
36863 // example, the Linux kernel will do boot-time hot patching of the thunk
36864 // bodies and cannot easily export aliases of these to loaded modules.
36865 //
36866 // Note that at any point in the future, we may need to change the semantics
36867 // of how we implement retpolines and at that time will likely change the
36868 // name of the called thunk. Essentially, there is no hard guarantee that
36869 // LLVM will generate calls to specific thunks; we merely make a best-effort
36870 // attempt to help out kernels and other systems where duplicating the
36871 // thunks is costly.
36872 switch (Reg.id()) {
36873 case X86::EAX:
36874 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36875 return "__x86_indirect_thunk_eax";
36876 case X86::ECX:
36877 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36878 return "__x86_indirect_thunk_ecx";
36879 case X86::EDX:
36880 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36881 return "__x86_indirect_thunk_edx";
36882 case X86::EDI:
36883 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36884 return "__x86_indirect_thunk_edi";
36885 case X86::R11:
36886 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36887 return "__x86_indirect_thunk_r11";
36888 }
36889 llvm_unreachable("unexpected reg for external indirect thunk");
36890 }
36891
36892 if (Subtarget.useRetpolineIndirectCalls() ||
36893 Subtarget.useRetpolineIndirectBranches()) {
36894 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36895 switch (Reg.id()) {
36896 case X86::EAX:
36897 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36898 return "__llvm_retpoline_eax";
36899 case X86::ECX:
36900 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36901 return "__llvm_retpoline_ecx";
36902 case X86::EDX:
36903 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36904 return "__llvm_retpoline_edx";
36905 case X86::EDI:
36906 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36907 return "__llvm_retpoline_edi";
36908 case X86::R11:
36909 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36910 return "__llvm_retpoline_r11";
36911 }
36912 llvm_unreachable("unexpected reg for retpoline");
36913 }
36914
36915 if (Subtarget.useLVIControlFlowIntegrity()) {
36916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36917 return "__llvm_lvi_thunk_r11";
36918 }
36919 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36920}
36921
36922MachineBasicBlock *
36923X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36924 MachineBasicBlock *BB) const {
36925 // Copy the virtual register into the R11 physical register and
36926 // call the retpoline thunk.
36927 const MIMetadata MIMD(MI);
36928 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36929 Register CalleeVReg = MI.getOperand(0).getReg();
36930 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36931
36932 // Find an available scratch register to hold the callee. On 64-bit, we can
36933 // just use R11, but we scan for uses anyway to ensure we don't generate
36934 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36935 // already a register use operand to the call to hold the callee. If none
36936 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36937 // register and ESI is the base pointer to realigned stack frames with VLAs.
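 // For example, a 32-bit indirect call that already uses EAX and ECX as
 // register operands will pick EDX here, and only falls back to EDI when all
 // three are taken.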
36938 SmallVector<Register, 3> AvailableRegs;
36939 if (Subtarget.is64Bit())
36940 AvailableRegs.push_back(X86::R11);
36941 else
36942 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36943
36944 // Zero out any registers that are already used.
36945 for (const auto &MO : MI.operands()) {
36946 if (MO.isReg() && MO.isUse())
36947 llvm::replace(AvailableRegs, MO.getReg(), Register());
36948 }
36949
36950 // Choose the first remaining non-zero available register.
36951 Register AvailableReg;
36952 for (Register MaybeReg : AvailableRegs) {
36953 if (MaybeReg) {
36954 AvailableReg = MaybeReg;
36955 break;
36956 }
36957 }
36958 if (!AvailableReg)
36959 report_fatal_error("calling convention incompatible with retpoline, no "
36960 "available registers");
36961
36962 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36963
36964 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36965 .addReg(CalleeVReg);
36966 MI.getOperand(0).ChangeToES(Symbol);
36967 MI.setDesc(TII->get(Opc));
36968 MachineInstrBuilder(*BB->getParent(), &MI)
36969 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36970 return BB;
36971}
36972
36973/// SetJmp implies future control flow change upon calling the corresponding
36974/// LongJmp.
36975/// Instead of using the 'return' instruction, the long jump fixes the stack and
36976/// performs an indirect branch. To do so it uses the registers that were stored
36977/// in the jump buffer (when calling SetJmp).
36978/// If the shadow stack is enabled, we need to fix it as well, because some
36979/// return addresses will be skipped.
36980/// The function will save the SSP for future fixing in the function
36981/// emitLongJmpShadowStackFix.
36982/// \sa emitLongJmpShadowStackFix
36983/// \param [in] MI The temporary Machine Instruction for the builtin.
36984/// \param [in] MBB The Machine Basic Block that will be modified.
36985void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36986 MachineBasicBlock *MBB) const {
36987 const MIMetadata MIMD(MI);
36988 MachineFunction *MF = MBB->getParent();
36989 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36990 MachineRegisterInfo &MRI = MF->getRegInfo();
36991 MachineInstrBuilder MIB;
36992
36993 // Memory Reference.
36994 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36995
36996 // Initialize a register with zero.
36997 MVT PVT = getPointerTy(MF->getDataLayout());
36998 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36999 Register ZReg = MRI.createVirtualRegister(PtrRC);
37000 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37001 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37002 .addDef(ZReg)
37003 .addReg(ZReg, RegState::Undef)
37004 .addReg(ZReg, RegState::Undef);
37005
37006 // Read the current SSP Register value to the zeroed register.
37007 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37008 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37009 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37010
37011 // Write the SSP register value to offset 3 in input memory buffer.
37012 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37013 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37014 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37015 const unsigned MemOpndSlot = 1;
37016 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37017 if (i == X86::AddrDisp)
37018 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37019 else
37020 MIB.add(MI.getOperand(MemOpndSlot + i));
37021 }
37022 MIB.addReg(SSPCopyReg);
37023 MIB.setMemRefs(MMOs);
37024}
37025
37026MachineBasicBlock *
37027X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37028 MachineBasicBlock *MBB) const {
37029 const MIMetadata MIMD(MI);
37030 MachineFunction *MF = MBB->getParent();
37031 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37032 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37033 MachineRegisterInfo &MRI = MF->getRegInfo();
37034
37035 const BasicBlock *BB = MBB->getBasicBlock();
37036 MachineFunction::iterator I = ++MBB->getIterator();
37037
37038 // Memory Reference
37039 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37040
37041 unsigned MemOpndSlot = 0;
37042
37043 unsigned CurOp = 0;
37044
37045 Register DstReg = MI.getOperand(CurOp++).getReg();
37046 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37047 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37048 (void)TRI;
37049 Register mainDstReg = MRI.createVirtualRegister(RC);
37050 Register restoreDstReg = MRI.createVirtualRegister(RC);
37051
37052 MemOpndSlot = CurOp;
37053
37054 MVT PVT = getPointerTy(MF->getDataLayout());
37055 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37056 "Invalid Pointer Size!");
37057
37058 // For v = setjmp(buf), we generate
37059 //
37060 // thisMBB:
37061 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37062 // SjLjSetup restoreMBB
37063 //
37064 // mainMBB:
37065 // v_main = 0
37066 //
37067 // sinkMBB:
37068 // v = phi(main, restore)
37069 //
37070 // restoreMBB:
37071 // if base pointer being used, load it from frame
37072 // v_restore = 1
37073
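 // That is, as with an ordinary setjmp, the direct path through mainMBB
 // produces 0 and the path resumed by a longjmp through restoreMBB produces
 // 1; the PHI in sinkMBB selects between the two.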
37074 MachineBasicBlock *thisMBB = MBB;
37075 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37076 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37077 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37078 MF->insert(I, mainMBB);
37079 MF->insert(I, sinkMBB);
37080 MF->push_back(restoreMBB);
37081 restoreMBB->setMachineBlockAddressTaken();
37082
37083 MachineInstrBuilder MIB;
37084
37085 // Transfer the remainder of BB and its successor edges to sinkMBB.
37086 sinkMBB->splice(sinkMBB->begin(), MBB,
37087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37089
37090 // thisMBB:
37091 unsigned PtrStoreOpc = 0;
37092 Register LabelReg;
37093 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37094 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37095 !isPositionIndependent();
37096
37097 // Prepare IP either in reg or imm.
37098 if (!UseImmLabel) {
37099 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37100 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37101 LabelReg = MRI.createVirtualRegister(PtrRC);
37102 if (Subtarget.is64Bit()) {
37103 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37104 .addReg(X86::RIP)
37105 .addImm(0)
37106 .addReg(0)
37107 .addMBB(restoreMBB)
37108 .addReg(0);
37109 } else {
37110 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37111 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37112 .addReg(XII->getGlobalBaseReg(MF))
37113 .addImm(0)
37114 .addReg(0)
37115 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37116 .addReg(0);
37117 }
37118 } else
37119 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37120 // Store IP
37121 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37123 if (i == X86::AddrDisp)
37124 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37125 else
37126 MIB.add(MI.getOperand(MemOpndSlot + i));
37127 }
37128 if (!UseImmLabel)
37129 MIB.addReg(LabelReg);
37130 else
37131 MIB.addMBB(restoreMBB);
37132 MIB.setMemRefs(MMOs);
37133
37134 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37135 emitSetJmpShadowStackFix(MI, thisMBB);
37136 }
37137
37138 // Setup
37139 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37140 .addMBB(restoreMBB);
37141
37142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37143 MIB.addRegMask(RegInfo->getNoPreservedMask());
37144 thisMBB->addSuccessor(mainMBB);
37145 thisMBB->addSuccessor(restoreMBB);
37146
37147 // mainMBB:
37148 // EAX = 0
37149 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37150 mainMBB->addSuccessor(sinkMBB);
37151
37152 // sinkMBB:
37153 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37154 .addReg(mainDstReg)
37155 .addMBB(mainMBB)
37156 .addReg(restoreDstReg)
37157 .addMBB(restoreMBB);
37158
37159 // restoreMBB:
37160 if (RegInfo->hasBasePointer(*MF)) {
37161 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37162 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37163 X86FI->setRestoreBasePointer(MF);
37164 Register FramePtr = RegInfo->getFrameRegister(*MF);
37165 Register BasePtr = RegInfo->getBaseRegister();
37166 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37167 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37168 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37169 .setMIFlag(MachineInstr::FrameSetup);
37170 }
37171 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37172 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37173 restoreMBB->addSuccessor(sinkMBB);
37174
37175 MI.eraseFromParent();
37176 return sinkMBB;
37177}
37178
37179/// Fix the shadow stack using the previously saved SSP pointer.
37180/// \sa emitSetJmpShadowStackFix
37181/// \param [in] MI The temporary Machine Instruction for the builtin.
37182/// \param [in] MBB The Machine Basic Block that will be modified.
37183/// \return The sink MBB that will perform the future indirect branch.
37184MachineBasicBlock *
37185X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37186 MachineBasicBlock *MBB) const {
37187 const MIMetadata MIMD(MI);
37188 MachineFunction *MF = MBB->getParent();
37189 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37190 MachineRegisterInfo &MRI = MF->getRegInfo();
37191
37192 // Memory Reference
37193 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37194
37195 MVT PVT = getPointerTy(MF->getDataLayout());
37196 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37197
37198 // checkSspMBB:
37199 // xor vreg1, vreg1
37200 // rdssp vreg1
37201 // test vreg1, vreg1
37202 // je sinkMBB # Jump if Shadow Stack is not supported
37203 // fallMBB:
37204 // mov buf+24/12(%rip), vreg2
37205 // sub vreg1, vreg2
37206 // jbe sinkMBB # No need to fix the Shadow Stack
37207 // fixShadowMBB:
37208 // shr 3/2, vreg2
37209 // incssp vreg2 # fix the SSP according to the lower 8 bits
37210 // shr 8, vreg2
37211 // je sinkMBB
37212 // fixShadowLoopPrepareMBB:
37213 // shl vreg2
37214 // mov 128, vreg3
37215 // fixShadowLoopMBB:
37216 // incssp vreg3
37217 // dec vreg2
37218 // jne fixShadowLoopMBB # Iterate until you finish fixing
37219 // # the Shadow Stack
37220 // sinkMBB:
37221
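 // Worked example of the arithmetic below: if the saved SSP is 0x9080 bytes
 // above the current SSP, the delta is 0x1210 eight-byte slots; the first
 // incssp consumes the low 0x10 slots, and the loop then performs 0x12 * 2
 // iterations of "incssp 128" to cover the remaining 0x1200 slots.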
37222 MachineFunction::iterator I = ++MBB->getIterator();
37223 const BasicBlock *BB = MBB->getBasicBlock();
37224
37225 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37229 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37230 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37231 MF->insert(I, checkSspMBB);
37232 MF->insert(I, fallMBB);
37233 MF->insert(I, fixShadowMBB);
37234 MF->insert(I, fixShadowLoopPrepareMBB);
37235 MF->insert(I, fixShadowLoopMBB);
37236 MF->insert(I, sinkMBB);
37237
37238 // Transfer the remainder of BB and its successor edges to sinkMBB.
37239 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37240 MBB->end());
37241 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37242
37243 MBB->addSuccessor(checkSspMBB);
37244
37245 // Initialize a register with zero.
37246 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37247 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37248
37249 if (PVT == MVT::i64) {
37250 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37251 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37252 .addImm(0)
37253 .addReg(ZReg)
37254 .addImm(X86::sub_32bit);
37255 ZReg = TmpZReg;
37256 }
37257
37258 // Read the current SSP Register value to the zeroed register.
37259 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37260 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37261 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37262
37263 // Check whether the result of the SSP register is zero and jump directly
37264 // to the sink.
37265 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37266 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37267 .addReg(SSPCopyReg)
37268 .addReg(SSPCopyReg);
37269 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37270 .addMBB(sinkMBB)
37271 .addImm(X86::COND_E);
37272 checkSspMBB->addSuccessor(sinkMBB);
37273 checkSspMBB->addSuccessor(fallMBB);
37274
37275 // Reload the previously saved SSP register value.
37276 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37277 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37278 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37279 MachineInstrBuilder MIB =
37280 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37281 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37282 const MachineOperand &MO = MI.getOperand(i);
37283 if (i == X86::AddrDisp)
37284 MIB.addDisp(MO, SPPOffset);
37285 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37286 // preserve kill flags.
37287 MIB.addReg(MO.getReg());
37288 else
37289 MIB.add(MO);
37290 }
37291 MIB.setMemRefs(MMOs);
37292
37293 // Subtract the current SSP from the previous SSP.
37294 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37295 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37296 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37297 .addReg(PrevSSPReg)
37298 .addReg(SSPCopyReg);
37299
37300 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37301 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37302 .addMBB(sinkMBB)
37303 .addImm(X86::COND_BE);
37304 fallMBB->addSuccessor(sinkMBB);
37305 fallMBB->addSuccessor(fixShadowMBB);
37306
37307 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37308 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37309 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37310 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37311 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37312 .addReg(SspSubReg)
37313 .addImm(Offset);
37314
37315 // Increase the SSP, consuming only the lower 8 bits of the delta.
37316 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37317 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37318
37319 // Reset the lower 8 bits.
37320 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37321 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37322 .addReg(SspFirstShrReg)
37323 .addImm(8);
37324
37325 // Jump if the result of the shift is zero.
37326 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37327 .addMBB(sinkMBB)
37328 .addImm(X86::COND_E);
37329 fixShadowMBB->addSuccessor(sinkMBB);
37330 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37331
37332 // Do a single shift left.
37333 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37334 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37335 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37336 .addReg(SspSecondShrReg)
37337 .addImm(1);
37338
37339 // Save the value 128 to a register (will be used next with incssp).
37340 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37341 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37342 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37343 .addImm(128);
37344 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37345
37346 // Since incssp only looks at the lower 8 bits, we might need to do several
37347 // iterations of incssp until we finish fixing the shadow stack.
37348 Register DecReg = MRI.createVirtualRegister(PtrRC);
37349 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37350 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37351 .addReg(SspAfterShlReg)
37352 .addMBB(fixShadowLoopPrepareMBB)
37353 .addReg(DecReg)
37354 .addMBB(fixShadowLoopMBB);
37355
37356 // Every iteration we increase the SSP by 128.
37357 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37358
37359 // Every iteration we decrement the counter by 1.
37360 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37361 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37362
37363 // Jump if the counter is not zero yet.
37364 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37365 .addMBB(fixShadowLoopMBB)
37366 .addImm(X86::COND_NE);
37367 fixShadowLoopMBB->addSuccessor(sinkMBB);
37368 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37369
37370 return sinkMBB;
37371}
37372
37373MachineBasicBlock *
37374X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37375 MachineBasicBlock *MBB) const {
37376 const MIMetadata MIMD(MI);
37377 MachineFunction *MF = MBB->getParent();
37378 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37379 MachineRegisterInfo &MRI = MF->getRegInfo();
37380
37381 // Memory Reference
37382 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37383
37384 MVT PVT = getPointerTy(MF->getDataLayout());
37385 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37386 "Invalid Pointer Size!");
37387
37388 const TargetRegisterClass *RC =
37389 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37390 Register Tmp = MRI.createVirtualRegister(RC);
37391 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37393 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37394 Register SP = RegInfo->getStackRegister();
37395
37396 MachineInstrBuilder MIB;
37397
37398 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37399 const int64_t SPOffset = 2 * PVT.getStoreSize();
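 // Slot layout of the setjmp buffer as consumed here: [0] frame pointer,
 // [1] resume address (LabelOffset), [2] stack pointer (SPOffset), and, when
 // shadow stacks are in use, [3] the saved SSP.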
37400
37401 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37402 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37403
37404 MachineBasicBlock *thisMBB = MBB;
37405
37406 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37407 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37408 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37409 }
37410
37411 // Reload FP
37412 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37414 const MachineOperand &MO = MI.getOperand(i);
37415 if (MO.isReg()) // Don't add the whole operand, we don't want to
37416 // preserve kill flags.
37417 MIB.addReg(MO.getReg());
37418 else
37419 MIB.add(MO);
37420 }
37421 MIB.setMemRefs(MMOs);
37423
37424 // Reload IP
37425 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37426 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37427 const MachineOperand &MO = MI.getOperand(i);
37428 if (i == X86::AddrDisp)
37429 MIB.addDisp(MO, LabelOffset);
37430 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37431 // preserve kill flags.
37432 MIB.addReg(MO.getReg());
37433 else
37434 MIB.add(MO);
37435 }
37436 MIB.setMemRefs(MMOs);
37437
37438 // Reload SP
37439 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37440 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37441 if (i == X86::AddrDisp)
37442 MIB.addDisp(MI.getOperand(i), SPOffset);
37443 else
37444 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37445 // the last instruction of the expansion.
37446 }
37447 MIB.setMemRefs(MMOs);
37449
37450 // Jump
37451 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37452
37453 MI.eraseFromParent();
37454 return thisMBB;
37455}
37456
37457void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37458 MachineBasicBlock *MBB,
37459 MachineBasicBlock *DispatchBB,
37460 int FI) const {
37461 const MIMetadata MIMD(MI);
37462 MachineFunction *MF = MBB->getParent();
37463 MachineRegisterInfo *MRI = &MF->getRegInfo();
37464 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37465
37466 MVT PVT = getPointerTy(MF->getDataLayout());
37467 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37468
37469 unsigned Op = 0;
37470 Register VR;
37471
37472 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37473 !isPositionIndependent();
37474
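// If the dispatch block's address can be encoded as an immediate, store it
// into the function context directly; otherwise materialize it into a
// register with LEA first and store the register.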
37475 if (UseImmLabel) {
37476 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37477 } else {
37478 const TargetRegisterClass *TRC =
37479 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37480 VR = MRI->createVirtualRegister(TRC);
37481 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37482
37483 if (Subtarget.is64Bit())
37484 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37485 .addReg(X86::RIP)
37486 .addImm(1)
37487 .addReg(0)
37488 .addMBB(DispatchBB)
37489 .addReg(0);
37490 else
37491 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37492 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37493 .addImm(1)
37494 .addReg(0)
37495 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37496 .addReg(0);
37497 }
37498
37499 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37500 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37501 if (UseImmLabel)
37502 MIB.addMBB(DispatchBB);
37503 else
37504 MIB.addReg(VR);
37505}
37506
37507 MachineBasicBlock *
37508 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37509 MachineBasicBlock *BB) const {
37510 const MIMetadata MIMD(MI);
37511 MachineFunction *MF = BB->getParent();
37512 MachineRegisterInfo *MRI = &MF->getRegInfo();
37513 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37514 int FI = MF->getFrameInfo().getFunctionContextIndex();
37515
37516 // Get a mapping of the call site numbers to all of the landing pads they're
37517 // associated with.
37518 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37519 unsigned MaxCSNum = 0;
37520 for (auto &MBB : *MF) {
37521 if (!MBB.isEHPad())
37522 continue;
37523
37524 MCSymbol *Sym = nullptr;
37525 for (const auto &MI : MBB) {
37526 if (MI.isDebugInstr())
37527 continue;
37528
37529 assert(MI.isEHLabel() && "expected EH_LABEL");
37530 Sym = MI.getOperand(0).getMCSymbol();
37531 break;
37532 }
37533
37534 if (!MF->hasCallSiteLandingPad(Sym))
37535 continue;
37536
37537 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37538 CallSiteNumToLPad[CSI].push_back(&MBB);
37539 MaxCSNum = std::max(MaxCSNum, CSI);
37540 }
37541 }
37542
37543 // Get an ordered list of the machine basic blocks for the jump table.
37544 std::vector<MachineBasicBlock *> LPadList;
37545 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37546 LPadList.reserve(CallSiteNumToLPad.size());
37547
37548 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37549 for (auto &LP : CallSiteNumToLPad[CSI]) {
37550 LPadList.push_back(LP);
37551 InvokeBBs.insert_range(LP->predecessors());
37552 }
37553 }
37554
37555 assert(!LPadList.empty() &&
37556 "No landing pad destinations for the dispatch jump table!");
37557
37558 // Create the MBBs for the dispatch code.
37559
37560 // Shove the dispatch's address into the return slot in the function context.
37561 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37562 DispatchBB->setIsEHPad(true);
37563
37564 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37565 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37566 DispatchBB->addSuccessor(TrapBB);
37567
37568 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37569 DispatchBB->addSuccessor(DispContBB);
37570
37571 // Insert MBBs.
37572 MF->push_back(DispatchBB);
37573 MF->push_back(DispContBB);
37574 MF->push_back(TrapBB);
37575
37576 // Insert code into the entry block that creates and registers the function
37577 // context.
37578 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37579
37580 // Create the jump table and associated information
37581 unsigned JTE = getJumpTableEncoding();
37582 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37583 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37584
37585 const X86RegisterInfo &RI = TII->getRegisterInfo();
37586 // Add a register mask with no preserved registers. This results in all
37587 // registers being marked as clobbered.
37588 if (RI.hasBasePointer(*MF)) {
37589 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37590 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37591 MFI->setRestoreBasePointer(MF);
37592
37593 Register FP = RI.getFrameRegister(*MF);
37594 Register BP = RI.getBaseRegister();
37595 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37596 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37597 MFI->getRestoreBasePointerOffset())
37598 .addRegMask(RI.getNoPreservedMask());
37599 } else {
37600 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37601 .addRegMask(RI.getNoPreservedMask());
37602 }
37603
37604 // IReg is used as an index in a memory operand and therefore can't be SP
37605 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37606 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37607 Subtarget.is64Bit() ? 8 : 4);
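// IReg now holds the call-site index that was stored in the function
// context; compare it against the number of landing pads and trap on
// out-of-range values.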
37608 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37609 .addReg(IReg)
37610 .addImm(LPadList.size());
37611 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37612 .addMBB(TrapBB)
37613 .addImm(X86::COND_AE);
37614
37615 if (Subtarget.is64Bit()) {
37616 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37617 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37618
37619 // leaq .LJTI0_0(%rip), BReg
37620 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37621 .addReg(X86::RIP)
37622 .addImm(1)
37623 .addReg(0)
37624 .addJumpTableIndex(MJTI)
37625 .addReg(0);
37626 // movzx IReg64, IReg
37627 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37628 .addImm(0)
37629 .addReg(IReg)
37630 .addImm(X86::sub_32bit);
37631
37632 switch (JTE) {
37633 case MachineJumpTableInfo::EK_BlockAddress:
37634 // jmpq *(BReg,IReg64,8)
37635 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37636 .addReg(BReg)
37637 .addImm(8)
37638 .addReg(IReg64)
37639 .addImm(0)
37640 .addReg(0);
37641 break;
37642 case MachineJumpTableInfo::EK_LabelDifference32: {
37643 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37644 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37645 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37646
37647 // movl (BReg,IReg64,4), OReg
37648 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37649 .addReg(BReg)
37650 .addImm(4)
37651 .addReg(IReg64)
37652 .addImm(0)
37653 .addReg(0);
37654 // movsx OReg64, OReg
37655 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37656 .addReg(OReg);
37657 // addq BReg, OReg64, TReg
37658 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37659 .addReg(OReg64)
37660 .addReg(BReg);
37661 // jmpq *TReg
37662 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37663 break;
37664 }
37665 default:
37666 llvm_unreachable("Unexpected jump table encoding");
37667 }
37668 } else {
37669 // jmpl *.LJTI0_0(,IReg,4)
37670 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37671 .addReg(0)
37672 .addImm(4)
37673 .addReg(IReg)
37674 .addJumpTableIndex(MJTI)
37675 .addReg(0);
37676 }
37677
37678 // Add the jump table entries as successors to the MBB.
37679 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37680 for (auto &LP : LPadList)
37681 if (SeenMBBs.insert(LP).second)
37682 DispContBB->addSuccessor(LP);
37683
37684 // N.B. the order the invoke BBs are processed in doesn't matter here.
37685 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37686 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37687 for (MachineBasicBlock *MBB : InvokeBBs) {
37688 // Remove the landing pad successor from the invoke block and replace it
37689 // with the new dispatch block.
37690 // Keep a copy of Successors since it's modified inside the loop.
37691 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37692 MBB->succ_rend());
37693 // FIXME: Avoid quadratic complexity.
37694 for (auto *MBBS : Successors) {
37695 if (MBBS->isEHPad()) {
37696 MBB->removeSuccessor(MBBS);
37697 MBBLPads.push_back(MBBS);
37698 }
37699 }
37700
37701 MBB->addSuccessor(DispatchBB);
37702
37703 // Find the invoke call and mark all of the callee-saved registers as
37704 // 'implicit defined' so that they're spilled. This prevents code from
37705 // moving instructions to before the EH block, where they will never be
37706 // executed.
37707 for (auto &II : reverse(*MBB)) {
37708 if (!II.isCall())
37709 continue;
37710
37711 DenseSet<Register> DefRegs;
37712 for (auto &MOp : II.operands())
37713 if (MOp.isReg())
37714 DefRegs.insert(MOp.getReg());
37715
37716 MachineInstrBuilder MIB(*MF, &II);
37717 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37718 Register Reg = SavedRegs[RegIdx];
37719 if (!DefRegs.contains(Reg))
37720 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37721 }
37722
37723 break;
37724 }
37725 }
37726
37727 // Mark all former landing pads as non-landing pads. The dispatch is the only
37728 // landing pad now.
37729 for (auto &LP : MBBLPads)
37730 LP->setIsEHPad(false);
37731
37732 // The instruction is gone now.
37733 MI.eraseFromParent();
37734 return BB;
37735}
37736
37737 MachineBasicBlock *
37738 X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37739 MachineBasicBlock *BB) const {
37740 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37741 // calls may require proper stack alignment.
37742 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37743 const MIMetadata MIMD(MI);
37744 MachineFunction &MF = *BB->getParent();
37745
37746 // Emit CALLSEQ_START right before the instruction.
37747 MF.getFrameInfo().setAdjustsStack(true);
37748 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37749 MachineInstrBuilder CallseqStart =
37750 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37751 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37752
37753 // Emit CALLSEQ_END right after the instruction.
37754 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37755 MachineInstrBuilder CallseqEnd =
37756 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37757 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37758
37759 return BB;
37760}
37761
37762 MachineBasicBlock *
37763 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37764 MachineBasicBlock *BB) const {
37765 MachineFunction *MF = BB->getParent();
37766 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37767 const MIMetadata MIMD(MI);
37768
37769 auto TMMImmToTMMReg = [](unsigned Imm) {
37770 assert (Imm < 8 && "Illegal tmm index");
37771 return X86::TMM0 + Imm;
37772 };
37773 auto TMMImmToTMMPair = [](unsigned Imm) {
37774 assert(Imm < 8 && "Illegal tmm pair index.");
37775 return X86::TMM0_TMM1 + Imm / 2;
37776 };
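// The AMX pseudo instructions below carry tile register numbers as
// immediates; the helpers above map such an immediate to the corresponding
// physical TMM register (or TMM register pair).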
37777 switch (MI.getOpcode()) {
37778 default:
37779 llvm_unreachable("Unexpected instr type to insert");
37780 case X86::INDIRECT_THUNK_CALL32:
37781 case X86::INDIRECT_THUNK_CALL64:
37782 case X86::INDIRECT_THUNK_TCRETURN32:
37783 case X86::INDIRECT_THUNK_TCRETURN64:
37784 return EmitLoweredIndirectThunk(MI, BB);
37785 case X86::CATCHRET:
37786 return EmitLoweredCatchRet(MI, BB);
37787 case X86::SEG_ALLOCA_32:
37788 case X86::SEG_ALLOCA_64:
37789 return EmitLoweredSegAlloca(MI, BB);
37790 case X86::PROBED_ALLOCA_32:
37791 case X86::PROBED_ALLOCA_64:
37792 return EmitLoweredProbedAlloca(MI, BB);
37793 case X86::TLSCall_32:
37794 case X86::TLSCall_64:
37795 return EmitLoweredTLSCall(MI, BB);
37796 case X86::CMOV_FR16:
37797 case X86::CMOV_FR16X:
37798 case X86::CMOV_FR32:
37799 case X86::CMOV_FR32X:
37800 case X86::CMOV_FR64:
37801 case X86::CMOV_FR64X:
37802 case X86::CMOV_GR8:
37803 case X86::CMOV_GR16:
37804 case X86::CMOV_GR32:
37805 case X86::CMOV_RFP32:
37806 case X86::CMOV_RFP64:
37807 case X86::CMOV_RFP80:
37808 case X86::CMOV_VR64:
37809 case X86::CMOV_VR128:
37810 case X86::CMOV_VR128X:
37811 case X86::CMOV_VR256:
37812 case X86::CMOV_VR256X:
37813 case X86::CMOV_VR512:
37814 case X86::CMOV_VK1:
37815 case X86::CMOV_VK2:
37816 case X86::CMOV_VK4:
37817 case X86::CMOV_VK8:
37818 case X86::CMOV_VK16:
37819 case X86::CMOV_VK32:
37820 case X86::CMOV_VK64:
37821 return EmitLoweredSelect(MI, BB);
37822
37823 case X86::FP80_ADDr:
37824 case X86::FP80_ADDm32: {
37825 // Change the floating point control register to use double extended
37826 // precision when performing the addition.
37827 int OrigCWFrameIdx =
37828 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37829 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37830 OrigCWFrameIdx);
37831
37832 // Load the old value of the control word...
37833 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37834 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37835 OrigCWFrameIdx);
37836
37837 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37838 // precision.
37839 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37840 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37841 .addReg(OldCW, RegState::Kill)
37842 .addImm(0x300);
37843
37844 // Extract to 16 bits.
37845 Register NewCW16 =
37846 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37847 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37848 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37849
37850 // Prepare memory for FLDCW.
37851 int NewCWFrameIdx =
37852 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37853 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37854 NewCWFrameIdx)
37855 .addReg(NewCW16, RegState::Kill);
37856
37857 // Reload the modified control word now...
37858 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37859 NewCWFrameIdx);
37860
37861 // Do the addition.
37862 if (MI.getOpcode() == X86::FP80_ADDr) {
37863 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37864 .add(MI.getOperand(0))
37865 .add(MI.getOperand(1))
37866 .add(MI.getOperand(2));
37867 } else {
37868 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37869 .add(MI.getOperand(0))
37870 .add(MI.getOperand(1))
37871 .add(MI.getOperand(2))
37872 .add(MI.getOperand(3))
37873 .add(MI.getOperand(4))
37874 .add(MI.getOperand(5))
37875 .add(MI.getOperand(6));
37876 }
37877
37878 // Reload the original control word now.
37879 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37880 OrigCWFrameIdx);
37881
37882 MI.eraseFromParent(); // The pseudo instruction is gone now.
37883 return BB;
37884 }
37885
37886 case X86::FP32_TO_INT16_IN_MEM:
37887 case X86::FP32_TO_INT32_IN_MEM:
37888 case X86::FP32_TO_INT64_IN_MEM:
37889 case X86::FP64_TO_INT16_IN_MEM:
37890 case X86::FP64_TO_INT32_IN_MEM:
37891 case X86::FP64_TO_INT64_IN_MEM:
37892 case X86::FP80_TO_INT16_IN_MEM:
37893 case X86::FP80_TO_INT32_IN_MEM:
37894 case X86::FP80_TO_INT64_IN_MEM: {
37895 // Change the floating point control register to use "round towards zero"
37896 // mode when truncating to an integer value.
37897 int OrigCWFrameIdx =
37898 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37899 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37900 OrigCWFrameIdx);
37901
37902 // Load the old value of the control word...
37903 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37904 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37905 OrigCWFrameIdx);
37906
37907 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37908 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37909 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37910 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37911
37912 // Extract to 16 bits.
37913 Register NewCW16 =
37914 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37915 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37916 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37917
37918 // Prepare memory for FLDCW.
37919 int NewCWFrameIdx =
37920 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37921 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37922 NewCWFrameIdx)
37923 .addReg(NewCW16, RegState::Kill);
37924
37925 // Reload the modified control word now...
37926 addFrameReference(BuildMI(*BB, MI, MIMD,
37927 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37928
37929 // Get the X86 opcode to use.
37930 unsigned Opc;
37931 switch (MI.getOpcode()) {
37932 // clang-format off
37933 default: llvm_unreachable("illegal opcode!");
37934 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37935 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37936 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37937 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37938 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37939 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37940 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37941 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37942 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37943 // clang-format on
37944 }
37945
37946 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37947 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37948 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37949
37950 // Reload the original control word now.
37951 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37952 OrigCWFrameIdx);
37953
37954 MI.eraseFromParent(); // The pseudo instruction is gone now.
37955 return BB;
37956 }
37957
37958 // xbegin
37959 case X86::XBEGIN:
37960 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37961
37962 case X86::VAARG_64:
37963 case X86::VAARG_X32:
37964 return EmitVAARGWithCustomInserter(MI, BB);
37965
37966 case X86::EH_SjLj_SetJmp32:
37967 case X86::EH_SjLj_SetJmp64:
37968 return emitEHSjLjSetJmp(MI, BB);
37969
37970 case X86::EH_SjLj_LongJmp32:
37971 case X86::EH_SjLj_LongJmp64:
37972 return emitEHSjLjLongJmp(MI, BB);
37973
37974 case X86::Int_eh_sjlj_setup_dispatch:
37975 return EmitSjLjDispatchBlock(MI, BB);
37976
37977 case TargetOpcode::STATEPOINT:
37978 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37979 // this point in the process. We diverge later.
37980 return emitPatchPoint(MI, BB);
37981
37982 case TargetOpcode::STACKMAP:
37983 case TargetOpcode::PATCHPOINT:
37984 return emitPatchPoint(MI, BB);
37985
37986 case TargetOpcode::PATCHABLE_EVENT_CALL:
37987 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37988 return emitPatchableEventCall(MI, BB);
37989
37990 case X86::LCMPXCHG8B: {
37991 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37992 // In addition to the four E[ABCD] registers implied by its encoding,
37993 // CMPXCHG8B requires a memory operand. If the current architecture is
37994 // i686 and the current function needs a base pointer
37995 // - which is ESI for i686 - the register allocator would not be able to
37996 // allocate registers for an address of the form X(%reg, %reg, Y):
37997 // there would never be enough unreserved registers during regalloc
37998 // (without the base pointer the only option would be X(%edi, %esi, Y)).
37999 // We give the register allocator a hand by precomputing the address in
38000 // a new vreg using LEA.
38001
38002 // If it is not i686 or there is no base pointer - nothing to do here.
38003 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38004 return BB;
38005
38006 // Even though this code does not necessarily need the base pointer to
38007 // be ESI, we check for that. The reason: if this assert fails, something
38008 // has changed in the compiler's base pointer handling, and that change
38009 // most probably has to be accounted for here as well.
38010 assert(TRI->getBaseRegister() == X86::ESI &&
38011 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38012 "base pointer in mind");
38013
38014 MachineRegisterInfo &MRI = MF->getRegInfo();
38015 MVT SPTy = getPointerTy(MF->getDataLayout());
38016 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38017 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38018
38019 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38020 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38021 // does not use an index register.
38022 if (AM.IndexReg == X86::NoRegister)
38023 return BB;
38024
38025 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38026 // four operand definitions that are E[ABCD] registers. We skip them and
38027 // then insert the LEA.
38028 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38029 while (RMBBI != BB->rend() &&
38030 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38032 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38033 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38034 ++RMBBI;
38035 }
38036 MachineBasicBlock::iterator MBBI(RMBBI);
38037 addFullAddress(
38038 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38039
38040 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38041
38042 return BB;
38043 }
38044 case X86::LCMPXCHG16B_NO_RBX: {
38045 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38046 Register BasePtr = TRI->getBaseRegister();
38047 if (TRI->hasBasePointer(*MF) &&
38048 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38049 if (!BB->isLiveIn(BasePtr))
38050 BB->addLiveIn(BasePtr);
38051 // Save RBX into a virtual register.
38052 Register SaveRBX =
38053 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38054 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38055 .addReg(X86::RBX);
38056 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38057 MachineInstrBuilder MIB =
38058 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38059 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38060 MIB.add(MI.getOperand(Idx));
38061 MIB.add(MI.getOperand(X86::AddrNumOperands));
38062 MIB.addReg(SaveRBX);
38063 } else {
38064 // Simple case, just copy the virtual register to RBX.
38065 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38066 .add(MI.getOperand(X86::AddrNumOperands));
38067 MachineInstrBuilder MIB =
38068 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38069 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38070 MIB.add(MI.getOperand(Idx));
38071 }
38072 MI.eraseFromParent();
38073 return BB;
38074 }
38075 case X86::MWAITX: {
38076 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38077 Register BasePtr = TRI->getBaseRegister();
38078 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38079 // If there is no need to save the base pointer, we generate MWAITXrrr;
38080 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38081 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38082 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38083 .addReg(MI.getOperand(0).getReg());
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38085 .addReg(MI.getOperand(1).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38087 .addReg(MI.getOperand(2).getReg());
38088 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38089 MI.eraseFromParent();
38090 } else {
38091 if (!BB->isLiveIn(BasePtr)) {
38092 BB->addLiveIn(BasePtr);
38093 }
38094 // Parameters can be copied into ECX and EAX but not EBX yet.
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38096 .addReg(MI.getOperand(0).getReg());
38097 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38098 .addReg(MI.getOperand(1).getReg());
38099 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38100 // Save RBX into a virtual register.
38101 Register SaveRBX =
38102 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38103 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38104 .addReg(X86::RBX);
38105 // Generate mwaitx pseudo.
38106 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38107 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38108 .addDef(Dst) // Destination tied in with SaveRBX.
38109 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38110 .addUse(SaveRBX); // Save of base pointer.
38111 MI.eraseFromParent();
38112 }
38113 return BB;
38114 }
38115 case TargetOpcode::PREALLOCATED_SETUP: {
38116 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38117 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38118 MFI->setHasPreallocatedCall(true);
38119 int64_t PreallocatedId = MI.getOperand(0).getImm();
38120 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38121 assert(StackAdjustment != 0 && "0 stack adjustment");
38122 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38123 << StackAdjustment << "\n");
38124 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38125 .addReg(X86::ESP)
38126 .addImm(StackAdjustment);
38127 MI.eraseFromParent();
38128 return BB;
38129 }
38130 case TargetOpcode::PREALLOCATED_ARG: {
38131 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38132 int64_t PreallocatedId = MI.getOperand(1).getImm();
38133 int64_t ArgIdx = MI.getOperand(2).getImm();
38134 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38135 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38136 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38137 << ", arg offset " << ArgOffset << "\n");
38138 // stack pointer + offset
38139 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38140 MI.getOperand(0).getReg()),
38141 X86::ESP, false, ArgOffset);
38142 MI.eraseFromParent();
38143 return BB;
38144 }
38145 case X86::PTDPBSSD:
38146 case X86::PTDPBSUD:
38147 case X86::PTDPBUSD:
38148 case X86::PTDPBUUD:
38149 case X86::PTDPBF16PS:
38150 case X86::PTDPFP16PS:
38151 case X86::PTCMMIMFP16PS:
38152 case X86::PTCMMRLFP16PS:
38153 case X86::PTDPBF8PS:
38154 case X86::PTDPBHF8PS:
38155 case X86::PTDPHBF8PS:
38156 case X86::PTDPHF8PS:
38157 case X86::PTTDPBF16PS:
38158 case X86::PTTDPFP16PS:
38159 case X86::PTTCMMIMFP16PS:
38160 case X86::PTTCMMRLFP16PS:
38161 case X86::PTCONJTCMMIMFP16PS:
38162 case X86::PTMMULTF32PS:
38163 case X86::PTTMMULTF32PS: {
38164 unsigned Opc;
38165 switch (MI.getOpcode()) {
38166 default: llvm_unreachable("illegal opcode!");
38167 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38168 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38169 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38170 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38171 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38172 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38173 case X86::PTCMMIMFP16PS:
38174 Opc = X86::TCMMIMFP16PS;
38175 break;
38176 case X86::PTCMMRLFP16PS:
38177 Opc = X86::TCMMRLFP16PS;
38178 break;
38179 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38180 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38181 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38182 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38183 case X86::PTTDPBF16PS:
38184 Opc = X86::TTDPBF16PS;
38185 break;
38186 case X86::PTTDPFP16PS:
38187 Opc = X86::TTDPFP16PS;
38188 break;
38189 case X86::PTTCMMIMFP16PS:
38190 Opc = X86::TTCMMIMFP16PS;
38191 break;
38192 case X86::PTTCMMRLFP16PS:
38193 Opc = X86::TTCMMRLFP16PS;
38194 break;
38195 case X86::PTCONJTCMMIMFP16PS:
38196 Opc = X86::TCONJTCMMIMFP16PS;
38197 break;
38198 case X86::PTMMULTF32PS:
38199 Opc = X86::TMMULTF32PS;
38200 break;
38201 case X86::PTTMMULTF32PS:
38202 Opc = X86::TTMMULTF32PS;
38203 break;
38204 }
38205
38206 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38209 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38210 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38211
38212 MI.eraseFromParent(); // The pseudo is gone now.
38213 return BB;
38214 }
38215 case X86::PTILEZERO: {
38216 unsigned Imm = MI.getOperand(0).getImm();
38217 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38218 MI.eraseFromParent(); // The pseudo is gone now.
38219 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38220 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38221 return BB;
38222 }
38223 case X86::PTILEZEROV: {
38224 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38225 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38226 return BB;
38227 }
38228 case X86::PTILELOADDRS:
38229 case X86::PTILELOADDRST1:
38230 case X86::PTILELOADD:
38231 case X86::PTILELOADDT1:
38232 case X86::PTILESTORED: {
38233 unsigned Opc;
38234 switch (MI.getOpcode()) {
38235 default: llvm_unreachable("illegal opcode!");
38236#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38237 case X86::PTILELOADD:
38238 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38239 break;
38240 case X86::PTILELOADDT1:
38241 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38242 break;
38243 case X86::PTILESTORED:
38244 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38245 break;
38246 case X86::PTILELOADDRS:
38247 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38248 break;
38249 case X86::PTILELOADDRST1:
38250 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38251 break;
38252 }
38253#undef GET_EGPR_IF_ENABLED
38254
38255 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38256 unsigned CurOp = 0;
38257 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38258 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38259 RegState::Define);
38260
38261 MIB.add(MI.getOperand(CurOp++)); // base
38262 MIB.add(MI.getOperand(CurOp++)); // scale
38263 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38264 MIB.add(MI.getOperand(CurOp++)); // displacement
38265 MIB.add(MI.getOperand(CurOp++)); // segment
38266
38267 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38268 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38269 RegState::Undef);
38270
38271 MI.eraseFromParent(); // The pseudo is gone now.
38272 return BB;
38273 }
38274 case X86::PT2RPNTLVWZ0:
38275 case X86::PT2RPNTLVWZ0T1:
38276 case X86::PT2RPNTLVWZ1:
38277 case X86::PT2RPNTLVWZ1T1:
38278 case X86::PT2RPNTLVWZ0RS:
38279 case X86::PT2RPNTLVWZ0RST1:
38280 case X86::PT2RPNTLVWZ1RS:
38281 case X86::PT2RPNTLVWZ1RST1: {
38282 const DebugLoc &DL = MI.getDebugLoc();
38283 unsigned Opc;
38284#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38285 switch (MI.getOpcode()) {
38286 default:
38287 llvm_unreachable("Unexpected instruction!");
38288 case X86::PT2RPNTLVWZ0:
38289 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38290 break;
38291 case X86::PT2RPNTLVWZ0T1:
38292 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38293 break;
38294 case X86::PT2RPNTLVWZ1:
38295 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38296 break;
38297 case X86::PT2RPNTLVWZ1T1:
38298 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38299 break;
38300 case X86::PT2RPNTLVWZ0RS:
38301 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38302 break;
38303 case X86::PT2RPNTLVWZ0RST1:
38304 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38305 break;
38306 case X86::PT2RPNTLVWZ1RS:
38307 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38308 break;
38309 case X86::PT2RPNTLVWZ1RST1:
38310 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38311 break;
38312 }
38313#undef GET_EGPR_IF_ENABLED
38314 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38315 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38316
38317 MIB.add(MI.getOperand(1)); // base
38318 MIB.add(MI.getOperand(2)); // scale
38319 MIB.add(MI.getOperand(3)); // index
38320 MIB.add(MI.getOperand(4)); // displacement
38321 MIB.add(MI.getOperand(5)); // segment
38322 MI.eraseFromParent(); // The pseudo is gone now.
38323 return BB;
38324 }
38325 case X86::PTTRANSPOSED:
38326 case X86::PTCONJTFP16: {
38327 const DebugLoc &DL = MI.getDebugLoc();
38328 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38329 : X86::TCONJTFP16;
38330
38331 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38332 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38333 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38334
38335 MI.eraseFromParent(); // The pseudo is gone now.
38336 return BB;
38337 }
38338 case X86::PTCVTROWPS2BF16Hrri:
38339 case X86::PTCVTROWPS2BF16Lrri:
38340 case X86::PTCVTROWPS2PHHrri:
38341 case X86::PTCVTROWPS2PHLrri:
38342 case X86::PTCVTROWD2PSrri:
38343 case X86::PTILEMOVROWrri: {
38344 const DebugLoc &DL = MI.getDebugLoc();
38345 unsigned Opc;
38346 switch (MI.getOpcode()) {
38347 default:
38348 llvm_unreachable("Unexpected instruction!");
38349 case X86::PTCVTROWD2PSrri:
38350 Opc = X86::TCVTROWD2PSrri;
38351 break;
38352 case X86::PTCVTROWPS2BF16Hrri:
38353 Opc = X86::TCVTROWPS2BF16Hrri;
38354 break;
38355 case X86::PTCVTROWPS2PHHrri:
38356 Opc = X86::TCVTROWPS2PHHrri;
38357 break;
38358 case X86::PTCVTROWPS2BF16Lrri:
38359 Opc = X86::TCVTROWPS2BF16Lrri;
38360 break;
38361 case X86::PTCVTROWPS2PHLrri:
38362 Opc = X86::TCVTROWPS2PHLrri;
38363 break;
38364 case X86::PTILEMOVROWrri:
38365 Opc = X86::TILEMOVROWrri;
38366 break;
38367 }
38368 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38369 MIB.add(MI.getOperand(0));
38370 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38371 MIB.addImm(MI.getOperand(2).getImm());
38372
38373 MI.eraseFromParent(); // The pseudo is gone now.
38374 return BB;
38375 }
38376 case X86::PTCVTROWPS2BF16Hrre:
38377 case X86::PTCVTROWPS2BF16Lrre:
38378 case X86::PTCVTROWPS2PHHrre:
38379 case X86::PTCVTROWPS2PHLrre:
38380 case X86::PTCVTROWD2PSrre:
38381 case X86::PTILEMOVROWrre: {
38382 const DebugLoc &DL = MI.getDebugLoc();
38383 unsigned Opc;
38384 switch (MI.getOpcode()) {
38385 default:
38386 llvm_unreachable("Unexpected instruction!");
38387 case X86::PTCVTROWD2PSrre:
38388 Opc = X86::TCVTROWD2PSrre;
38389 break;
38390 case X86::PTCVTROWPS2BF16Hrre:
38391 Opc = X86::TCVTROWPS2BF16Hrre;
38392 break;
38393 case X86::PTCVTROWPS2BF16Lrre:
38394 Opc = X86::TCVTROWPS2BF16Lrre;
38395 break;
38396 case X86::PTCVTROWPS2PHHrre:
38397 Opc = X86::TCVTROWPS2PHHrre;
38398 break;
38399 case X86::PTCVTROWPS2PHLrre:
38400 Opc = X86::TCVTROWPS2PHLrre;
38401 break;
38402 case X86::PTILEMOVROWrre:
38403 Opc = X86::TILEMOVROWrre;
38404 break;
38405 }
38406 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38407 MIB.add(MI.getOperand(0));
38408 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38409 MIB.add(MI.getOperand(2));
38410
38411 MI.eraseFromParent(); // The pseudo is gone now.
38412 return BB;
38413 }
38414 }
38415}
38416
38417//===----------------------------------------------------------------------===//
38418// X86 Optimization Hooks
38419//===----------------------------------------------------------------------===//
38420
38421bool
38422 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38423 const APInt &DemandedBits,
38424 const APInt &DemandedElts,
38425 TargetLoweringOpt &TLO) const {
38426 EVT VT = Op.getValueType();
38427 unsigned Opcode = Op.getOpcode();
38428 unsigned EltSize = VT.getScalarSizeInBits();
38429
38430 if (VT.isVector()) {
38431 // If the constant consists only of sign bits within the active bits, then
38432 // we should sign-extend it across the entire constant so that it can act as
38433 // a boolean constant vector.
38434 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38435 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38436 return false;
38437 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38438 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38439 continue;
38440 const APInt &Val = V.getConstantOperandAPInt(i);
38441 if (Val.getBitWidth() > Val.getNumSignBits() &&
38442 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38443 return true;
38444 }
38445 return false;
38446 };
38447 // For vectors - if we have a constant, then try to sign extend.
38448 // TODO: Handle AND cases.
38449 unsigned ActiveBits = DemandedBits.getActiveBits();
38450 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38451 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38452 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38453 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38454 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38455 VT.getVectorNumElements());
38456 SDValue NewC =
38457 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38458 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38459 SDValue NewOp =
38460 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38461 return TLO.CombineTo(Op, NewOp);
38462 }
38463 return false;
38464 }
38465
38466 // Only optimize Ands to prevent shrinking a constant that could be
38467 // matched by movzx.
38468 if (Opcode != ISD::AND)
38469 return false;
38470
38471 // Make sure the RHS really is a constant.
38472 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38473 if (!C)
38474 return false;
38475
38476 const APInt &Mask = C->getAPIntValue();
38477
38478 // Clear all non-demanded bits initially.
38479 APInt ShrunkMask = Mask & DemandedBits;
38480
38481 // Find the width of the shrunk mask.
38482 unsigned Width = ShrunkMask.getActiveBits();
38483
38484 // If the mask is all 0s there's nothing to do here.
38485 if (Width == 0)
38486 return false;
38487
38488 // Find the next power of 2 width, rounding up to a byte.
38489 Width = llvm::bit_ceil(std::max(Width, 8U));
38490 // Truncate the width to size to handle illegal types.
38491 Width = std::min(Width, EltSize);
38492
38493 // Calculate a possible zero extend mask for this constant.
38494 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
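// For example, an i32 AND mask of 0x1FF with only the low 8 bits demanded
// shrinks to Width == 8, so ZeroExtendMask == 0xFF, which keeps the AND
// matchable as a movzx.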
38495
38496 // If we aren't changing the mask, just return true to keep it and prevent
38497 // the caller from optimizing.
38498 if (ZeroExtendMask == Mask)
38499 return true;
38500
38501 // Make sure the new mask can be represented by a combination of mask bits
38502 // and non-demanded bits.
38503 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38504 return false;
38505
38506 // Replace the constant with the zero extend mask.
38507 SDLoc DL(Op);
38508 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38509 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38510 return TLO.CombineTo(Op, NewOp);
38511}
38512
38513 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38514 KnownBits &Known,
38515 const APInt &DemandedElts,
38516 const SelectionDAG &DAG, unsigned Depth) {
38517 KnownBits Known2;
38518 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38519 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38520 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38521 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38522 Known = KnownBits::abdu(Known, Known2).zext(16);
38523 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38526 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38527 Known = Known.zext(64);
38528}
38529
38530 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38531 KnownBits &Known,
38532 const APInt &DemandedElts,
38533 const SelectionDAG &DAG,
38534 unsigned Depth) {
38535 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38536
38537 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38538 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38539 APInt DemandedLoElts =
38540 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38541 APInt DemandedHiElts =
38542 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
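// The 0b01 and 0b10 splats select the even (lo) and odd (hi) i16 elements
// of each i32 result lane, respectively.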
38543 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38544 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38545 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38546 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38547 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38548 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38549 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38550}
38551
38552 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38553 KnownBits &Known,
38554 const APInt &DemandedElts,
38555 const SelectionDAG &DAG,
38556 unsigned Depth) {
38557 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38558
38559 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38560 // pairs.
38561 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38562 APInt DemandedLoElts =
38563 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38564 APInt DemandedHiElts =
38565 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38566 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38567 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38568 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38569 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
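// PMADDUBSW treats the first operand as unsigned i8 and the second as
// signed i8 (hence the zext/sext asymmetry below), and the pairwise sums
// saturate.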
38570 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38571 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38572 Known = KnownBits::sadd_sat(Lo, Hi);
38573}
38574
38575 static KnownBits computeKnownBitsForHorizontalOperation(
38576 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38577 const SelectionDAG &DAG,
38578 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38579 KnownBitsFunc) {
38580 APInt DemandedEltsLHS, DemandedEltsRHS;
38581 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38582 DemandedElts, DemandedEltsLHS,
38583 DemandedEltsRHS);
38584
38585 const auto ComputeForSingleOpFunc =
38586 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38587 return KnownBitsFunc(
38588 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38589 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38590 };
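// Each output element of a horizontal op combines a source element with its
// immediate neighbour, hence the second known-bits query with the demanded
// mask shifted by one.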
38591
38592 if (DemandedEltsRHS.isZero())
38593 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38594 if (DemandedEltsLHS.isZero())
38595 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38596
38597 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38598 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38599}
38600
38601 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38602 KnownBits &Known,
38603 const APInt &DemandedElts,
38604 const SelectionDAG &DAG,
38605 unsigned Depth) const {
38606 unsigned BitWidth = Known.getBitWidth();
38607 unsigned NumElts = DemandedElts.getBitWidth();
38608 unsigned Opc = Op.getOpcode();
38609 EVT VT = Op.getValueType();
38614 "Should use MaskedValueIsZero if you don't know whether Op"
38615 " is a target node!");
38616
38617 Known.resetAll();
38618 switch (Opc) {
38619 default: break;
38620 case X86ISD::MUL_IMM: {
38621 KnownBits Known2;
38622 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38623 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38624 Known = KnownBits::mul(Known, Known2);
38625 break;
38626 }
38627 case X86ISD::BSF: {
38629
38630 KnownBits Known2;
38631 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38632 if (Known2.isNonZero()) {
38633 // If we have a known 1, its position is our upper bound.
38634 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38635 unsigned LowBits = llvm::bit_width(PossibleTZ);
38636 Known.Zero.setBitsFrom(LowBits);
38637 } else if (!Op.getOperand(0).isUndef()) {
38638 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38639 Known = Known.intersectWith(Known2);
38640 }
38641 break;
38642 }
38643 case X86ISD::BSR: {
38644 // TODO: Bound with input known bits?
38646
38647 if (!Op.getOperand(0).isUndef() &&
38648 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38649 KnownBits Known2;
38650 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38651 Known = Known.intersectWith(Known2);
38652 }
38653 break;
38654 }
38655 case X86ISD::SETCC:
38656 Known.Zero.setBitsFrom(1);
38657 break;
38658 case X86ISD::MOVMSK: {
38659 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38660 Known.Zero.setBitsFrom(NumLoBits);
38661 break;
38662 }
38663 case X86ISD::PEXTRB:
38664 case X86ISD::PEXTRW: {
38665 SDValue Src = Op.getOperand(0);
38666 EVT SrcVT = Src.getValueType();
38667 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38668 Op.getConstantOperandVal(1));
38669 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38670 Known = Known.anyextOrTrunc(BitWidth);
38671 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38672 break;
38673 }
38674 case X86ISD::VSRAI:
38675 case X86ISD::VSHLI:
38676 case X86ISD::VSRLI: {
38677 unsigned ShAmt = Op.getConstantOperandVal(1);
38678 if (ShAmt >= VT.getScalarSizeInBits()) {
38679 // Out of range logical bit shifts are guaranteed to be zero.
38680 // Out of range arithmetic bit shifts splat the sign bit.
38681 if (Opc != X86ISD::VSRAI) {
38682 Known.setAllZero();
38683 break;
38684 }
38685
38686 ShAmt = VT.getScalarSizeInBits() - 1;
38687 }
38688
38689 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38690 if (Opc == X86ISD::VSHLI) {
38691 Known <<= ShAmt;
38692 // Low bits are known zero.
38693 Known.Zero.setLowBits(ShAmt);
38694 } else if (Opc == X86ISD::VSRLI) {
38695 Known >>= ShAmt;
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = Known.intersectWith(Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = Known.intersectWith(Known2);
38720 }
38721
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::PSHUFB: {
38728 SDValue Src = Op.getOperand(0);
38729 SDValue Idx = Op.getOperand(1);
38730
38731 // If the index vector is never negative (MSB is zero), then all elements
38732 // come from the source vector. This is useful for cases where
38733 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38734 // below will handle the more common constant shuffle mask case.
38735 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38736 if (KnownIdx.isNonNegative())
38737 Known = DAG.computeKnownBits(Src, Depth + 1);
38738 break;
38739 }
38740 case X86ISD::VBROADCAST: {
38741 SDValue Src = Op.getOperand(0);
38742 if (!Src.getSimpleValueType().isVector()) {
38743 Known = DAG.computeKnownBits(Src, Depth + 1);
38744 return;
38745 }
38746 break;
38747 }
38748 case X86ISD::AND: {
38749 if (Op.getResNo() == 0) {
38750 KnownBits Known2;
38751 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38752 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38753 Known &= Known2;
38754 }
38755 break;
38756 }
38757 case X86ISD::ANDNP: {
38758 KnownBits Known2;
38759 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38760 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38761
38762 // ANDNP = (~X & Y);
38763 Known.One &= Known2.Zero;
38764 Known.Zero |= Known2.One;
38765 break;
38766 }
38767 case X86ISD::FOR: {
38768 KnownBits Known2;
38769 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38770 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38771
38772 Known |= Known2;
38773 break;
38774 }
38775 case X86ISD::PSADBW: {
38776 SDValue LHS = Op.getOperand(0);
38777 SDValue RHS = Op.getOperand(1);
38778 assert(VT.getScalarType() == MVT::i64 &&
38779 LHS.getValueType() == RHS.getValueType() &&
38780 LHS.getValueType().getScalarType() == MVT::i8 &&
38781 "Unexpected PSADBW types");
38782 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38783 break;
38784 }
38785 case X86ISD::PCMPGT:
38786 case X86ISD::PCMPEQ: {
38787 KnownBits KnownLhs =
38788 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38789 KnownBits KnownRhs =
38790 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38792 ? KnownBits::eq(KnownLhs, KnownRhs)
38793 : KnownBits::sgt(KnownLhs, KnownRhs);
38794 if (Res) {
38795 if (*Res)
38796 Known.setAllOnes();
38797 else
38798 Known.setAllZero();
38799 }
38800 break;
38801 }
38802 case X86ISD::VPMADDWD: {
38803 SDValue LHS = Op.getOperand(0);
38804 SDValue RHS = Op.getOperand(1);
38805 assert(VT.getVectorElementType() == MVT::i32 &&
38806 LHS.getValueType() == RHS.getValueType() &&
38807 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38808 "Unexpected PMADDWD types");
38809 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38810 break;
38811 }
38812 case X86ISD::VPMADDUBSW: {
38813 SDValue LHS = Op.getOperand(0);
38814 SDValue RHS = Op.getOperand(1);
38815 assert(VT.getVectorElementType() == MVT::i16 &&
38816 LHS.getValueType() == RHS.getValueType() &&
38817 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38818 "Unexpected PMADDUBSW types");
38819 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38820 break;
38821 }
38822 case X86ISD::PMULUDQ: {
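// PMULUDQ multiplies only the low 32 bits of each 64-bit lane, so model the
// sources as 32-bit values zero-extended back to 64 bits before the multiply.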
38823 KnownBits Known2;
38824 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38825 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38826
38827 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38828 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38829 Known = KnownBits::mul(Known, Known2);
38830 break;
38831 }
38832 case X86ISD::CMOV: {
38833 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38834 // If we don't know any bits, early out.
38835 if (Known.isUnknown())
38836 break;
38837 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38838
38839 // Only known if known in both the LHS and RHS.
38840 Known = Known.intersectWith(Known2);
38841 break;
38842 }
38843 case X86ISD::BEXTR:
38844 case X86ISD::BEXTRI: {
38845 SDValue Op0 = Op.getOperand(0);
38846 SDValue Op1 = Op.getOperand(1);
38847
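// The BEXTR control operand encodes the start bit in bits [7:0] and the
// extraction length in bits [15:8].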
38848 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38849 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38850 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38851
38852 // If the length is 0, the result is 0.
38853 if (Length == 0) {
38854 Known.setAllZero();
38855 break;
38856 }
38857
38858 if ((Shift + Length) <= BitWidth) {
38859 Known = DAG.computeKnownBits(Op0, Depth + 1);
38860 Known = Known.extractBits(Length, Shift);
38861 Known = Known.zextOrTrunc(BitWidth);
38862 }
38863 }
38864 break;
38865 }
38866 case X86ISD::PDEP: {
38867 KnownBits Known2;
38868 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38869 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38870 // Zeros are retained from the mask operand. But not ones.
38871 Known.One.clearAllBits();
38872 // The result will have at least as many trailing zeros as the non-mask
38873 // operand since bits can only map to the same or higher bit position.
38874 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38875 break;
38876 }
38877 case X86ISD::PEXT: {
38878 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38879 // The result has as many leading zeros as the number of zeroes in the mask.
38880 unsigned Count = Known.Zero.popcount();
38881 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38882 Known.One.clearAllBits();
38883 break;
38884 }
38885 case X86ISD::VTRUNC:
38886 case X86ISD::VTRUNCS:
38887 case X86ISD::VTRUNCUS:
38888 case X86ISD::CVTSI2P:
38889 case X86ISD::CVTUI2P:
38890 case X86ISD::CVTP2SI:
38891 case X86ISD::CVTP2UI:
38892 case X86ISD::MCVTP2SI:
38893 case X86ISD::MCVTP2UI:
38894 case X86ISD::CVTTP2SI:
38895 case X86ISD::CVTTP2UI:
38896 case X86ISD::MCVTTP2SI:
38897 case X86ISD::MCVTTP2UI:
38898 case X86ISD::MCVTSI2P:
38899 case X86ISD::MCVTUI2P:
38900 case X86ISD::VFPROUND:
38901 case X86ISD::VMFPROUND:
38902 case X86ISD::CVTPS2PH:
38903 case X86ISD::MCVTPS2PH:
38904 case X86ISD::MCVTTP2SIS:
38905 case X86ISD::MCVTTP2UIS: {
38906 // Truncations/Conversions - upper elements are known zero.
38907 EVT SrcVT = Op.getOperand(0).getValueType();
38908 if (SrcVT.isVector()) {
38909 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38910 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38911 Known.setAllZero();
38912 }
38913 break;
38914 }
38915 case X86ISD::STRICT_CVTTP2SI:
38916 case X86ISD::STRICT_CVTTP2UI:
38917 case X86ISD::STRICT_CVTSI2P:
38918 case X86ISD::STRICT_CVTUI2P:
38919 case X86ISD::STRICT_VFPROUND:
38920 case X86ISD::STRICT_CVTPS2PH: {
38921 // Strict Conversions - upper elements are known zero.
38922 EVT SrcVT = Op.getOperand(1).getValueType();
38923 if (SrcVT.isVector()) {
38924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38925 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38926 Known.setAllZero();
38927 }
38928 break;
38929 }
38930 case X86ISD::MOVQ2DQ: {
38931 // Move from MMX to XMM. Upper half of XMM should be 0.
38932 if (DemandedElts.countr_zero() >= (NumElts / 2))
38933 Known.setAllZero();
38934 break;
38935 }
38937 APInt UndefElts;
38938 SmallVector<APInt, 16> EltBits;
38939 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38940 /*AllowWholeUndefs*/ false,
38941 /*AllowPartialUndefs*/ false)) {
38942 Known.Zero.setAllBits();
38943 Known.One.setAllBits();
38944 for (unsigned I = 0; I != NumElts; ++I) {
38945 if (!DemandedElts[I])
38946 continue;
38947 if (UndefElts[I]) {
38948 Known.resetAll();
38949 break;
38950 }
38951 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38952 Known = Known.intersectWith(Known2);
38953 }
38954 return;
38955 }
38956 break;
38957 }
38958 case X86ISD::HADD:
38959 case X86ISD::HSUB: {
38960 Known = computeKnownBitsForHorizontalOperation(
38961 Op, DemandedElts, Depth, DAG,
38962 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38963 return KnownBits::computeForAddSub(
38964 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38965 KnownLHS, KnownRHS);
38966 });
38967 break;
38968 }
38969 case ISD::INTRINSIC_WO_CHAIN: {
38970 switch (Op->getConstantOperandVal(0)) {
38971 case Intrinsic::x86_sse2_pmadd_wd:
38972 case Intrinsic::x86_avx2_pmadd_wd:
38973 case Intrinsic::x86_avx512_pmaddw_d_512: {
38974 SDValue LHS = Op.getOperand(1);
38975 SDValue RHS = Op.getOperand(2);
38976 assert(VT.getScalarType() == MVT::i32 &&
38977 LHS.getValueType() == RHS.getValueType() &&
38978 LHS.getValueType().getScalarType() == MVT::i16 &&
38979 "Unexpected PMADDWD types");
38980 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38981 break;
38982 }
38983 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38984 case Intrinsic::x86_avx2_pmadd_ub_sw:
38985 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38986 SDValue LHS = Op.getOperand(1);
38987 SDValue RHS = Op.getOperand(2);
38988 assert(VT.getScalarType() == MVT::i16 &&
38989 LHS.getValueType() == RHS.getValueType() &&
38990 LHS.getValueType().getScalarType() == MVT::i8 &&
38991 "Unexpected PMADDUBSW types");
38992 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38993 break;
38994 }
38995 case Intrinsic::x86_sse2_psad_bw:
38996 case Intrinsic::x86_avx2_psad_bw:
38997 case Intrinsic::x86_avx512_psad_bw_512: {
38998 SDValue LHS = Op.getOperand(1);
38999 SDValue RHS = Op.getOperand(2);
39000 assert(VT.getScalarType() == MVT::i64 &&
39001 LHS.getValueType() == RHS.getValueType() &&
39002 LHS.getValueType().getScalarType() == MVT::i8 &&
39003 "Unexpected PSADBW types");
39004 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39005 break;
39006 }
39007 }
39008 break;
39009 }
39010 case X86ISD::VPMADD52L:
39011 case X86ISD::VPMADD52H: {
39012 assert(Op.getValueType().isVector() &&
39013 Op.getValueType().getScalarType() == MVT::i64 &&
39014 "Unexpected VPMADD52 type");
39015 KnownBits K0 =
39016 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39017 KnownBits K1 =
39018 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39019 KnownBits KAcc =
39020 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
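// VPMADD52L/H multiply only the low 52 bits of each 64-bit element and then
// add the full 64-bit accumulator, so model the product on 52-bit values
// (low or high half of the product) before widening back to 64 bits.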
39021 K0 = K0.trunc(52);
39022 K1 = K1.trunc(52);
39023 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39024 ? KnownBits::mul(K0, K1)
39025 : KnownBits::mulhu(K0, K1);
39026 KnownMul = KnownMul.zext(64);
39027 Known = KnownBits::add(KAcc, KnownMul);
39028 return;
39029 }
39030 }
39031
39032 // Handle target shuffles.
39033 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39034 if (isTargetShuffle(Opc)) {
39035 SmallVector<int, 64> Mask;
39036 SmallVector<SDValue, 2> Ops;
39037 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39038 unsigned NumOps = Ops.size();
39039 unsigned NumElts = VT.getVectorNumElements();
39040 if (Mask.size() == NumElts) {
39041 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39042 Known.Zero.setAllBits(); Known.One.setAllBits();
39043 for (unsigned i = 0; i != NumElts; ++i) {
39044 if (!DemandedElts[i])
39045 continue;
39046 int M = Mask[i];
39047 if (M == SM_SentinelUndef) {
39048 // For UNDEF elements, we don't know anything about the common state
39049 // of the shuffle result.
39050 Known.resetAll();
39051 break;
39052 }
39053 if (M == SM_SentinelZero) {
39054 Known.One.clearAllBits();
39055 continue;
39056 }
39057 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39058 "Shuffle index out of range");
39059
39060 unsigned OpIdx = (unsigned)M / NumElts;
39061 unsigned EltIdx = (unsigned)M % NumElts;
39062 if (Ops[OpIdx].getValueType() != VT) {
39063 // TODO - handle target shuffle ops with different value types.
39064 Known.resetAll();
39065 break;
39066 }
39067 DemandedOps[OpIdx].setBit(EltIdx);
39068 }
39069 // Known bits are the values that are shared by every demanded element.
39070 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39071 if (!DemandedOps[i])
39072 continue;
39073 KnownBits Known2 =
39074 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39075 Known = Known.intersectWith(Known2);
39076 }
39077 }
39078 }
39079 }
39080}
39081
39082 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39083 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39084 unsigned Depth) const {
39085 EVT VT = Op.getValueType();
39086 unsigned VTBits = VT.getScalarSizeInBits();
39087 unsigned Opcode = Op.getOpcode();
39088 switch (Opcode) {
39089 case X86ISD::SETCC_CARRY:
39090 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39091 return VTBits;
39092
39093 case X86ISD::VTRUNC: {
39094 SDValue Src = Op.getOperand(0);
39095 MVT SrcVT = Src.getSimpleValueType();
39096 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39097 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39098 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39099 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39100 if (Tmp > (NumSrcBits - VTBits))
39101 return Tmp - (NumSrcBits - VTBits);
39102 return 1;
39103 }
39104
39105 case X86ISD::PACKSS: {
39106 // PACKSS is just a truncation if the sign bits extend to the packed size.
39107 APInt DemandedLHS, DemandedRHS;
39108 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39109 DemandedRHS);
39110
39111 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39112 // patterns often used to compact vXi64 allsignbit patterns.
39113 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39114 SDValue BC = peekThroughBitcasts(V);
39115 if (BC.getOpcode() == X86ISD::PACKSS &&
39116 BC.getScalarValueSizeInBits() == 16 &&
39117 V.getScalarValueSizeInBits() == 32) {
39118 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39119 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39120 if (BC0.getScalarValueSizeInBits() == 64 &&
39121 BC1.getScalarValueSizeInBits() == 64 &&
39122 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39123 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39124 return 32;
39125 }
39126 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39127 };
39128
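// PACKSS only preserves the sign bits that survive the truncation: the packed
// result keeps Tmp - (SrcBits - VTBits) sign bits when the source's sign bits
// cover the discarded upper half, and only the trivial single bit otherwise.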
39129 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39130 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39131 if (!!DemandedLHS)
39132 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39133 if (!!DemandedRHS)
39134 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39135 unsigned Tmp = std::min(Tmp0, Tmp1);
39136 if (Tmp > (SrcBits - VTBits))
39137 return Tmp - (SrcBits - VTBits);
39138 return 1;
39139 }
39140
39141 case X86ISD::VBROADCAST: {
39142 SDValue Src = Op.getOperand(0);
39143 if (!Src.getSimpleValueType().isVector())
39144 return DAG.ComputeNumSignBits(Src, Depth + 1);
39145 break;
39146 }
39147
39148 case X86ISD::VSHLI: {
39149 SDValue Src = Op.getOperand(0);
39150 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39151 if (ShiftVal.uge(VTBits))
39152 return VTBits; // Shifted all bits out --> zero.
39153 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39154 if (ShiftVal.uge(Tmp))
39155 return 1; // Shifted all sign bits out --> unknown.
39156 return Tmp - ShiftVal.getZExtValue();
39157 }
39158
39159 case X86ISD::VSRAI: {
39160 SDValue Src = Op.getOperand(0);
39161 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39162 if (ShiftVal.uge(VTBits - 1))
39163 return VTBits; // Sign splat.
39164 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
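// An arithmetic right shift replicates the sign bit, so the number of known
// sign bits grows by the shift amount, capped at the element width.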
39165 ShiftVal += Tmp;
39166 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39167 }
39168
39169 case X86ISD::FSETCC:
39170 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39171 if (VT == MVT::f32 || VT == MVT::f64 ||
39172 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39173 return VTBits;
39174 break;
39175
39176 case X86ISD::PCMPGT:
39177 case X86ISD::PCMPEQ:
39178 case X86ISD::CMPP:
39179 case X86ISD::VPCOM:
39180 case X86ISD::VPCOMU:
39181 // Vector compares return zero/all-bits result values.
39182 return VTBits;
39183
39184 case X86ISD::ANDNP: {
39185 unsigned Tmp0 =
39186 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39187 if (Tmp0 == 1) return 1; // Early out.
39188 unsigned Tmp1 =
39189 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39190 return std::min(Tmp0, Tmp1);
39191 }
39192
39193 case X86ISD::CMOV: {
39194 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39195 if (Tmp0 == 1) return 1; // Early out.
39196 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39197 return std::min(Tmp0, Tmp1);
39198 }
39199 }
39200
39201 // Handle target shuffles.
39202 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39203 if (isTargetShuffle(Opcode)) {
39204 SmallVector<int, 64> Mask;
39205 SmallVector<SDValue, 2> Ops;
39206 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39207 unsigned NumOps = Ops.size();
39208 unsigned NumElts = VT.getVectorNumElements();
39209 if (Mask.size() == NumElts) {
39210 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39211 for (unsigned i = 0; i != NumElts; ++i) {
39212 if (!DemandedElts[i])
39213 continue;
39214 int M = Mask[i];
39215 if (M == SM_SentinelUndef) {
39216 // For UNDEF elements, we don't know anything about the common state
39217 // of the shuffle result.
39218 return 1;
39219 } else if (M == SM_SentinelZero) {
39220 // Zero = all sign bits.
39221 continue;
39222 }
39223 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39224 "Shuffle index out of range");
39225
39226 unsigned OpIdx = (unsigned)M / NumElts;
39227 unsigned EltIdx = (unsigned)M % NumElts;
39228 if (Ops[OpIdx].getValueType() != VT) {
39229 // TODO - handle target shuffle ops with different value types.
39230 return 1;
39231 }
39232 DemandedOps[OpIdx].setBit(EltIdx);
39233 }
39234 unsigned Tmp0 = VTBits;
39235 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39236 if (!DemandedOps[i])
39237 continue;
39238 unsigned Tmp1 =
39239 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39240 Tmp0 = std::min(Tmp0, Tmp1);
39241 }
39242 return Tmp0;
39243 }
39244 }
39245 }
39246
39247 // Fallback case.
39248 return 1;
39249}
39250
39251 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39252 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39253 return N->getOperand(0);
39254 return N;
39255}
39256
39257// Helper to look for a normal load that can be narrowed into a vzload with the
39258// specified VT and memory VT. Returns SDValue() on failure.
39259 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39260 SelectionDAG &DAG) {
39261 // Can't if the load is volatile or atomic.
39262 if (!LN->isSimple())
39263 return SDValue();
39264
39265 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39266 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39267 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39268 LN->getPointerInfo(), LN->getBaseAlign(),
39269 LN->getMemOperand()->getFlags());
39270}
39271
39272// Attempt to match a combined shuffle mask against supported unary shuffle
39273// instructions.
39274// TODO: Investigate sharing more of this with shuffle lowering.
39275static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39276 bool AllowFloatDomain, bool AllowIntDomain,
39277 SDValue V1, const SelectionDAG &DAG,
39278 const X86Subtarget &Subtarget, unsigned &Shuffle,
39279 MVT &SrcVT, MVT &DstVT) {
39280 unsigned NumMaskElts = Mask.size();
39281 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39282
39283 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39284 if (Mask[0] == 0 &&
39285 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39286 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39287 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39288 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39289 Shuffle = X86ISD::VZEXT_MOVL;
39290 if (MaskEltSize == 16)
39291 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39292 else
39293 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39294 return true;
39295 }
39296 }
39297
39298 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39299 if (AllowIntDomain &&
39300 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39301 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39302 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39303 unsigned MaxScale = 64 / MaskEltSize;
39304 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39305 DAG.ComputeNumSignBits(V1) == MaskEltSize;
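// For each power-of-2 scale, the mask matches an extension if slot i*Scale
// selects source element i and the remaining Scale-1 slots in each group are
// undef (ANY_EXTEND), undef/zero (ZERO_EXTEND), or undef/repeats of element i
// (SIGN_EXTEND of an all-sign-bits source).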
39306 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39307 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39308 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39309 continue;
39310 bool MatchAny = true;
39311 bool MatchZero = true;
39312 bool MatchSign = UseSign;
39313 unsigned NumDstElts = NumMaskElts / Scale;
39314 for (unsigned i = 0;
39315 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39316 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39317 MatchAny = MatchSign = MatchZero = false;
39318 break;
39319 }
39320 unsigned Pos = (i * Scale) + 1;
39321 unsigned Len = Scale - 1;
39322 MatchAny &= isUndefInRange(Mask, Pos, Len);
39323 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39324 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39325 }
39326 if (MatchAny || MatchSign || MatchZero) {
39327 assert((MatchSign || MatchZero) &&
39328 "Failed to match sext/zext but matched aext?");
39329 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39330 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39331 : MVT::getIntegerVT(MaskEltSize);
39332 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39333
39334 Shuffle = unsigned(
39335 MatchAny ? ISD::ANY_EXTEND
39336 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39337 if (SrcVT.getVectorNumElements() != NumDstElts)
39338 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39339
39340 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39341 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39342 return true;
39343 }
39344 }
39345 }
39346
39347 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bits (MOVSS).
39348 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39349 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39350 isUndefOrEqual(Mask[0], 0) &&
39351 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39352 Shuffle = X86ISD::VZEXT_MOVL;
39353 if (MaskEltSize == 16)
39354 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39355 else
39356 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39357 return true;
39358 }
39359
39360 // Check if we have SSE3 which will let us use MOVDDUP etc. These
39361 // instructions are no slower than UNPCKLPD but have the option to
39362 // fold the input operand into even an unaligned memory load.
39363 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39364 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39365 Shuffle = X86ISD::MOVDDUP;
39366 SrcVT = DstVT = MVT::v2f64;
39367 return true;
39368 }
39369 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39370 Shuffle = X86ISD::MOVSLDUP;
39371 SrcVT = DstVT = MVT::v4f32;
39372 return true;
39373 }
39374 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39375 Shuffle = X86ISD::MOVSHDUP;
39376 SrcVT = DstVT = MVT::v4f32;
39377 return true;
39378 }
39379 }
39380
39381 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39382 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39383 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39384 Shuffle = X86ISD::MOVDDUP;
39385 SrcVT = DstVT = MVT::v4f64;
39386 return true;
39387 }
39388 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39389 V1)) {
39390 Shuffle = X86ISD::MOVSLDUP;
39391 SrcVT = DstVT = MVT::v8f32;
39392 return true;
39393 }
39394 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39395 V1)) {
39396 Shuffle = X86ISD::MOVSHDUP;
39397 SrcVT = DstVT = MVT::v8f32;
39398 return true;
39399 }
39400 }
39401
39402 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39403 assert(Subtarget.hasAVX512() &&
39404 "AVX512 required for 512-bit vector shuffles");
39405 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39406 V1)) {
39407 Shuffle = X86ISD::MOVDDUP;
39408 SrcVT = DstVT = MVT::v8f64;
39409 return true;
39410 }
39411 if (isTargetShuffleEquivalent(
39412 MaskVT, Mask,
39413 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39414 Shuffle = X86ISD::MOVSLDUP;
39415 SrcVT = DstVT = MVT::v16f32;
39416 return true;
39417 }
39418 if (isTargetShuffleEquivalent(
39419 MaskVT, Mask,
39420 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39421 Shuffle = X86ISD::MOVSHDUP;
39422 SrcVT = DstVT = MVT::v16f32;
39423 return true;
39424 }
39425 }
39426
39427 return false;
39428}
39429
39430// Attempt to match a combined shuffle mask against supported unary immediate
39431// permute instructions.
39432// TODO: Investigate sharing more of this with shuffle lowering.
39433 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39434 const APInt &Zeroable,
39435 bool AllowFloatDomain, bool AllowIntDomain,
39436 const SelectionDAG &DAG,
39437 const X86Subtarget &Subtarget,
39438 unsigned &Shuffle, MVT &ShuffleVT,
39439 unsigned &PermuteImm) {
39440 unsigned NumMaskElts = Mask.size();
39441 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39442 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39443 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39444 bool ContainsZeros = isAnyZero(Mask);
39445
39446 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39447 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39448 // Check for lane crossing permutes.
39449 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39450 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39451 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39452 Shuffle = X86ISD::VPERMI;
39453 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39454 PermuteImm = getV4X86ShuffleImm(Mask);
39455 return true;
39456 }
39457 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39458 SmallVector<int, 4> RepeatedMask;
39459 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39460 Shuffle = X86ISD::VPERMI;
39461 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39462 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39463 return true;
39464 }
39465 }
39466 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39467 // VPERMILPD can permute with a non-repeating shuffle.
39468 Shuffle = X86ISD::VPERMILPI;
39469 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39470 PermuteImm = 0;
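// VPERMILPD uses one immediate bit per element to select the low or high
// f64 within that element's 128-bit lane, so gather bit 0 of each mask index.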
39471 for (int i = 0, e = Mask.size(); i != e; ++i) {
39472 int M = Mask[i];
39473 if (M == SM_SentinelUndef)
39474 continue;
39475 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39476 PermuteImm |= (M & 1) << i;
39477 }
39478 return true;
39479 }
39480 }
39481
39482 // We are checking for a shuffle match or a shift match. Loop twice so we
39483 // can order which we try to match first depending on target preference.
39484 for (unsigned Order = 0; Order < 2; ++Order) {
39485 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39486 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39487 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
39488 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39489 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39490 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39491 SmallVector<int, 4> RepeatedMask;
39492 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39493 // Narrow the repeated mask to create 32-bit element permutes.
39494 SmallVector<int, 4> WordMask = RepeatedMask;
39495 if (MaskScalarSizeInBits == 64)
39496 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39497
39498 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39499 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39500 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39501 PermuteImm = getV4X86ShuffleImm(WordMask);
39502 return true;
39503 }
39504 }
39505
39506 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39507 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39508 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39509 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39510 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39511 SmallVector<int, 4> RepeatedMask;
39512 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39513 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39514 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39515
39516 // PSHUFLW: permute lower 4 elements only.
39517 if (isUndefOrInRange(LoMask, 0, 4) &&
39518 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39519 Shuffle = X86ISD::PSHUFLW;
39520 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39521 PermuteImm = getV4X86ShuffleImm(LoMask);
39522 return true;
39523 }
39524
39525 // PSHUFHW: permute upper 4 elements only.
39526 if (isUndefOrInRange(HiMask, 4, 8) &&
39527 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39528 // Offset the HiMask so that we can create the shuffle immediate.
39529 int OffsetHiMask[4];
39530 for (int i = 0; i != 4; ++i)
39531 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39532
39533 Shuffle = X86ISD::PSHUFHW;
39534 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39535 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39536 return true;
39537 }
39538 }
39539 }
39540 } else {
39541 // Attempt to match against bit rotates.
39542 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39543 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39544 Subtarget.hasAVX512())) {
39545 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39546 Subtarget, Mask);
39547 if (0 < RotateAmt) {
39548 Shuffle = X86ISD::VROTLI;
39549 PermuteImm = (unsigned)RotateAmt;
39550 return true;
39551 }
39552 }
39553 }
39554 // Attempt to match against byte/bit shifts.
39555 if (AllowIntDomain &&
39556 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39557 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39558 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39559 int ShiftAmt =
39560 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39561 Zeroable, Subtarget);
39562 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39563 32 <= ShuffleVT.getScalarSizeInBits())) {
39564 // Byte shifts can be slower so only match them on second attempt.
39565 if (Order == 0 &&
39566 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39567 continue;
39568
39569 PermuteImm = (unsigned)ShiftAmt;
39570 return true;
39571 }
39572
39573 }
39574 }
39575
39576 return false;
39577}
39578
39579// Attempt to match a combined unary shuffle mask against supported binary
39580// shuffle instructions.
39581// TODO: Investigate sharing more of this with shuffle lowering.
39582static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39583 bool AllowFloatDomain, bool AllowIntDomain,
39584 SDValue &V1, SDValue &V2, const SDLoc &DL,
39585 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39586 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39587 bool IsUnary) {
39588 unsigned NumMaskElts = Mask.size();
39589 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39590 unsigned SizeInBits = MaskVT.getSizeInBits();
39591
39592 if (MaskVT.is128BitVector()) {
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39594 AllowFloatDomain) {
39595 V2 = V1;
39596 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39597 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39598 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39599 return true;
39600 }
39601 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39602 AllowFloatDomain) {
39603 V2 = V1;
39604 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39605 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39606 return true;
39607 }
39608 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39609 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39610 std::swap(V1, V2);
39611 Shuffle = X86ISD::MOVSD;
39612 SrcVT = DstVT = MVT::v2f64;
39613 return true;
39614 }
39615 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39616 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39617 Shuffle = X86ISD::MOVSS;
39618 SrcVT = DstVT = MVT::v4f32;
39619 return true;
39620 }
39621 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39622 DAG) &&
39623 Subtarget.hasFP16()) {
39624 Shuffle = X86ISD::MOVSH;
39625 SrcVT = DstVT = MVT::v8f16;
39626 return true;
39627 }
39628 }
39629
39630 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39631 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39632 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39633 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39634 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39635 Subtarget)) {
39636 DstVT = MaskVT;
39637 return true;
39638 }
39639 }
39640 // TODO: Can we handle this inside matchShuffleWithPACK?
39641 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39642 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39643 V1.getScalarValueSizeInBits() == 64 &&
39644 V2.getScalarValueSizeInBits() == 64) {
39645 // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
39646 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39647 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39648 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39649 SrcVT = MVT::v4i32;
39650 DstVT = MVT::v8i16;
39651 Shuffle = X86ISD::PACKUS;
39652 return true;
39653 }
39654 // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
39655 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39656 SrcVT = MVT::v8i16;
39657 DstVT = MVT::v16i8;
39658 Shuffle = X86ISD::PACKUS;
39659 return true;
39660 }
39661 // Use PACKSSDW if the sign bits extend to the lowest 16 bits.
39662 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39663 SrcVT = MVT::v4i32;
39664 DstVT = MVT::v8i16;
39665 Shuffle = X86ISD::PACKSS;
39666 return true;
39667 }
39668 }
39669
39670 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39671 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39672 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39673 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39674 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39675 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39676 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39677 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39678 Subtarget)) {
39679 SrcVT = DstVT = MaskVT;
39680 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39681 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39682 return true;
39683 }
39684 }
39685
39686 // Attempt to match against an OR if we're performing a blend shuffle and the
39687 // non-blended source element is zero in each case.
39688 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39689 if (SizeInBits == V1.getValueSizeInBits() &&
39690 SizeInBits == V2.getValueSizeInBits() &&
39691 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39692 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39693 bool IsBlend = true;
39694 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39695 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39696 unsigned Scale1 = NumV1Elts / NumMaskElts;
39697 unsigned Scale2 = NumV2Elts / NumMaskElts;
39698 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39699 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
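// For each mask element, record which source elements must be known zero for
// the blend to be expressible as a plain OR of the two sources.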
39700 for (unsigned i = 0; i != NumMaskElts; ++i) {
39701 int M = Mask[i];
39702 if (M == SM_SentinelUndef)
39703 continue;
39704 if (M == SM_SentinelZero) {
39705 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39706 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39707 continue;
39708 }
39709 if (M == (int)i) {
39710 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39711 continue;
39712 }
39713 if (M == (int)(i + NumMaskElts)) {
39714 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39715 continue;
39716 }
39717 IsBlend = false;
39718 break;
39719 }
39720 if (IsBlend) {
39721 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39722 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39723 Shuffle = ISD::OR;
39724 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39725 return true;
39726 }
39727 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39728 // FIXME: handle mismatched sizes?
39729 // TODO: investigate if `ISD::OR` handling in
39730 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39731 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39732 unsigned NumElts = V.getValueType().getVectorNumElements();
39733 KnownBits Known(NumElts);
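// Track a single bit per element: 'Zero' means the element is known all-zeros,
// 'One' means it is known all-ones.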
39734 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39735 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39736 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39737 if (PeepholeKnown.isZero())
39738 Known.Zero.setBit(EltIdx);
39739 if (PeepholeKnown.isAllOnes())
39740 Known.One.setBit(EltIdx);
39741 }
39742 return Known;
39743 };
39744
39745 KnownBits V1Known = computeKnownBitsElementWise(V1);
39746 KnownBits V2Known = computeKnownBitsElementWise(V2);
39747
39748 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39749 int M = Mask[i];
39750 if (M == SM_SentinelUndef)
39751 continue;
39752 if (M == SM_SentinelZero) {
39753 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39754 continue;
39755 }
39756 if (M == (int)i) {
39757 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39758 continue;
39759 }
39760 if (M == (int)(i + NumMaskElts)) {
39761 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39762 continue;
39763 }
39764 llvm_unreachable("will not get here.");
39765 }
39766 if (IsBlend) {
39767 Shuffle = ISD::OR;
39768 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39769 return true;
39770 }
39771 }
39772 }
39773 }
39774
39775 return false;
39776}
39777
39778 static bool matchBinaryPermuteShuffle(
39779 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39780 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39781 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39782 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39783 unsigned NumMaskElts = Mask.size();
39784 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39785
39786 // Attempt to match against VALIGND/VALIGNQ rotate.
39787 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39788 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39789 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39790 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39791 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39792 MaskVT.getSizeInBits() / EltSizeInBits);
39793 if (!isAnyZero(Mask)) {
39794 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39795 if (0 < Rotation) {
39796 Shuffle = X86ISD::VALIGN;
39797 ShuffleVT = AlignVT;
39798 PermuteImm = Rotation;
39799 return true;
39800 }
39801 }
39802 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39803 unsigned ZeroLo = Zeroable.countr_one();
39804 unsigned ZeroHi = Zeroable.countl_one();
39805 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39806 if (ZeroLo) {
39807 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39808 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
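// ShiftMask is [Z..Z, 0, 1, ...]: the low ZeroLo elements are zero and the
// rest take V1's leading elements, i.e. a VALIGN of V1 against a zero vector.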
39809 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39810 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39811 Shuffle = X86ISD::VALIGN;
39812 ShuffleVT = AlignVT;
39813 PermuteImm = NumMaskElts - ZeroLo;
39814 return true;
39815 }
39816 }
39817 if (ZeroHi) {
39818 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39819 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39820 ZeroHi);
39821 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39822 V2 = V1;
39823 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39824 Shuffle = X86ISD::VALIGN;
39825 ShuffleVT = AlignVT;
39826 PermuteImm = ZeroHi;
39827 return true;
39828 }
39829 }
39830 }
39831
39832 // Attempt to match against PALIGNR byte rotate.
39833 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39834 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39835 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39836 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39837 if (0 < ByteRotation) {
39838 Shuffle = X86ISD::PALIGNR;
39839 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39840 PermuteImm = ByteRotation;
39841 return true;
39842 }
39843 }
39844
39845 // Attempt to combine to X86ISD::BLENDI.
39846 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39847 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39848 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39849 uint64_t BlendMask = 0;
39850 bool ForceV1Zero = false, ForceV2Zero = false;
39851 SmallVector<int, 8> TargetMask(Mask);
39852 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39853 ForceV2Zero, BlendMask)) {
39854 if (MaskVT == MVT::v16i16) {
39855 // We can only use v16i16 PBLENDW if the lanes are repeated.
39856 SmallVector<int, 8> RepeatedMask;
39857 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39858 RepeatedMask)) {
39859 assert(RepeatedMask.size() == 8 &&
39860 "Repeated mask size doesn't match!");
39861 PermuteImm = 0;
39862 for (int i = 0; i < 8; ++i)
39863 if (RepeatedMask[i] >= 8)
39864 PermuteImm |= 1 << i;
39865 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39866 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39867 Shuffle = X86ISD::BLENDI;
39868 ShuffleVT = MaskVT;
39869 return true;
39870 }
39871 } else {
39872 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39873 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39874 PermuteImm = (unsigned)BlendMask;
39875 Shuffle = X86ISD::BLENDI;
39876 ShuffleVT = MaskVT;
39877 return true;
39878 }
39879 }
39880 }
39881
39882 // Attempt to combine to INSERTPS, but only if it has elements that need to
39883 // be set to zero.
39884 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39885 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39886 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39887 Shuffle = X86ISD::INSERTPS;
39888 ShuffleVT = MVT::v4f32;
39889 return true;
39890 }
39891
39892 // Attempt to combine to SHUFPD.
39893 if (AllowFloatDomain && EltSizeInBits == 64 &&
39894 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39895 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39896 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39897 bool ForceV1Zero = false, ForceV2Zero = false;
39898 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39899 PermuteImm, Mask, Zeroable)) {
39900 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39901 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39902 Shuffle = X86ISD::SHUFP;
39903 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39904 return true;
39905 }
39906 }
39907
39908 // Attempt to combine to SHUFPS.
39909 if (AllowFloatDomain && EltSizeInBits == 32 &&
39910 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39911 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39912 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39913 SmallVector<int, 4> RepeatedMask;
39914 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39915 // Match each half of the repeated mask to determine if it's just
39916 // referencing one of the vectors, is zeroable, or is entirely undef.
39917 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39918 int M0 = RepeatedMask[Offset];
39919 int M1 = RepeatedMask[Offset + 1];
39920
39921 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39922 return DAG.getUNDEF(MaskVT);
39923 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39924 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39925 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39926 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39927 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39928 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39929 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39930 return V1;
39931 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39932 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39933 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39934 return V2;
39935 }
39936
39937 return SDValue();
39938 };
39939
39940 int ShufMask[4] = {-1, -1, -1, -1};
39941 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39942 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39943
39944 if (Lo && Hi) {
39945 V1 = Lo;
39946 V2 = Hi;
39947 Shuffle = X86ISD::SHUFP;
39948 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39949 PermuteImm = getV4X86ShuffleImm(ShufMask);
39950 return true;
39951 }
39952 }
39953 }
39954
39955 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39956 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39957 MaskVT.is128BitVector() &&
39958 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39959 Shuffle = X86ISD::INSERTPS;
39960 ShuffleVT = MVT::v4f32;
39961 return true;
39962 }
39963
39964 return false;
39965}
39966
39967 static SDValue combineX86ShuffleChainWithExtract(
39968 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39969 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39970 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39971 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39972 const X86Subtarget &Subtarget);
39973
39974/// Combine an arbitrary chain of shuffles into a single instruction if
39975/// possible.
39976///
39977/// This is the leaf of the recursive combine below. When we have found some
39978/// chain of single-use x86 shuffle instructions and accumulated the combined
39979/// shuffle mask represented by them, this will try to pattern match that mask
39980/// into either a single instruction if there is a special purpose instruction
39981/// for this operation, or into a PSHUFB instruction which is a fully general
39982/// instruction but should only be used to replace chains over a certain depth.
39983 static SDValue combineX86ShuffleChain(
39984 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39985 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39986 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39987 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39988 const X86Subtarget &Subtarget) {
39989 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39990 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39991 "Unexpected number of shuffle inputs!");
39992 unsigned RootSizeInBits = RootVT.getSizeInBits();
39993 unsigned NumRootElts = RootVT.getVectorNumElements();
39994
39995 // Canonicalize shuffle input op to the requested type.
39996 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39997 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39998 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39999 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40000 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40001 return DAG.getBitcast(VT, Op);
40002 };
40003
40004 // Find the inputs that enter the chain. Note that multiple uses are OK
40005 // here; we're not going to remove the operands we find.
40006 bool UnaryShuffle = (Inputs.size() == 1);
40007 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40008 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40009 : peekThroughBitcasts(Inputs[1]));
40010
40011 MVT VT1 = V1.getSimpleValueType();
40012 MVT VT2 = V2.getSimpleValueType();
40013 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40014 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40015
40016 SDValue Res;
40017
40018 unsigned NumBaseMaskElts = BaseMask.size();
40019 if (NumBaseMaskElts == 1) {
40020 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40021 return CanonicalizeShuffleInput(RootVT, V1);
40022 }
40023
40024 bool OptForSize = DAG.shouldOptForSize();
40025 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
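// Prefer the float shuffle domain if either input is floating point, if the
// root is FP and we're already at least one combine deep, or on AVX1 where
// 256-bit integer shuffles aren't available.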
40026 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40027 (RootVT.isFloatingPoint() && Depth >= 1) ||
40028 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40029
40030 // If we are shuffling a splat (and not introducing zeros) then we can just
40031 // use it directly. This works for smaller elements as well, since they
40032 // already repeat across each mask element.
40033 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40034 V1.getValueSizeInBits() >= RootSizeInBits &&
40035 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40036 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40037 return CanonicalizeShuffleInput(RootVT, V1);
40038 }
40039
40040 SmallVector<int, 64> Mask(BaseMask);
40041
40042 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40043 // etc. can be simplified.
40044 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40045 SmallVector<int> ScaledMask, IdentityMask;
40046 unsigned NumElts = VT1.getVectorNumElements();
40047 if (Mask.size() <= NumElts &&
40048 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40049 for (unsigned i = 0; i != NumElts; ++i)
40050 IdentityMask.push_back(i);
40051 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40052 V2))
40053 return CanonicalizeShuffleInput(RootVT, V1);
40054 }
40055 }
40056
40057 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40058 if (RootVT.is512BitVector() &&
40059 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40060 // If the upper subvectors are zeroable, then an extract+insert is more
40061 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40062 // to zero the upper subvectors.
40063 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40064 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40065 return SDValue(); // Nothing to do!
40066 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40067 "Unexpected lane shuffle");
40068 Res = CanonicalizeShuffleInput(RootVT, V1);
40069 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40070 bool UseZero = isAnyZero(Mask);
40071 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40072 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40073 }
40074
40075 // Narrow shuffle mask to v4x128.
40076 SmallVector<int, 4> ScaledMask;
40077 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40078 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40079
40080 // Try to lower to vshuf64x2/vshuf32x4.
40081 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40082 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40083 SelectionDAG &DAG) {
40084 int PermMask[4] = {-1, -1, -1, -1};
40085 // Ensure elements came from the same Op.
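// SHUF128 selects two 128-bit lanes from each source, so result lanes 0-1
// must all come from one operand and lanes 2-3 from another.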
40086 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40087 for (int i = 0; i < 4; ++i) {
40088 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40089 if (ScaledMask[i] < 0)
40090 continue;
40091
40092 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40093 unsigned OpIndex = i / 2;
40094 if (Ops[OpIndex].isUndef())
40095 Ops[OpIndex] = Op;
40096 else if (Ops[OpIndex] != Op)
40097 return SDValue();
40098
40099 PermMask[i] = ScaledMask[i] % 4;
40100 }
40101
40102 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40103 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40104 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40105 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40106 };
40107
40108 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40109 // doesn't work because our mask is for 128 bits and we don't have an MVT
40110 // to match that.
40111 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40112 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40113 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40114 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40115 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40116 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40117 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40118 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40119 ScaledMask[1] == (ScaledMask[3] % 2));
40120
40121 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40122 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40123 return SDValue(); // Nothing to do!
40124 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40125 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40126 return DAG.getBitcast(RootVT, V);
40127 }
40128 }
40129
40130 // Handle 128-bit lane shuffles of 256-bit vectors.
40131 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40132 // If the upper half is zeroable, then an extract+insert is more optimal
40133 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40134 // zero the upper half.
40135 if (isUndefOrZero(Mask[1])) {
40136 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40137 return SDValue(); // Nothing to do!
40138 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40139 Res = CanonicalizeShuffleInput(RootVT, V1);
40140 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40141 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40142 256);
40143 }
40144
40145 // If we're inserting the low subvector, an insert-subvector 'concat'
40146 // pattern is quicker than VPERM2X128.
40147 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40148 !Subtarget.hasAVX2()) {
40149 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40150 return SDValue(); // Nothing to do!
40151 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40152 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40153 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40154 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40155 }
40156
40157 // Don't lower to VPERM2X128 here if we have AVX2+; prefer to use
40158 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40159 // feature.
40160 // Prefer blends for sequential shuffles unless we are optimizing for size.
40161 if (UnaryShuffle &&
40162 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40163 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40164 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40165 return SDValue(); // Nothing to do!
40166 unsigned PermMask = 0;
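// VPERM2X128 immediate: bits [1:0] select the source lane for the low half,
// bits [5:4] for the high half; bit 3 of either nibble zeroes that half.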
40167 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40168 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40169 return DAG.getNode(
40170 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40171 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40172 }
40173
40174 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40175 return SDValue(); // Nothing to do!
40176
40177 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40178 if (!UnaryShuffle && !IsMaskedShuffle) {
40179 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40180 "Unexpected shuffle sentinel value");
40181 // Prefer blends to X86ISD::VPERM2X128.
40182 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40183 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40184 return SDValue(); // Nothing to do!
40185 unsigned PermMask = 0;
40186 PermMask |= ((Mask[0] & 3) << 0);
40187 PermMask |= ((Mask[1] & 3) << 4);
40188 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40189 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40190 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40191 CanonicalizeShuffleInput(RootVT, LHS),
40192 CanonicalizeShuffleInput(RootVT, RHS),
40193 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40194 }
40195 }
40196 }
40197
40198 // For masks that have been widened to 128-bit elements or more,
40199 // narrow back down to 64-bit elements.
40200 if (BaseMaskEltSizeInBits > 64) {
40201 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40202 int MaskScale = BaseMaskEltSizeInBits / 64;
40203 SmallVector<int, 64> ScaledMask;
40204 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40205 Mask = std::move(ScaledMask);
40206 }
40207
40208 // For masked shuffles, we're trying to match the root width for better
40209 // writemask folding; attempt to scale the mask.
40210 // TODO - variable shuffles might need this to be widened again.
40211 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40212 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40213 int MaskScale = NumRootElts / Mask.size();
40214 SmallVector<int, 64> ScaledMask;
40215 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40216 Mask = std::move(ScaledMask);
40217 }
40218
40219 unsigned NumMaskElts = Mask.size();
40220 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40221 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40222
40223 // Determine the effective mask value type.
40224 FloatDomain &= (32 <= MaskEltSizeInBits);
40225 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40226 : MVT::getIntegerVT(MaskEltSizeInBits);
40227 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40228
40229 // Only allow legal mask types.
40230 if (!TLI.isTypeLegal(MaskVT))
40231 return SDValue();
40232
40233 // Attempt to match the mask against known shuffle patterns.
40234 MVT ShuffleSrcVT, ShuffleVT;
40235 unsigned Shuffle, PermuteImm;
40236
40237 // Which shuffle domains are permitted?
40238 // Permit domain crossing at higher combine depths.
40239 // TODO: Should we indicate which domain is preferred if both are allowed?
40240 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40241 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40242 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40243
40244 // Determine zeroable mask elements.
40245 APInt KnownUndef, KnownZero;
40246 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40247 APInt Zeroable = KnownUndef | KnownZero;
40248
40249 if (UnaryShuffle) {
40250 // Attempt to match against broadcast-from-vector.
40251 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40252 if ((Subtarget.hasAVX2() ||
40253 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40254 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40255 if (isUndefOrEqual(Mask, 0)) {
40256 if (V1.getValueType() == MaskVT &&
40257 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40258 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40259 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40260 return SDValue(); // Nothing to do!
40261 Res = V1.getOperand(0);
40262 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40263 return DAG.getBitcast(RootVT, Res);
40264 }
40265 if (Subtarget.hasAVX2()) {
40266 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40267 return SDValue(); // Nothing to do!
40268 Res = CanonicalizeShuffleInput(MaskVT, V1);
40269 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40270 return DAG.getBitcast(RootVT, Res);
40271 }
40272 }
40273 }
40274
40275 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40276 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40277 (!IsMaskedShuffle ||
40278 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40279 if (Depth == 0 && RootOpc == Shuffle)
40280 return SDValue(); // Nothing to do!
40281 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40282 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40283 return DAG.getBitcast(RootVT, Res);
40284 }
40285
40286 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40287 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40288 PermuteImm) &&
40289 (!IsMaskedShuffle ||
40290 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40291 if (Depth == 0 && RootOpc == Shuffle)
40292 return SDValue(); // Nothing to do!
40293 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40294 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40295 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40296 return DAG.getBitcast(RootVT, Res);
40297 }
40298 }
40299
40300 // Attempt to combine to INSERTPS, but only if the inserted element has come
40301 // from a scalar.
40302 // TODO: Handle other insertions here as well?
40303 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40304 Subtarget.hasSSE41() &&
40305 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40306 if (MaskEltSizeInBits == 32) {
40307 SDValue SrcV1 = V1, SrcV2 = V2;
40308 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40309 DAG) &&
40310 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40311 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40312 return SDValue(); // Nothing to do!
40313 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40314 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40315 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40316 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40317 return DAG.getBitcast(RootVT, Res);
40318 }
40319 }
40320 if (MaskEltSizeInBits == 64 &&
40321 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40322 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40323 V2.getScalarValueSizeInBits() <= 32) {
40324 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40325 return SDValue(); // Nothing to do!
40326 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40327 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40328 CanonicalizeShuffleInput(MVT::v4f32, V1),
40329 CanonicalizeShuffleInput(MVT::v4f32, V2),
40330 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40331 return DAG.getBitcast(RootVT, Res);
40332 }
40333 }
40334
40335 SDValue NewV1 = V1; // Save operands in case early exit happens.
40336 SDValue NewV2 = V2;
40337 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40338 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40339 ShuffleVT, UnaryShuffle) &&
40340 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40341 if (Depth == 0 && RootOpc == Shuffle)
40342 return SDValue(); // Nothing to do!
40343 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40344 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40345 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40346 return DAG.getBitcast(RootVT, Res);
40347 }
40348
40349 NewV1 = V1; // Save operands in case early exit happens.
40350 NewV2 = V2;
40351 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40352 AllowIntDomain, NewV1, NewV2, DL, DAG,
40353 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40354 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40355 if (Depth == 0 && RootOpc == Shuffle)
40356 return SDValue(); // Nothing to do!
40357 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40358 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40359 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40360 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40361 return DAG.getBitcast(RootVT, Res);
40362 }
40363
40364 // Typically from here on, we need an integer version of MaskVT.
40365 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40366 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40367
40368 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40369 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40370 uint64_t BitLen, BitIdx;
40371 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40372 Zeroable)) {
40373 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40374 return SDValue(); // Nothing to do!
40375 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40376 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40377 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40378 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40379 return DAG.getBitcast(RootVT, Res);
40380 }
40381
40382 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40383 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40384 return SDValue(); // Nothing to do!
40385 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40386 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40387 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40388 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40389 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40390 return DAG.getBitcast(RootVT, Res);
40391 }
40392 }
40393
40394 // Match shuffle against TRUNCATE patterns.
40395 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40396 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40397 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40398 Subtarget)) {
40399 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40400 ShuffleSrcVT.getVectorNumElements();
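// A matching element count means this is a full-width TRUNCATE; otherwise use
// VTRUNC, which writes the low elements and zeroes the upper ones.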
40401 unsigned Opc =
40402 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40403 if (Depth == 0 && RootOpc == Opc)
40404 return SDValue(); // Nothing to do!
40405 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40406 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40407 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40408 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40409 return DAG.getBitcast(RootVT, Res);
40410 }
40411
40412 // Do we need a more general binary truncation pattern?
40413 if (RootSizeInBits < 512 &&
40414 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40415 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40416 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40417 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40418 // Bail if this was already a truncation or PACK node.
40419 // We sometimes fail to match PACK if we demand known undef elements.
40420 if (Depth == 0 &&
40421 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40422 RootOpc == X86ISD::PACKUS))
40423 return SDValue(); // Nothing to do!
40424 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40425 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40426 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40427 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40428 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40429 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40430 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40431 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40432 return DAG.getBitcast(RootVT, Res);
40433 }
40434 }
40435
40436 // Don't try to re-form single instruction chains under any circumstances now
40437 // that we've done encoding canonicalization for them.
40438 if (Depth < 1)
40439 return SDValue();
40440
40441 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40442 return isTargetShuffleVariableMask(N->getOpcode());
40443 });
40444 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40445 return (N->getOpcode() == X86ISD::VPERMV3 ||
40446 N->getOpcode() == X86ISD::VPERMV);
40447 });
40448
40449 // Depth threshold above which we can efficiently use variable mask shuffles.
40450 int VariableCrossLaneShuffleDepth =
40451 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40452 int VariablePerLaneShuffleDepth =
40453 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40454 AllowVariableCrossLaneMask &=
40455 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40456 AllowVariablePerLaneMask &=
40457 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40458 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40459 // higher depth before combining them.
40460 int BWIVPERMV3ShuffleDepth =
40461 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40462 bool AllowBWIVPERMV3 =
40463 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40464
40465 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40466 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40467 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40468
40469 bool MaskContainsZeros = isAnyZero(Mask);
40470
40471 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40472 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40473 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40474 if (Subtarget.hasAVX2() &&
40475 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40476 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40477 Res = CanonicalizeShuffleInput(MaskVT, V1);
40478 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40479 return DAG.getBitcast(RootVT, Res);
40480 }
40481 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40482 if ((Subtarget.hasAVX512() &&
40483 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40484 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40485 (Subtarget.hasBWI() &&
40486 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40487 (Subtarget.hasVBMI() &&
40488 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40489 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40490 V2 = DAG.getUNDEF(MaskVT);
40491 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40492 return DAG.getBitcast(RootVT, Res);
40493 }
40494 }
40495
40496 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40497 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40498 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40499 ((Subtarget.hasAVX512() &&
40500 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40501 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40502 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40503 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40504 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40505 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40506 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40508 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40509 for (unsigned i = 0; i != NumMaskElts; ++i)
40510 if (Mask[i] == SM_SentinelZero)
40511 Mask[i] = NumMaskElts + i;
40512 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40513 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40514 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40515 return DAG.getBitcast(RootVT, Res);
40516 }
40517
40518 // If that failed and either input is extracted then try to combine as a
40519 // shuffle with the larger type.
40520 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40521 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40522 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40523 IsMaskedShuffle, DAG, DL, Subtarget))
40524 return WideShuffle;
40525
40526 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40527 // (non-VLX will pad to 512-bit shuffles).
40528 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40529 ((Subtarget.hasAVX512() &&
40530 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40531 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40532 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40533 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40534 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40535 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40536 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40537 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40538 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40539 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40540 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40541 return DAG.getBitcast(RootVT, Res);
40542 }
40543 return SDValue();
40544 }
40545
40546 // See if we can combine a single input shuffle with zeros to a bit-mask,
40547 // which is much simpler than any shuffle.
40548 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40549 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40550 TLI.isTypeLegal(MaskVT)) {
40551 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40552 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40553 APInt UndefElts(NumMaskElts, 0);
40554 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40555 for (unsigned i = 0; i != NumMaskElts; ++i) {
40556 int M = Mask[i];
40557 if (M == SM_SentinelUndef) {
40558 UndefElts.setBit(i);
40559 continue;
40560 }
40561 if (M == SM_SentinelZero)
40562 continue;
40563 EltBits[i] = AllOnes;
40564 }
40565 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40566 Res = CanonicalizeShuffleInput(MaskVT, V1);
40567 unsigned AndOpcode =
40568 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40569 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40570 return DAG.getBitcast(RootVT, Res);
40571 }
40572
40573 // If we have a single input shuffle with different shuffle patterns in the
40574 // 128-bit lanes, use the variable mask to VPERMILPS.
40575 // TODO Combine other mask types at higher depths.
40576 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40577 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40578 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40579 SmallVector<SDValue, 16> VPermIdx;
40580 for (int M : Mask) {
40581 SDValue Idx =
40582 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40583 VPermIdx.push_back(Idx);
40584 }
40585 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40586 Res = CanonicalizeShuffleInput(MaskVT, V1);
40587 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40588 return DAG.getBitcast(RootVT, Res);
40589 }
40590
40591 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40592 // to VPERMIL2PD/VPERMIL2PS.
40593 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40594 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40595 MaskVT == MVT::v8f32)) {
40596 // VPERMIL2 Operation.
40597 // Bits[3] - Match Bit.
40598 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40599 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
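// Illustrative example of the encoding built below: for a v4f32 mask, element
// M == 5 (element 1 of the second source) encodes as (5 % 4) + ((5 / 4) * 4)
// == 5, i.e. selector 0b101; a zero element pushes selector 8 (match bit set)
// and sets M2ZImm to 2 so matched elements are zeroed.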
40600 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40601 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40602 SmallVector<int, 8> VPerm2Idx;
40603 unsigned M2ZImm = 0;
40604 for (int M : Mask) {
40605 if (M == SM_SentinelUndef) {
40606 VPerm2Idx.push_back(-1);
40607 continue;
40608 }
40609 if (M == SM_SentinelZero) {
40610 M2ZImm = 2;
40611 VPerm2Idx.push_back(8);
40612 continue;
40613 }
40614 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40615 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40616 VPerm2Idx.push_back(Index);
40617 }
40618 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40619 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40620 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40621 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40622 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40623 return DAG.getBitcast(RootVT, Res);
40624 }
40625
40626 // If we have 3 or more shuffle instructions or a chain involving a variable
40627 // mask, we can replace them with a single PSHUFB instruction profitably.
40628 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40629 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40630 // more aggressive.
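// Illustrative example of the byte mask built below: for a 128-bit v4i32 mask
// <1, SM_SentinelZero, 2, SM_SentinelUndef>, Ratio == 4, so the byte mask is
// <4,5,6,7, 0x80,0x80,0x80,0x80, 8,9,10,11, u,u,u,u> (PSHUFB zeroes any byte
// whose control byte has the top bit set).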
40631 if (UnaryShuffle && AllowVariablePerLaneMask &&
40632 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40633 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40634 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40635 SmallVector<SDValue, 16> PSHUFBMask;
40636 int NumBytes = RootVT.getSizeInBits() / 8;
40637 int Ratio = NumBytes / NumMaskElts;
40638 for (int i = 0; i < NumBytes; ++i) {
40639 int M = Mask[i / Ratio];
40640 if (M == SM_SentinelUndef) {
40641 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40642 continue;
40643 }
40644 if (M == SM_SentinelZero) {
40645 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40646 continue;
40647 }
40648 M = Ratio * M + i % Ratio;
40649 assert((M / 16) == (i / 16) && "Lane crossing detected");
40650 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40651 }
40652 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40653 Res = CanonicalizeShuffleInput(ByteVT, V1);
40654 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40655 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40656 return DAG.getBitcast(RootVT, Res);
40657 }
40658
40659 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40660 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40661 // slower than PSHUFB on targets that support both.
40662 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40663 Subtarget.hasXOP()) {
40664 // VPPERM Mask Operation
40665 // Bits[4:0] - Byte Index (0 - 31)
40666 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
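// Illustrative example: a control byte of 20 selects byte 4 of the second
// source (byte indices 16-31 refer to V2), while 0x80 has Bits[7:5] == 4 and
// therefore zeroes that result byte.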
40667 SmallVector<SDValue, 16> VPPERMMask;
40668 int NumBytes = 16;
40669 int Ratio = NumBytes / NumMaskElts;
40670 for (int i = 0; i < NumBytes; ++i) {
40671 int M = Mask[i / Ratio];
40672 if (M == SM_SentinelUndef) {
40673 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40674 continue;
40675 }
40676 if (M == SM_SentinelZero) {
40677 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40678 continue;
40679 }
40680 M = Ratio * M + i % Ratio;
40681 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40682 }
40683 MVT ByteVT = MVT::v16i8;
40684 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40685 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40686 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40687 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40688 return DAG.getBitcast(RootVT, Res);
40689 }
40690
40691 // If that failed and either input is extracted then try to combine as a
40692 // shuffle with the larger type.
40693 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40694 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40695 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40696 DAG, DL, Subtarget))
40697 return WideShuffle;
40698
40699 // If we have a dual input shuffle then lower to VPERMV3,
40700 // (non-VLX will pad to 512-bit shuffles)
40701 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40702 ((Subtarget.hasAVX512() &&
40703 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40704 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40705 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40706 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40707 MaskVT == MVT::v16i32)) ||
40708 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40709 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40710 MaskVT == MVT::v32i16)) ||
40711 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40712 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40713 MaskVT == MVT::v64i8)))) {
40714 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40715 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40716 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40717 return DAG.getBitcast(RootVT, Res);
40718 }
40719
40720 // Failed to find any combines.
40721 return SDValue();
40722}
40723
40724// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40725// instruction if possible.
40726//
40727// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40728// type size to attempt to combine:
40729// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40730// -->
40731// extract_subvector(shuffle(x,y,m2),0)
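// For example (illustrative), a v4f32 shuffle whose inputs are the upper
// halves of two v8f32 vectors can be retried as a v8f32 shuffle of the full
// vectors, with the mask indices offset by the extraction index and the low
// 128 bits of the wide result extracted afterwards.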
40732 static SDValue combineX86ShuffleChainWithExtract(
40733 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40734 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40735 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40736 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40737 const X86Subtarget &Subtarget) {
40738 unsigned NumMaskElts = BaseMask.size();
40739 unsigned NumInputs = Inputs.size();
40740 if (NumInputs == 0)
40741 return SDValue();
40742
40743 unsigned RootSizeInBits = RootVT.getSizeInBits();
40744 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40745 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40746
40747 // Peek through subvectors to find widest legal vector.
40748 // TODO: Handle ISD::TRUNCATE
40749 unsigned WideSizeInBits = RootSizeInBits;
40750 for (SDValue Input : Inputs) {
40751 Input = peekThroughBitcasts(Input);
40752 while (1) {
40753 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40754 Input = peekThroughBitcasts(Input.getOperand(0));
40755 continue;
40756 }
40757 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40758 Input.getOperand(0).isUndef() &&
40759 isNullConstant(Input.getOperand(2))) {
40760 Input = peekThroughBitcasts(Input.getOperand(1));
40761 continue;
40762 }
40763 break;
40764 }
40765 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40766 WideSizeInBits < Input.getValueSizeInBits())
40767 WideSizeInBits = Input.getValueSizeInBits();
40768 }
40769
40770 // Bail if we fail to find a source larger than the existing root.
40771 if (WideSizeInBits <= RootSizeInBits ||
40772 (WideSizeInBits % RootSizeInBits) != 0)
40773 return SDValue();
40774
40775 // Create new mask for larger type.
40776 SmallVector<int, 64> WideMask;
40777 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40778
40779 // Attempt to peek through inputs and adjust mask when we extract from an
40780 // upper subvector.
40781 int AdjustedMasks = 0;
40782 SmallVector<SDValue, 4> WideInputs(Inputs);
40783 for (unsigned I = 0; I != NumInputs; ++I) {
40784 SDValue &Input = WideInputs[I];
40785 Input = peekThroughBitcasts(Input);
40786 while (1) {
40787 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40788 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40789 uint64_t Idx = Input.getConstantOperandVal(1);
40790 if (Idx != 0) {
40791 ++AdjustedMasks;
40792 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40793 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40794
40795 int lo = I * WideMask.size();
40796 int hi = (I + 1) * WideMask.size();
40797 for (int &M : WideMask)
40798 if (lo <= M && M < hi)
40799 M += Idx;
40800 }
40801 Input = peekThroughBitcasts(Input.getOperand(0));
40802 continue;
40803 }
40804 // TODO: Handle insertions into upper subvectors.
40805 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40806 Input.getOperand(0).isUndef() &&
40807 isNullConstant(Input.getOperand(2))) {
40808 Input = peekThroughBitcasts(Input.getOperand(1));
40809 continue;
40810 }
40811 break;
40812 }
40813 }
40814
40815 // Remove unused/repeated shuffle source ops.
40816 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40817 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40818
40819 // Bail if we're always extracting from the lowest subvectors,
40820 // combineX86ShuffleChain should match this for the current width, or the
40821 // shuffle still references too many inputs.
40822 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40823 return SDValue();
40824
40825 // Minor canonicalization of the accumulated shuffle mask to make it easier
40826 // to match below. All this does is detect masks with sequential pairs of
40827 // elements, and shrink them to the half-width mask. It does this in a loop
40828 // so it will reduce the size of the mask to the minimal width mask which
40829 // performs an equivalent shuffle.
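// For example, the mask <0,1,4,5,2,3,6,7> widens to <0,2,1,3> on the first
// iteration and then stops, since <0,2> is no longer a sequential pair.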
40830 while (WideMask.size() > 1) {
40831 SmallVector<int, 64> WidenedMask;
40832 if (!canWidenShuffleElements(WideMask, WidenedMask))
40833 break;
40834 WideMask = std::move(WidenedMask);
40835 }
40836
40837 // Canonicalization of binary shuffle masks to improve pattern matching by
40838 // commuting the inputs.
40839 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40840 ShuffleVectorSDNode::commuteMask(WideMask);
40841 std::swap(WideInputs[0], WideInputs[1]);
40842 }
40843
40844 // Increase depth for every upper subvector we've peeked through.
40845 Depth += AdjustedMasks;
40846
40847 // Attempt to combine wider chain.
40848 // TODO: Can we use a better Root?
40849 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40850 WideInputs.back().getValueSizeInBits()
40851 ? WideInputs.front()
40852 : WideInputs.back();
40853 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40854 "WideRootSize mismatch");
40855
40856 if (SDValue WideShuffle = combineX86ShuffleChain(
40857 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40858 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40859 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40860 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40861 return DAG.getBitcast(RootVT, WideShuffle);
40862 }
40863
40864 return SDValue();
40865}
40866
40867// Canonicalize the combined shuffle mask chain with horizontal ops.
40868// NOTE: This may update the Ops and Mask.
40869 static SDValue canonicalizeShuffleMaskWithHorizOp(
40870 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40871 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40872 const X86Subtarget &Subtarget) {
40873 if (Mask.empty() || Ops.empty())
40874 return SDValue();
40875
40877 for (SDValue Op : Ops)
40879
40880 // All ops must be the same horizop + type.
40881 SDValue BC0 = BC[0];
40882 EVT VT0 = BC0.getValueType();
40883 unsigned Opcode0 = BC0.getOpcode();
40884 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40885 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40886 }))
40887 return SDValue();
40888
40889 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40890 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40891 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40892 if (!isHoriz && !isPack)
40893 return SDValue();
40894
40895 // Do all ops have a single use?
40896 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40897 return Op.hasOneUse() &&
40898 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40899 });
40900
40901 int NumElts = VT0.getVectorNumElements();
40902 int NumLanes = VT0.getSizeInBits() / 128;
40903 int NumEltsPerLane = NumElts / NumLanes;
40904 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40905 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40906 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40907
40908 if (NumEltsPerLane >= 4 &&
40909 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40910 SmallVector<int> LaneMask, ScaledMask;
40911 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40912 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40913 // See if we can remove the shuffle by resorting the HOP chain so that
40914 // the HOP args are pre-shuffled.
40915 // TODO: Generalize to any sized/depth chain.
40916 // TODO: Add support for PACKSS/PACKUS.
40917 if (isHoriz) {
40918 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40919 auto GetHOpSrc = [&](int M) {
40920 if (M == SM_SentinelUndef)
40921 return DAG.getUNDEF(VT0);
40922 if (M == SM_SentinelZero)
40923 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40924 SDValue Src0 = BC[M / 4];
40925 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40926 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40927 return Src1.getOperand(M % 2);
40928 return SDValue();
40929 };
40930 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40931 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40932 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40933 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40934 if (M0 && M1 && M2 && M3) {
40935 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40936 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40937 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40938 }
40939 }
40940 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40941 if (Ops.size() >= 2) {
40942 SDValue LHS, RHS;
40943 auto GetHOpSrc = [&](int M, int &OutM) {
40944 // TODO: Support SM_SentinelZero
40945 if (M < 0)
40946 return M == SM_SentinelUndef;
40947 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40948 if (!LHS || LHS == Src) {
40949 LHS = Src;
40950 OutM = (M % 2);
40951 return true;
40952 }
40953 if (!RHS || RHS == Src) {
40954 RHS = Src;
40955 OutM = (M % 2) + 2;
40956 return true;
40957 }
40958 return false;
40959 };
40960 int PostMask[4] = {-1, -1, -1, -1};
40961 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40962 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40963 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40964 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40965 LHS = DAG.getBitcast(SrcVT, LHS);
40966 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40967 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40968 // Use SHUFPS for the permute so this will work on SSE2 targets,
40969 // shuffle combining and domain handling will simplify this later on.
40970 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40971 Res = DAG.getBitcast(ShuffleVT, Res);
40972 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40973 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40974 }
40975 }
40976 }
40977 }
40978
40979 if (2 < Ops.size())
40980 return SDValue();
40981
40982 SDValue BC1 = BC[BC.size() - 1];
40983 if (Mask.size() == VT0.getVectorNumElements()) {
40984 // Canonicalize binary shuffles of horizontal ops that use the
40985 // same sources to an unary shuffle.
40986 // TODO: Try to perform this fold even if the shuffle remains.
40987 if (Ops.size() == 2) {
40988 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40989 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40990 };
40991 // Commute if all BC0's ops are contained in BC1.
40992 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40993 ContainsOps(BC1, BC0.getOperand(1))) {
40994 ShuffleVectorSDNode::commuteMask(Mask);
40995 std::swap(Ops[0], Ops[1]);
40996 std::swap(BC0, BC1);
40997 }
40998
40999 // If BC1 can be represented by BC0, then convert to unary shuffle.
41000 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41001 ContainsOps(BC0, BC1.getOperand(1))) {
41002 for (int &M : Mask) {
41003 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41004 continue;
41005 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41006 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41007 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41008 M += NumHalfEltsPerLane;
41009 }
41010 }
41011 }
41012
41013 // Canonicalize unary horizontal ops to only refer to lower halves.
41014 for (int i = 0; i != NumElts; ++i) {
41015 int &M = Mask[i];
41016 if (isUndefOrZero(M))
41017 continue;
41018 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41019 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41020 M -= NumHalfEltsPerLane;
41021 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41022 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41023 M -= NumHalfEltsPerLane;
41024 }
41025 }
41026
41027 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41028 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41029 // represents the LHS/RHS inputs for the lower/upper halves.
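// For example, a v4f32 shuffle <0,1,4,5> of HADD(X,Y) and HADD(Z,W) picks the
// lower halves of both hops, so it can be rebuilt as the single op HADD(X,Z).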
41030 SmallVector<int, 16> TargetMask128, WideMask128;
41031 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41032 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41033 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41034 bool SingleOp = (Ops.size() == 1);
41035 if (isPack || OneUseOps ||
41036 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41037 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41038 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41039 Lo = Lo.getOperand(WideMask128[0] & 1);
41040 Hi = Hi.getOperand(WideMask128[1] & 1);
41041 if (SingleOp) {
41042 SDValue Undef = DAG.getUNDEF(SrcVT);
41043 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41044 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41045 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41046 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41047 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41048 }
41049 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41050 }
41051 }
41052
41053 // If we are post-shuffling a 256-bit hop and not requiring the upper
41054 // elements, then try to narrow to a 128-bit hop directly.
41055 SmallVector<int, 16> WideMask64;
41056 if (Ops.size() == 1 && NumLanes == 2 &&
41057 scaleShuffleElements(Mask, 4, WideMask64) &&
41058 isUndefInRange(WideMask64, 2, 2)) {
41059 int M0 = WideMask64[0];
41060 int M1 = WideMask64[1];
41061 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41062 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
41063 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41064 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41065 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41066 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41067 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41068 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41069 }
41070 }
41071
41072 return SDValue();
41073}
41074
41075// Attempt to constant fold all of the constant source ops.
41076// Returns true if the entire shuffle is folded to a constant.
41077// TODO: Extend this to merge multiple constant Ops and update the mask.
41078 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41079 ArrayRef<int> Mask,
41080 ArrayRef<const SDNode *> SrcNodes,
41081 SelectionDAG &DAG, const SDLoc &DL,
41082 const X86Subtarget &Subtarget) {
41083 unsigned SizeInBits = VT.getSizeInBits();
41084 unsigned NumMaskElts = Mask.size();
41085 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41086 unsigned NumOps = Ops.size();
41087
41088 // Extract constant bits from each source op.
41089 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41090 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41091 for (unsigned I = 0; I != NumOps; ++I)
41092 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41093 RawBitsOps[I],
41094 /*AllowWholeUndefs*/ true,
41095 /*AllowPartialUndefs*/ true))
41096 return SDValue();
41097
41098 // If we're optimizing for size, only fold if at least one of the constants is
41099 // only used once or the combined shuffle has included a variable mask
41100 // shuffle; this is to avoid constant pool bloat.
41101 bool IsOptimizingSize = DAG.shouldOptForSize();
41102 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41103 return isTargetShuffleVariableMask(N->getOpcode());
41104 });
41105 if (IsOptimizingSize && !HasVariableMask &&
41106 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41107 return SDValue();
41108
41109 // Shuffle the constant bits according to the mask.
41110 APInt UndefElts(NumMaskElts, 0);
41111 APInt ZeroElts(NumMaskElts, 0);
41112 APInt ConstantElts(NumMaskElts, 0);
41113 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41114 APInt::getZero(MaskSizeInBits));
41115 for (unsigned i = 0; i != NumMaskElts; ++i) {
41116 int M = Mask[i];
41117 if (M == SM_SentinelUndef) {
41118 UndefElts.setBit(i);
41119 continue;
41120 } else if (M == SM_SentinelZero) {
41121 ZeroElts.setBit(i);
41122 continue;
41123 }
41124 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41125
41126 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41127 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41128
41129 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41130 if (SrcUndefElts[SrcMaskIdx]) {
41131 UndefElts.setBit(i);
41132 continue;
41133 }
41134
41135 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41136 APInt &Bits = SrcEltBits[SrcMaskIdx];
41137 if (!Bits) {
41138 ZeroElts.setBit(i);
41139 continue;
41140 }
41141
41142 ConstantElts.setBit(i);
41143 ConstantBitData[i] = Bits;
41144 }
41145 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41146
41147 // Attempt to create a zero vector.
41148 if ((UndefElts | ZeroElts).isAllOnes())
41149 return getZeroVector(VT, Subtarget, DAG, DL);
41150
41151 // Create the constant data.
41152 MVT MaskSVT;
41153 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41154 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41155 else
41156 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41157
41158 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41159 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41160 return SDValue();
41161
41162 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41163 return DAG.getBitcast(VT, CstOp);
41164}
41165
41166namespace llvm {
41167 namespace X86 {
41168 enum {
41169 MaxShuffleCombineDepth = 8
41170 };
41171 } // namespace X86
41172} // namespace llvm
41173
41174/// Fully generic combining of x86 shuffle instructions.
41175///
41176/// This should be the last combine run over the x86 shuffle instructions. Once
41177/// they have been fully optimized, this will recursively consider all chains
41178/// of single-use shuffle instructions, build a generic model of the cumulative
41179/// shuffle operation, and check for simpler instructions which implement this
41180/// operation. We use this primarily for two purposes:
41181///
41182/// 1) Collapse generic shuffles to specialized single instructions when
41183/// equivalent. In most cases, this is just an encoding size win, but
41184/// sometimes we will collapse multiple generic shuffles into a single
41185/// special-purpose shuffle.
41186/// 2) Look for sequences of shuffle instructions with 3 or more total
41187/// instructions, and replace them with the slightly more expensive SSSE3
41188/// PSHUFB instruction if available. We do this as the last combining step
41189/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41190/// a suitable short sequence of other instructions. The PSHUFB will either
41191/// use a register or have to read from memory and so is slightly (but only
41192/// slightly) more expensive than the other shuffle instructions.
41193///
41194/// Because this is inherently a quadratic operation (for each shuffle in
41195/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41196/// This should never be an issue in practice as the shuffle lowering doesn't
41197/// produce sequences of more than 8 instructions.
41198///
41199/// FIXME: We will currently miss some cases where the redundant shuffling
41200/// would simplify under the threshold for PSHUFB formation because of
41201/// combine-ordering. To fix this, we should do the redundant instruction
41202/// combining in this recursive walk.
41203 static SDValue combineX86ShufflesRecursively(
41204 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41205 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41206 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41207 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41208 const SDLoc &DL, const X86Subtarget &Subtarget) {
41209 assert(!RootMask.empty() &&
41210 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41211 "Illegal shuffle root mask");
41212 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41213 unsigned RootSizeInBits = RootVT.getSizeInBits();
41214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41215
41216 // Bound the depth of our recursive combine because this is ultimately
41217 // quadratic in nature.
41218 if (Depth >= MaxDepth)
41219 return SDValue();
41220
41221 // Directly rip through bitcasts to find the underlying operand.
41222 SDValue Op = SrcOps[SrcOpIndex];
41223 Op = peekThroughBitcasts(Op);
41224
41225 EVT VT = Op.getValueType();
41226 if (!VT.isVector() || !VT.isSimple())
41227 return SDValue(); // Bail if we hit a non-simple non-vector.
41228
41229 // FIXME: Just bail on f16 for now.
41230 if (VT.getVectorElementType() == MVT::f16)
41231 return SDValue();
41232
41233 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41234 "Can only combine shuffles up to the size of the root op.");
41235
41236 // Create a demanded elts mask from the referenced elements of Op.
41237 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41238 for (int M : RootMask) {
41239 int BaseIdx = RootMask.size() * SrcOpIndex;
41240 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41241 OpDemandedElts.setBit(M - BaseIdx);
41242 }
41243 if (RootSizeInBits != VT.getSizeInBits()) {
41244 // Op is smaller than Root - extract the demanded elts for the subvector.
41245 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41246 unsigned NumOpMaskElts = RootMask.size() / Scale;
41247 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41248 assert(OpDemandedElts
41249 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41250 .isZero() &&
41251 "Out of range elements referenced in root mask");
41252 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41253 }
41254 OpDemandedElts =
41255 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41256
41257 // Extract target shuffle mask and resolve sentinels and inputs.
41258 SmallVector<int, 64> OpMask;
41259 SmallVector<SDValue, 2> OpInputs;
41260 APInt OpUndef, OpZero;
41261 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41262 OpZero, DAG, Depth, false)) {
41263 // Shuffle inputs must not be larger than the shuffle result.
41264 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41265 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41266 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41267 }))
41268 return SDValue();
41269 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41270 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41271 !isNullConstant(Op.getOperand(1))) {
41272 SDValue SrcVec = Op.getOperand(0);
41273 int ExtractIdx = Op.getConstantOperandVal(1);
41274 unsigned NumElts = VT.getVectorNumElements();
41275 OpInputs.assign({SrcVec});
41276 OpMask.assign(NumElts, SM_SentinelUndef);
41277 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41278 OpZero = OpUndef = APInt::getZero(NumElts);
41279 } else {
41280 return SDValue();
41281 }
41282
41283 // If the shuffle result was smaller than the root, we need to adjust the
41284 // mask indices and pad the mask with undefs.
41285 if (RootSizeInBits > VT.getSizeInBits()) {
41286 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41287 unsigned OpMaskSize = OpMask.size();
41288 if (OpInputs.size() > 1) {
41289 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41290 for (int &M : OpMask) {
41291 if (M < 0)
41292 continue;
41293 int EltIdx = M % OpMaskSize;
41294 int OpIdx = M / OpMaskSize;
41295 M = (PaddedMaskSize * OpIdx) + EltIdx;
41296 }
41297 }
41298 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41299 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41300 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41301 }
41302
41302
41303 SmallVector<int, 64> Mask;
41304 SmallVector<SDValue, 16> Ops;
41305
41306 // We don't need to merge masks if the root is empty.
41307 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41308 if (EmptyRoot) {
41309 // Only resolve zeros if it will remove an input, otherwise we might end
41310 // up in an infinite loop.
41311 bool ResolveKnownZeros = true;
41312 if (!OpZero.isZero()) {
41313 APInt UsedInputs = APInt::getZero(OpInputs.size());
41314 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41315 int M = OpMask[i];
41316 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41317 continue;
41318 UsedInputs.setBit(M / OpMask.size());
41319 if (UsedInputs.isAllOnes()) {
41320 ResolveKnownZeros = false;
41321 break;
41322 }
41323 }
41324 }
41325 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41326 ResolveKnownZeros);
41327
41328 Mask = OpMask;
41329 Ops.append(OpInputs.begin(), OpInputs.end());
41330 } else {
41331 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41332
41333 // Add the inputs to the Ops list, avoiding duplicates.
41334 Ops.append(SrcOps.begin(), SrcOps.end());
41335
41336 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41337 // Attempt to find an existing match.
41338 SDValue InputBC = peekThroughBitcasts(Input);
41339 for (int i = 0, e = Ops.size(); i < e; ++i)
41340 if (InputBC == peekThroughBitcasts(Ops[i]))
41341 return i;
41342 // Match failed - should we replace an existing Op?
41343 if (InsertionPoint >= 0) {
41344 Ops[InsertionPoint] = Input;
41345 return InsertionPoint;
41346 }
41347 // Add to the end of the Ops list.
41348 Ops.push_back(Input);
41349 return Ops.size() - 1;
41350 };
41351
41352 SmallVector<int, 2> OpInputIdx;
41353 for (SDValue OpInput : OpInputs)
41354 OpInputIdx.push_back(
41355 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41356
41357 assert(((RootMask.size() > OpMask.size() &&
41358 RootMask.size() % OpMask.size() == 0) ||
41359 (OpMask.size() > RootMask.size() &&
41360 OpMask.size() % RootMask.size() == 0) ||
41361 OpMask.size() == RootMask.size()) &&
41362 "The smaller number of elements must divide the larger.");
41363
41364 // This function can be performance-critical, so we rely on the power-of-2
41365 // knowledge that we have about the mask sizes to replace div/rem ops with
41366 // bit-masks and shifts.
41367 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41368 "Non-power-of-2 shuffle mask sizes");
41369 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41370 "Non-power-of-2 shuffle mask sizes");
41371 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41372 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41373
41374 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41375 unsigned RootRatio =
41376 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41377 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41378 assert((RootRatio == 1 || OpRatio == 1) &&
41379 "Must not have a ratio for both incoming and op masks!");
41380
41381 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41382 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41383 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41384 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41385 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41386
41387 Mask.resize(MaskWidth, SM_SentinelUndef);
41388
41389 // Merge this shuffle operation's mask into our accumulated mask. Note that
41390 // this shuffle's mask will be the first applied to the input, followed by
41391 // the root mask to get us all the way to the root value arrangement. The
41392 // reason for this order is that we are recursing up the operation chain.
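// Worked example (illustrative): with a v4i32 root mask <2,3,0,1> over a v8i16
// op mask <1,0,3,2,5,4,7,6>, MaskWidth == 8, RootRatio == 2, OpRatio == 1, and
// the merged v8i16 mask becomes <5,4,7,6,1,0,3,2>.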
41393 for (unsigned i = 0; i < MaskWidth; ++i) {
41394 unsigned RootIdx = i >> RootRatioLog2;
41395 if (RootMask[RootIdx] < 0) {
41396 // This is a zero or undef lane, we're done.
41397 Mask[i] = RootMask[RootIdx];
41398 continue;
41399 }
41400
41401 unsigned RootMaskedIdx =
41402 RootRatio == 1
41403 ? RootMask[RootIdx]
41404 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41405
41406 // Just insert the scaled root mask value if it references an input other
41407 // than the SrcOp we're currently inserting.
41408 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41409 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41410 Mask[i] = RootMaskedIdx;
41411 continue;
41412 }
41413
41414 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41415 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41416 if (OpMask[OpIdx] < 0) {
41417 // The incoming lanes are zero or undef, it doesn't matter which ones we
41418 // are using.
41419 Mask[i] = OpMask[OpIdx];
41420 continue;
41421 }
41422
41423 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41424 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41425 : (OpMask[OpIdx] << OpRatioLog2) +
41426 (RootMaskedIdx & (OpRatio - 1));
41427
41428 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41429 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41430 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41431 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41432
41433 Mask[i] = OpMaskedIdx;
41434 }
41435 }
41436
41437 // Peek through any free bitcasts to insert_subvector vector widenings or
41438 // extract_subvector nodes back to root size.
41439 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41440 for (auto [I, Op] : enumerate(Ops)) {
41441 SDValue BC = Op;
41442 while (1) {
41443 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41444 BC = BC.getOperand(0);
41445 continue;
41446 }
41447 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41448 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41449 // Set out of bounds mask indices to undef.
41450 Op = BC = BC.getOperand(1);
41451 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41452 int Lo = I * Mask.size();
41453 int Hi = (I + 1) * Mask.size();
41454 int NewHi = Lo + (Mask.size() / Scale);
41455 for (int &M : Mask) {
41456 if (Lo <= M && NewHi <= M && M < Hi)
41457 M = SM_SentinelUndef;
41458 }
41459 continue;
41460 }
41461 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41462 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41463 isNullConstant(BC.getOperand(1))) {
41464 Op = BC = BC.getOperand(0);
41465 continue;
41466 }
41467 break;
41468 }
41469 }
41470
41471 // Remove unused/repeated shuffle source ops.
41472 resolveTargetShuffleInputsAndMask(Ops, Mask);
41473
41474 // Handle the all undef/zero/ones cases early.
41475 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41476 return DAG.getUNDEF(RootVT);
41477 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41478 return getZeroVector(RootVT, Subtarget, DAG, DL);
41479 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41480 !llvm::is_contained(Mask, SM_SentinelZero))
41481 return getOnesVector(RootVT, DAG, DL);
41482
41483 assert(!Ops.empty() && "Shuffle with no inputs detected");
41484
41485 // Update the list of shuffle nodes that have been combined so far.
41486 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41487 CombinedNodes.push_back(Op.getNode());
41488
41489 // See if we can recurse into each shuffle source op (if it's a target
41490 // shuffle). The source op should only be generally combined if it either has
41491 // a single use (i.e. current Op) or all its users have already been combined;
41492 // if not, we can still combine but should prevent generation of variable
41493 // shuffles to avoid constant pool bloat.
41494 // Don't recurse if we already have more source ops than we can combine in
41495 // the remaining recursion depth.
41496 if (Ops.size() < (MaxDepth - Depth)) {
41497 for (int i = 0, e = Ops.size(); i < e; ++i) {
41498 // For empty roots, we need to resolve zeroable elements before combining
41499 // them with other shuffles.
41500 SmallVector<int, 64> ResolvedMask = Mask;
41501 if (EmptyRoot)
41502 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41503 bool AllowCrossLaneVar = false;
41504 bool AllowPerLaneVar = false;
41505 if (Ops[i].getNode()->hasOneUse() ||
41506 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41507 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41508 AllowPerLaneVar = AllowVariablePerLaneMask;
41509 }
41510 if (SDValue Res = combineX86ShufflesRecursively(
41511 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41512 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41513 DAG, DL, Subtarget))
41514 return Res;
41515 }
41516 }
41517
41518 // Attempt to constant fold all of the constant source ops.
41519 if (SDValue Cst = combineX86ShufflesConstants(
41520 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41521 return Cst;
41522
41523 // If constant fold failed and we only have constants - then we have
41524 // multiple uses by a single non-variable shuffle - just bail.
41525 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41526 APInt UndefElts;
41527 SmallVector<APInt> RawBits;
41528 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41529 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41530 RawBits,
41531 /*AllowWholeUndefs*/ true,
41532 /*AllowPartialUndefs*/ true);
41533 })) {
41534 return SDValue();
41535 }
41536
41537 // Canonicalize the combined shuffle mask chain with horizontal ops.
41538 // NOTE: This will update the Ops and Mask.
41539 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41540 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41541 return DAG.getBitcast(RootVT, HOp);
41542
41543 // Try to refine our inputs given our knowledge of target shuffle mask.
41544 for (auto I : enumerate(Ops)) {
41545 int OpIdx = I.index();
41546 SDValue &Op = I.value();
41547
41548 // What range of shuffle mask element values results in picking from Op?
41549 int Lo = OpIdx * Mask.size();
41550 int Hi = Lo + Mask.size();
41551
41552 // Which elements of Op do we demand, given the mask's granularity?
41553 APInt OpDemandedElts(Mask.size(), 0);
41554 for (int MaskElt : Mask) {
41555 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41556 int OpEltIdx = MaskElt - Lo;
41557 OpDemandedElts.setBit(OpEltIdx);
41558 }
41559 }
41560
41561 // Is the shuffle result smaller than the root?
41562 if (Op.getValueSizeInBits() < RootSizeInBits) {
41563 // We padded the mask with undefs. But we now need to undo that.
41564 unsigned NumExpectedVectorElts = Mask.size();
41565 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41566 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41567 assert(!OpDemandedElts.extractBits(
41568 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41569 "Demanding the virtual undef widening padding?");
41570 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41571 }
41572
41573 // The Op itself may be of different VT, so we need to scale the mask.
41574 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41575 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41576
41577 // Can this operand be simplified any further, given its demanded elements?
41578 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41579 Op, OpScaledDemandedElts, DAG))
41580 Op = NewOp;
41581 }
41582 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41583
41584 // Widen any subvector shuffle inputs we've collected.
41585 // TODO: Remove this to avoid generating temporary nodes, we should only
41586 // widen once combineX86ShuffleChain has found a match.
41587 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41588 return Op.getValueSizeInBits() < RootSizeInBits;
41589 })) {
41590 for (SDValue &Op : Ops)
41591 if (Op.getValueSizeInBits() < RootSizeInBits)
41592 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41593 RootSizeInBits);
41594 // Reresolve - we might have repeated subvector sources.
41595 resolveTargetShuffleInputsAndMask(Ops, Mask);
41596
41597
41598 // Handle the all undef/zero/ones cases.
41599 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41600 return DAG.getUNDEF(RootVT);
41601 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41602 return getZeroVector(RootVT, Subtarget, DAG, DL);
41603 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41604 !llvm::is_contained(Mask, SM_SentinelZero))
41605 return getOnesVector(RootVT, DAG, DL);
41606
41607 assert(!Ops.empty() && "Shuffle with no inputs detected");
41608
41609 // We can only combine unary and binary shuffle mask cases.
41610 if (Ops.size() <= 2) {
41611 // Minor canonicalization of the accumulated shuffle mask to make it easier
41612 // to match below. All this does is detect masks with sequential pairs of
41613 // elements, and shrink them to the half-width mask. It does this in a loop
41614 // so it will reduce the size of the mask to the minimal width mask which
41615 // performs an equivalent shuffle.
41616 while (Mask.size() > 1) {
41617 SmallVector<int, 64> WidenedMask;
41618 if (!canWidenShuffleElements(Mask, WidenedMask))
41619 break;
41620 Mask = std::move(WidenedMask);
41621 }
41622
41623 // Canonicalization of binary shuffle masks to improve pattern matching by
41624 // commuting the inputs.
41625 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41626 ShuffleVectorSDNode::commuteMask(Mask);
41627 std::swap(Ops[0], Ops[1]);
41628 }
41629
41630 // Try to combine into a single shuffle instruction.
41631 if (SDValue Shuffle = combineX86ShuffleChain(
41632 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41633 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41634 IsMaskedShuffle, DAG, DL, Subtarget))
41635 return Shuffle;
41636
41637 // If all the operands come from the same larger vector, fallthrough and try
41638 // to use combineX86ShuffleChainWithExtract.
41639 SDValue LHS = peekThroughBitcasts(Ops.front());
41640 SDValue RHS = peekThroughBitcasts(Ops.back());
41641 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41642 (RootSizeInBits / Mask.size()) != 64 ||
41643 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41644 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41645 LHS.getOperand(0) != RHS.getOperand(0))
41646 return SDValue();
41647 }
41648
41649 // If that failed and any input is extracted then try to combine as a
41650 // shuffle with the larger type.
41651 return combineX86ShuffleChainWithExtract(
41652 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41653 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41654 DAG, DL, Subtarget);
41655}
41656
41657/// Helper entry wrapper to combineX86ShufflesRecursively.
41658 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41659 const X86Subtarget &Subtarget) {
41660 return combineX86ShufflesRecursively(
41661 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41662 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41663 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41664 SDLoc(Op), Subtarget);
41665}
41666
41667/// Get the PSHUF-style mask from PSHUF node.
41668///
41669 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41670/// PSHUF-style masks that can be reused with such instructions.
41671 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41672 MVT VT = N.getSimpleValueType();
41673 SmallVector<int, 4> Mask;
41674 SmallVector<SDValue, 2> Ops;
41675 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41676 (void)HaveMask;
41677 assert(HaveMask);
41678
41679 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41680 // matter. Check that the upper masks are repeats and remove them.
41681 if (VT.getSizeInBits() > 128) {
41682 int LaneElts = 128 / VT.getScalarSizeInBits();
41683#ifndef NDEBUG
41684 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41685 for (int j = 0; j < LaneElts; ++j)
41686 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41687 "Mask doesn't repeat in high 128-bit lanes!");
41688#endif
41689 Mask.resize(LaneElts);
41690 }
41691
41692 switch (N.getOpcode()) {
41693 case X86ISD::PSHUFD:
41694 return Mask;
41695 case X86ISD::PSHUFLW:
41696 Mask.resize(4);
41697 return Mask;
41698 case X86ISD::PSHUFHW:
41699 Mask.erase(Mask.begin(), Mask.begin() + 4);
41700 for (int &M : Mask)
41701 M -= 4;
41702 return Mask;
41703 default:
41704 llvm_unreachable("No valid shuffle instruction found!");
41705 }
41706}
41707
41708/// Get the expanded blend mask from a BLENDI node.
41709/// For v16i16 nodes, this will splat the repeated i8 mask.
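/// For example (illustrative), a v16i16 BLENDI immediate of 0xB1 expands to the
/// 16-bit mask 0xB1B1, since the 8-bit immediate applies to each 128-bit half.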
41710 static APInt getBLENDIBlendMask(SDValue V) {
41711 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41712 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41713 APInt Mask = V.getConstantOperandAPInt(2);
41714 if (Mask.getBitWidth() > NumElts)
41715 Mask = Mask.trunc(NumElts);
41716 if (NumElts == 16) {
41717 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41718 Mask = APInt::getSplat(16, Mask);
41719 }
41720 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41721 return Mask;
41722}
41723
41724/// Search for a combinable shuffle across a chain ending in pshufd.
41725///
41726/// We walk up the chain and look for a combinable shuffle, skipping over
41727/// shuffles that we could hoist this shuffle's transformation past without
41728/// altering anything.
41729 static SDValue combineRedundantDWordShuffle(SDValue N,
41730 MutableArrayRef<int> Mask,
41731 const SDLoc &DL,
41732 SelectionDAG &DAG) {
41733 assert(N.getOpcode() == X86ISD::PSHUFD &&
41734 "Called with something other than an x86 128-bit half shuffle!");
41735
41736 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41737 // of the shuffles in the chain so that we can form a fresh chain to replace
41738 // this one.
41739 SmallVector<SDValue, 8> Chain;
41740 SDValue V = N.getOperand(0);
41741 for (; V.hasOneUse(); V = V.getOperand(0)) {
41742 switch (V.getOpcode()) {
41743 default:
41744 return SDValue(); // Nothing combined!
41745
41746 case ISD::BITCAST:
41747 // Skip bitcasts as we always know the type for the target specific
41748 // instructions.
41749 continue;
41750
41751 case X86ISD::PSHUFD:
41752 // Found another dword shuffle.
41753 break;
41754
41755 case X86ISD::PSHUFLW:
41756 // Check that the low words (being shuffled) are the identity in the
41757 // dword shuffle, and the high words are self-contained.
41758 if (Mask[0] != 0 || Mask[1] != 1 ||
41759 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41760 return SDValue();
41761
41762 Chain.push_back(V);
41763 continue;
41764
41765 case X86ISD::PSHUFHW:
41766 // Check that the high words (being shuffled) are the identity in the
41767 // dword shuffle, and the low words are self-contained.
41768 if (Mask[2] != 2 || Mask[3] != 3 ||
41769 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41770 return SDValue();
41771
41772 Chain.push_back(V);
41773 continue;
41774
41775 case X86ISD::UNPCKL:
41776 case X86ISD::UNPCKH:
41777 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41778 // shuffle into a preceding word shuffle.
41779 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41780 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41781 return SDValue();
41782
41783 // Search for a half-shuffle which we can combine with.
41784 unsigned CombineOp =
41785 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41786 if (V.getOperand(0) != V.getOperand(1) ||
41787 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41788 return SDValue();
41789 Chain.push_back(V);
41790 V = V.getOperand(0);
41791 do {
41792 switch (V.getOpcode()) {
41793 default:
41794 return SDValue(); // Nothing to combine.
41795
41796 case X86ISD::PSHUFLW:
41797 case X86ISD::PSHUFHW:
41798 if (V.getOpcode() == CombineOp)
41799 break;
41800
41801 Chain.push_back(V);
41802
41803 [[fallthrough]];
41804 case ISD::BITCAST:
41805 V = V.getOperand(0);
41806 continue;
41807 }
41808 break;
41809 } while (V.hasOneUse());
41810 break;
41811 }
41812 // Break out of the loop if we break out of the switch.
41813 break;
41814 }
41815
41816 if (!V.hasOneUse())
41817 // We fell out of the loop without finding a viable combining instruction.
41818 return SDValue();
41819
41820 // Merge this node's mask and our incoming mask.
41821 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41822 for (int &M : Mask)
41823 M = VMask[M];
41824 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41825 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41826
41827 // Rebuild the chain around this new shuffle.
41828 while (!Chain.empty()) {
41829 SDValue W = Chain.pop_back_val();
41830
41831 if (V.getValueType() != W.getOperand(0).getValueType())
41832 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41833
41834 switch (W.getOpcode()) {
41835 default:
41836 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41837
41838 case X86ISD::UNPCKL:
41839 case X86ISD::UNPCKH:
41840 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41841 break;
41842
41843 case X86ISD::PSHUFD:
41844 case X86ISD::PSHUFLW:
41845 case X86ISD::PSHUFHW:
41846 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41847 break;
41848 }
41849 }
41850 if (V.getValueType() != N.getValueType())
41851 V = DAG.getBitcast(N.getValueType(), V);
41852
41853 // Return the new chain to replace N.
41854 return V;
41855}
41856
41857// Attempt to commute shufps LHS loads:
41858// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
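// Rationale: SHUFPS/VSHUFPS can only fold a load from the second (memory)
// operand, so if only the LHS is a foldable load we commute the operands and
// rewrite the immediate to match.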
41859 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41860 SelectionDAG &DAG) {
41861 // TODO: Add vXf64 support.
41862 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41863 return SDValue();
41864
41865 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41866 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41867 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41868 return SDValue();
41869 SDValue N0 = V.getOperand(0);
41870 SDValue N1 = V.getOperand(1);
41871 unsigned Imm = V.getConstantOperandVal(2);
41872 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41873 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41874 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41875 return SDValue();
41876 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
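// e.g. Imm = 0x1B becomes 0xB1: the low nibble selects result elements from
// the first source and the high nibble from the second, so commuting the
// sources also swaps the two nibbles.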
41877 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41878 DAG.getTargetConstant(Imm, DL, MVT::i8));
41879 };
41880
41881 switch (N.getOpcode()) {
41882 case X86ISD::VPERMILPI:
41883 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41884 unsigned Imm = N.getConstantOperandVal(1);
41885 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41886 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41887 }
41888 break;
41889 case X86ISD::SHUFP: {
41890 SDValue N0 = N.getOperand(0);
41891 SDValue N1 = N.getOperand(1);
41892 unsigned Imm = N.getConstantOperandVal(2);
41893 if (N0 == N1) {
41894 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41895 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41896 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41897 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41898 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41899 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41900 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41901 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41902 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41903 }
41904 break;
41905 }
41906 }
41907
41908 return SDValue();
41909}
41910
41911// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41912// iff we don't demand the same element index for both X and Y.
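// If some element index were demanded from both X and Y, no single permute of
// BLEND(X,Y) could reproduce both lanes, since the blended vector can hold
// only one of the two values at that index.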
41913static SDValue
41914 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41915 const APInt &DemandedElts, SelectionDAG &DAG,
41916 const X86Subtarget &Subtarget, const SDLoc &DL) {
41917 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41918 if (!N0.hasOneUse() || !N1.hasOneUse())
41919 return SDValue();
41920
41921 unsigned NumElts = VT.getVectorNumElements();
41922 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41923 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41924
41925 // See if both operands are shuffles, and that we can scale the shuffle masks
41926 // to the same width as the blend mask.
41927 // TODO: Support SM_SentinelZero?
41928 SmallVector<SDValue, 2> Ops0, Ops1;
41929 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41930 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41931 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41932 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41933 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41934 return SDValue();
41935
41936 // Determine the demanded elts from both permutes.
41937 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41938 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41939 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41940 Demanded1,
41941 /*AllowUndefElts=*/true) ||
41942 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41943 DemandedRHS0, /*AllowUndefElts=*/true) ||
41944 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41945 DemandedRHS1, /*AllowUndefElts=*/true))
41946 return SDValue();
41947
41948 // Confirm that we only use a single operand from both permutes and that we
41949 // don't demand the same index from both.
41950 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41951 DemandedLHS0.intersects(DemandedLHS1))
41952 return SDValue();
41953
41954 // Use the permute demanded elts masks as the new blend mask.
41955 // Create the new permute mask as a blend of the 2 original permute masks.
41956 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41957 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41958 for (unsigned I = 0; I != NumElts; ++I) {
41959 if (Demanded0[I]) {
41960 int M = ScaledMask0[I];
41961 if (0 <= M) {
41962 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41963 "BlendMask demands LHS AND RHS");
41964 NewBlendMask[M] = M;
41965 NewPermuteMask[I] = M;
41966 }
41967 } else if (Demanded1[I]) {
41968 int M = ScaledMask1[I];
41969 if (0 <= M) {
41970 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41971 "BlendMask demands LHS AND RHS");
41972 NewBlendMask[M] = M + NumElts;
41973 NewPermuteMask[I] = M;
41974 }
41975 }
41976 }
41977 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41978 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41979
41980 // v16i16 shuffles can explode in complexity very easily, only accept them if
41981 // the blend mask is the same in the 128-bit subvectors (or can widen to
41982 // v8i32) and the permute can be widened as well.
41983 if (VT == MVT::v16i16) {
41984 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41985 !canWidenShuffleElements(NewBlendMask))
41986 return SDValue();
41987 if (!canWidenShuffleElements(NewPermuteMask))
41988 return SDValue();
41989 }
41990
41991 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41992 // widened to a lane permute (vperm2f128).
41993 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41994 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41995 NewPermuteMask) &&
41996 !canScaleShuffleElements(NewPermuteMask, 2))
41997 return SDValue();
41998
41999 SDValue NewBlend =
42000 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42001 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42002 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42003 NewPermuteMask);
42004}
42005
42006// TODO - move this to TLI like isBinOp?
42007static bool isUnaryOp(unsigned Opcode) {
42008 switch (Opcode) {
42009 case ISD::CTLZ:
42010 case ISD::CTTZ:
42011 case ISD::CTPOP:
42012 return true;
42013 }
42014 return false;
42015}
42016
42017// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42018// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
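// Moving the shuffle onto the operands is profitable when it can fold into a
// constant, splat or another shuffle there, leaving the binary/unary op to
// act on pre-shuffled inputs at no extra cost.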
42019 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42020 const SDLoc &DL) {
42021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42022 EVT ShuffleVT = N.getValueType();
42023 unsigned Opc = N.getOpcode();
42024
42025 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42026 // AllZeros/AllOnes constants are freely shuffled and will peek through
42027 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42028 // merge with target shuffles if it has one use so shuffle combining is
42029 // likely to kick in. Shuffles of splats are expected to be removed.
42030 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42031 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42032 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42033 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42034 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42035 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42036 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42037 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42038 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42039 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42040 };
42041 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42042 // Ensure we only shuffle whole vector src elements, unless its a logical
42043 // binops where we can more aggressively move shuffles from dst to src.
42044 return isLogicOp(BinOp) ||
42045 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42046 };
42047
42048 switch (Opc) {
42049 // Unary and Unary+Permute Shuffles.
42050 case X86ISD::PSHUFB: {
42051 // Don't merge PSHUFB if it contains zero'd elements.
42052 SmallVector<int> Mask;
42053 SmallVector<SDValue> Ops;
42054 if (!getTargetShuffleMask(N, false, Ops, Mask))
42055 break;
42056 [[fallthrough]];
42057 }
42058 case X86ISD::VBROADCAST:
42059 case X86ISD::MOVDDUP:
42060 case X86ISD::PSHUFD:
42061 case X86ISD::PSHUFHW:
42062 case X86ISD::PSHUFLW:
42063 case X86ISD::VPERMV:
42064 case X86ISD::VPERMI:
42065 case X86ISD::VPERMILPI: {
42066 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42067 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42068 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42069 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42070 unsigned SrcOpcode = N0.getOpcode();
42071 EVT OpVT = N0.getValueType();
42072 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42073 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42074 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42075 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42076 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42077 IsMergeableWithShuffle(Op01, FoldShuf)) {
42078 SDValue LHS, RHS;
42079 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42080 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42081 if (Opc == X86ISD::VPERMV) {
42082 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42083 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42084 } else if (N.getNumOperands() == 2) {
42085 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42086 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42087 } else {
42088 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42089 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42090 }
42091 return DAG.getBitcast(ShuffleVT,
42092 DAG.getNode(SrcOpcode, DL, OpVT,
42093 DAG.getBitcast(OpVT, LHS),
42094 DAG.getBitcast(OpVT, RHS)));
42095 }
42096 }
42097 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42098 OpVT.getScalarSizeInBits() ==
42099 N0.getOperand(0).getScalarValueSizeInBits()) {
42100 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42101 if (Opc == X86ISD::VPERMV)
42102 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42103 else if (N.getNumOperands() == 2)
42104 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42105 else
42106 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42107 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42108 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42109 }
42110 }
42111 break;
42112 }
42113 // Binary and Binary+Permute Shuffles.
42114 case X86ISD::INSERTPS: {
42115 // Don't merge INSERTPS if it contains zero'd elements.
42116 unsigned InsertPSMask = N.getConstantOperandVal(2);
42117 unsigned ZeroMask = InsertPSMask & 0xF;
42118 if (ZeroMask != 0)
42119 break;
42120 [[fallthrough]];
42121 }
42122 case X86ISD::MOVSD:
42123 case X86ISD::MOVSS:
42124 case X86ISD::BLENDI:
42125 case X86ISD::SHUFP:
42126 case X86ISD::UNPCKH:
42127 case X86ISD::UNPCKL: {
42128 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42129 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42130 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42131 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42132 unsigned SrcOpcode = N0.getOpcode();
42133 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42134 N0.getValueType() == N1.getValueType() &&
42135 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42136 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42137 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42138 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42139 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42140 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42141 // Ensure the total number of shuffles doesn't increase by folding this
42142 // shuffle through to the source ops.
42143 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42144 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42145 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42146 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42147 SDValue LHS, RHS;
42148 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42149 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42150 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42151 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42152 if (N.getNumOperands() == 3) {
42153 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42154 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42155 } else {
42156 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42157 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42158 }
42159 EVT OpVT = N0.getValueType();
42160 return DAG.getBitcast(ShuffleVT,
42161 DAG.getNode(SrcOpcode, DL, OpVT,
42162 DAG.getBitcast(OpVT, LHS),
42163 DAG.getBitcast(OpVT, RHS)));
42164 }
42165 }
42166 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42167 N0.getValueType() == N1.getValueType() &&
42168 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42169 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42170 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42171 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42172 SDValue Res;
42173 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42174 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42175 if (N.getNumOperands() == 3) {
42176 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42177 } else {
42178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42179 }
42180 EVT OpVT = N0.getValueType();
42181 return DAG.getBitcast(
42182 ShuffleVT,
42183 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42184 }
42185 // TODO: We can generalize this for other shuffles/conversions.
42186 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42187 N1.getOpcode() == SrcOpcode &&
42188 N0.getValueType() == N1.getValueType() &&
42189 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42190 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42191 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42192 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42193 EVT OpSrcVT = N0.getOperand(0).getValueType();
42194 EVT OpDstVT = N0.getValueType();
42195 SDValue Res =
42196 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42197 return DAG.getBitcast(ShuffleVT,
42198 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42199 }
42200 }
42201 break;
42202 }
42203 }
42204 return SDValue();
42205}
42206
42207/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42208 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42209 SelectionDAG &DAG,
42210 const SDLoc &DL) {
42211 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42212
42213 MVT VT = V.getSimpleValueType();
42214 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42215 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42216 unsigned SrcOpc0 = Src0.getOpcode();
42217 unsigned SrcOpc1 = Src1.getOpcode();
42218 EVT SrcVT0 = Src0.getValueType();
42219 EVT SrcVT1 = Src1.getValueType();
42220
42221 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42222 return SDValue();
42223
42224 switch (SrcOpc0) {
42225 case X86ISD::MOVDDUP: {
42226 SDValue LHS = Src0.getOperand(0);
42227 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42228 SDValue Res =
42229 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42230 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42231 return DAG.getBitcast(VT, Res);
42232 }
42233 case X86ISD::VPERMILPI:
42234 // TODO: Handle v4f64 permutes with different low/high lane masks.
42235 if (SrcVT0 == MVT::v4f64) {
42236 uint64_t Mask = Src0.getConstantOperandVal(1);
42237 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42238 break;
42239 }
42240 [[fallthrough]];
42241 case X86ISD::VSHLI:
42242 case X86ISD::VSRLI:
42243 case X86ISD::VSRAI:
42244 case X86ISD::PSHUFD:
42245 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42246 SDValue LHS = Src0.getOperand(0);
42247 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42248 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42249 V.getOperand(2));
42250 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42251 return DAG.getBitcast(VT, Res);
42252 }
42253 break;
42254 }
42255
42256 return SDValue();
42257}
42258
42259/// Try to combine x86 target specific shuffles.
42260 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42261 SelectionDAG &DAG,
42262 TargetLowering::DAGCombinerInfo &DCI,
42263 const X86Subtarget &Subtarget) {
42264 using namespace SDPatternMatch;
42265
42266 MVT VT = N.getSimpleValueType();
42267 unsigned NumElts = VT.getVectorNumElements();
42268 SmallVector<int, 4> Mask;
42269 unsigned Opcode = N.getOpcode();
42270 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42271
42272 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42273 return R;
42274
42275 // Handle specific target shuffles.
42276 switch (Opcode) {
42277 case X86ISD::MOVDDUP: {
42278 SDValue Src = N.getOperand(0);
42279 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42280 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42281 ISD::isNormalLoad(Src.getNode())) {
42282 LoadSDNode *LN = cast<LoadSDNode>(Src);
42283 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42284 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42285 DCI.CombineTo(N.getNode(), Movddup);
42286 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42287 DCI.recursivelyDeleteUnusedNodes(LN);
42288 return N; // Return N so it doesn't get rechecked!
42289 }
42290 }
42291
42292 return SDValue();
42293 }
42294 case X86ISD::VBROADCAST: {
42295 SDValue Src = N.getOperand(0);
42296 SDValue BC = peekThroughBitcasts(Src);
42297 EVT SrcVT = Src.getValueType();
42298 EVT BCVT = BC.getValueType();
42299
42300 // If broadcasting from another shuffle, attempt to simplify it.
42301 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42302 if (isTargetShuffle(BC.getOpcode()) &&
42303 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42304 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42305 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42306 SM_SentinelUndef);
42307 for (unsigned i = 0; i != Scale; ++i)
42308 DemandedMask[i] = i;
42309 if (SDValue Res = combineX86ShufflesRecursively(
42310 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42311 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42312 /*AllowVariableCrossLaneMask=*/true,
42313 /*AllowVariablePerLaneMask=*/true,
42314 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42315 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42316 DAG.getBitcast(SrcVT, Res));
42317 }
42318
42319 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42320 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42321 if (Src.getOpcode() == ISD::BITCAST &&
42322 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42323 TLI.isTypeLegal(BCVT) &&
42324 FixedVectorType::isValidElementType(
42325 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42326 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42327 VT.getVectorNumElements());
42328 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42329 }
42330
42331 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42332 // If we're re-broadcasting a smaller type then broadcast with that type and
42333 // bitcast.
42334 // TODO: Do this for any splat?
42335 if (Src.getOpcode() == ISD::BITCAST &&
42336 (BC.getOpcode() == X86ISD::VBROADCAST ||
42337 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42338 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42339 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42340 MVT NewVT =
42341 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42342 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42343 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42344 }
42345
42346 // Reduce broadcast source vector to lowest 128-bits.
42347 if (SrcVT.getSizeInBits() > 128)
42348 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42349 extract128BitVector(Src, 0, DAG, DL));
42350
42351 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42352 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42353 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42354 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42355
42356 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42357 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42358 isNullConstant(Src.getOperand(1)) &&
42359 Src.getValueType() ==
42360 Src.getOperand(0).getValueType().getScalarType() &&
42361 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42362 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42363
42364 // Share broadcast with the longest vector and extract low subvector (free).
42365 // Ensure the same SDValue from the SDNode use is being used.
42366 for (SDNode *User : Src->users())
42367 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42368 Src == User->getOperand(0) &&
42369 User->getValueSizeInBits(0).getFixedValue() >
42370 VT.getFixedSizeInBits()) {
42371 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42372 VT.getSizeInBits());
42373 }
42374
42375 // vbroadcast(scalarload X) -> vbroadcast_load X
42376 // For float loads, extract other uses of the scalar from the broadcast.
42377 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42378 ISD::isNormalLoad(Src.getNode())) {
42379 LoadSDNode *LN = cast<LoadSDNode>(Src);
42380 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42381 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42382 SDValue BcastLd =
42383 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42384 LN->getMemoryVT(), LN->getMemOperand());
42385 // If the load value is used only by N, replace it via CombineTo N.
42386 bool NoReplaceExtract = Src.hasOneUse();
42387 DCI.CombineTo(N.getNode(), BcastLd);
42388 if (NoReplaceExtract) {
42389 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42390 DCI.recursivelyDeleteUnusedNodes(LN);
42391 } else {
42392 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42393 DAG.getVectorIdxConstant(0, DL));
42394 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42395 }
42396 return N; // Return N so it doesn't get rechecked!
42397 }
42398
42399 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42400 // i16. So shrink it ourselves if we can make a broadcast_load.
42401 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42402 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42403 assert(Subtarget.hasAVX2() && "Expected AVX2");
42404 SDValue TruncIn = Src.getOperand(0);
42405
42406 // If this is a truncate of a non extending load we can just narrow it to
42407 // use a broadcast_load.
42408 if (ISD::isNormalLoad(TruncIn.getNode())) {
42409 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42410 // Unless it's volatile or atomic.
42411 if (LN->isSimple()) {
42412 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42413 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42414 SDValue BcastLd = DAG.getMemIntrinsicNode(
42415 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42416 LN->getPointerInfo(), LN->getBaseAlign(),
42417 LN->getMemOperand()->getFlags());
42418 DCI.CombineTo(N.getNode(), BcastLd);
42419 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42420 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42421 return N; // Return N so it doesn't get rechecked!
42422 }
42423 }
42424
42425 // If this is a truncate of an i16 extload, we can directly replace it.
42426 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42427 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42428 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42429 if (LN->getMemoryVT().getSizeInBits() == 16) {
42430 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42431 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42432 SDValue BcastLd =
42433 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42434 LN->getMemoryVT(), LN->getMemOperand());
42435 DCI.CombineTo(N.getNode(), BcastLd);
42436 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42437 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42438 return N; // Return N so it doesn't get rechecked!
42439 }
42440 }
42441
42442 // If this is a truncate of load that has been shifted right, we can
42443 // offset the pointer and use a narrower load.
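// e.g. (i16 (trunc (srl (load i32 p), 16))) reads the halfword at p+2 on
// little-endian x86, so we can broadcast-load the i16 directly from the
// offset pointer instead of loading, shifting and truncating.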
42444 if (TruncIn.getOpcode() == ISD::SRL &&
42445 TruncIn.getOperand(0).hasOneUse() &&
42446 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42447 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42448 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42449 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42450 // Make sure the shift amount and the load size are divisible by 16.
42451 // Don't do this if the load is volatile or atomic.
42452 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42453 LN->isSimple()) {
42454 unsigned Offset = ShiftAmt / 8;
42455 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42456 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42457 TypeSize::getFixed(Offset), DL);
42458 SDValue Ops[] = { LN->getChain(), Ptr };
42459 SDValue BcastLd = DAG.getMemIntrinsicNode(
42460 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42461 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42462 LN->getMemOperand()->getFlags());
42463 DCI.CombineTo(N.getNode(), BcastLd);
42464 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42465 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42466 return N; // Return N so it doesn't get rechecked!
42467 }
42468 }
42469 }
42470
42471 // vbroadcast(vzload X) -> vbroadcast_load X
42472 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42473 auto *LN = cast<MemSDNode>(Src);
42474 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42475 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42476 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42477 SDValue BcastLd =
42478 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42479 LN->getMemoryVT(), LN->getMemOperand());
42480 DCI.CombineTo(N.getNode(), BcastLd);
42481 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42482 DCI.recursivelyDeleteUnusedNodes(LN);
42483 return N; // Return N so it doesn't get rechecked!
42484 }
42485 }
42486
42487 // vbroadcast(vector load X) -> vbroadcast_load
42488 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42489 LoadSDNode *LN = cast<LoadSDNode>(Src);
42490 // Unless the load is volatile or atomic.
42491 if (LN->isSimple()) {
42492 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42493 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42494 SDValue BcastLd = DAG.getMemIntrinsicNode(
42495 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42496 LN->getPointerInfo(), LN->getBaseAlign(),
42497 LN->getMemOperand()->getFlags());
42498 DCI.CombineTo(N.getNode(), BcastLd);
42499 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42500 DCI.recursivelyDeleteUnusedNodes(LN);
42501 return N; // Return N so it doesn't get rechecked!
42502 }
42503 }
42504
42505 return SDValue();
42506 }
42507 case X86ISD::VZEXT_MOVL: {
42508 SDValue N0 = N.getOperand(0);
42509
42510 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42511 // Zeroing out the upper elements means we're just shifting a zero value.
42512 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42513 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42514 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42515 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42516 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42517 if (N0.hasOneUse())
42518 return DAG.getNode(
42519 N0.getOpcode(), DL, VT,
42520 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42521 N0.getOperand(1));
42522 }
42523
42524 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42525 // the load is volatile.
42526 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42527 auto *LN = cast<LoadSDNode>(N0);
42528 if (SDValue VZLoad =
42529 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42530 DCI.CombineTo(N.getNode(), VZLoad);
42531 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42532 DCI.recursivelyDeleteUnusedNodes(LN);
42533 return N;
42534 }
42535 }
42536
42537 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42538 // and can just use a VZEXT_LOAD.
42539 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42540 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42541 auto *LN = cast<MemSDNode>(N0);
42542 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42543 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42544 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42545 SDValue VZLoad =
42546 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42547 LN->getMemoryVT(), LN->getMemOperand());
42548 DCI.CombineTo(N.getNode(), VZLoad);
42549 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42550 DCI.recursivelyDeleteUnusedNodes(LN);
42551 return N;
42552 }
42553 }
42554
42555 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42556 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42557 // if the upper bits of the i64 are zero.
42558 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42559 N0.getOperand(0).hasOneUse() &&
42560 N0.getOperand(0).getValueType() == MVT::i64) {
42561 SDValue In = N0.getOperand(0);
42562 APInt Mask = APInt::getHighBitsSet(64, 32);
42563 if (DAG.MaskedValueIsZero(In, Mask)) {
42564 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42565 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42566 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42567 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42568 return DAG.getBitcast(VT, Movl);
42569 }
42570 }
42571
42572 // Load a scalar integer constant directly to XMM instead of transferring an
42573 // immediate value from GPR.
42574 // vzext_movl (scalar_to_vector C) --> load [C,0...]
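// Materializing [C,0,...] from the constant pool avoids a GPR->XMM transfer
// (movd/movq) just to place an immediate in the low element.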
42575 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42576 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42577 // Create a vector constant - scalar constant followed by zeros.
42578 EVT ScalarVT = N0.getOperand(0).getValueType();
42579 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42580 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42581 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42582 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42583
42584 // Load the vector constant from constant pool.
42585 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42586 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42587 MachinePointerInfo MPI =
42588 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42590 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42591 MachineMemOperand::MOLoad);
42592 }
42593 }
42594
42595 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42596 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42597 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42598 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42599 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42600 SDValue V = peekThroughOneUseBitcasts(N0);
42601
42602 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42603 isNullConstant(V.getOperand(2))) {
42604 SDValue In = V.getOperand(1);
42605 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42606 In.getValueSizeInBits() /
42607 VT.getScalarSizeInBits());
42608 In = DAG.getBitcast(SubVT, In);
42609 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42610 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42611 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42612 V.getOperand(2));
42613 }
42614 }
42615
42616 return SDValue();
42617 }
42618 case X86ISD::BLENDI: {
42619 SDValue N0 = N.getOperand(0);
42620 SDValue N1 = N.getOperand(1);
42621 unsigned EltBits = VT.getScalarSizeInBits();
42622
42623 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42624 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42625 // TODO: Handle MVT::v16i16 repeated blend mask.
42626 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42627 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42628 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42629 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42630 unsigned NewSize = SrcVT.getVectorNumElements();
42631 APInt BlendMask = getBLENDIBlendMask(N);
42632 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42633 return DAG.getBitcast(
42634 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42635 N1.getOperand(0),
42636 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42637 DL, MVT::i8)));
42638 }
42639 }
42640 // Share PSHUFB masks:
42641 // blend(pshufb(x,m1),pshufb(y,m2))
42642 // --> m3 = blend(m1,m2)
42643 // blend(pshufb(x,m3),pshufb(y,m3))
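// Sharing one mask lets both PSHUFBs reuse a single shuffle-control constant:
// the byte-blend of the two original mask constants is folded into m3.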
42644 if (N0.hasOneUse() && N1.hasOneUse()) {
42645 SmallVector<int> Mask, ByteMask;
42646 SmallVector<SDValue> Ops;
42647 SDValue LHS = peekThroughOneUseBitcasts(N0);
42648 SDValue RHS = peekThroughOneUseBitcasts(N1);
42649 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42650 RHS.getOpcode() == X86ISD::PSHUFB &&
42651 LHS.getOperand(1) != RHS.getOperand(1) &&
42652 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42653 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42654 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42656 "BLENDI decode mismatch");
42657 MVT ShufVT = LHS.getSimpleValueType();
42658 SDValue MaskLHS = LHS.getOperand(1);
42659 SDValue MaskRHS = RHS.getOperand(1);
42660 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42661 if (SDValue NewMask = combineX86ShufflesConstants(
42662 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42663 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42664 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42665 LHS.getOperand(0), NewMask);
42666 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42667 RHS.getOperand(0), NewMask);
42668 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42669 DAG.getBitcast(VT, NewLHS),
42670 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42671 }
42672 }
42673 }
42674 }
42675 return SDValue();
42676 }
42677 case X86ISD::SHUFP: {
42678 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42679 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42680 // TODO: Support types other than v4f32.
42681 if (VT == MVT::v4f32) {
42682 bool Updated = false;
42683 SmallVector<int> Mask;
42684 SmallVector<SDValue> Ops;
42685 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42686 for (int i = 0; i != 2; ++i) {
42687 SmallVector<SDValue> SubOps;
42688 SmallVector<int> SubMask, SubScaledMask;
42689 SDValue Sub = peekThroughBitcasts(Ops[i]);
42690 // TODO: Scaling might be easier if we specify the demanded elts.
42691 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42692 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42693 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42694 int Ofs = i * 2;
42695 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42696 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42697 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42698 Updated = true;
42699 }
42700 }
42701 }
42702 if (Updated) {
42703 for (int &M : Mask)
42704 M %= 4;
42705 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42706 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42707 }
42708 }
42709 return SDValue();
42710 }
42711 case X86ISD::VPERMI: {
42712 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42713 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42714 SDValue N0 = N.getOperand(0);
42715 SDValue N1 = N.getOperand(1);
42716 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42717 if (N0.getOpcode() == ISD::BITCAST &&
42718 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42719 SDValue Src = N0.getOperand(0);
42720 EVT SrcVT = Src.getValueType();
42721 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42722 return DAG.getBitcast(VT, Res);
42723 }
42724 return SDValue();
42725 }
42726 case X86ISD::SHUF128: {
42727 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42728 // see if we can peek through and access the subvector directly.
42729 if (VT.is512BitVector()) {
42730 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42731 // the upper subvector is used.
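// e.g. (Mask & 0x0A) == 0x0A tests the msb of the two 2-bit selectors that
// index into LHS; if both are set, only LHS's upper 256 bits are referenced,
// so we can substitute that half (widened) and clear those msbs.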
42732 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42733 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42734 uint64_t Mask = N->getConstantOperandVal(2);
42735 SmallVector<SDValue> LHSOps, RHSOps;
42736 SDValue NewLHS, NewRHS;
42737 if ((Mask & 0x0A) == 0x0A &&
42738 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42739 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42740 Mask &= ~0x0A;
42741 }
42742 if ((Mask & 0xA0) == 0xA0 &&
42743 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42744 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42745 Mask &= ~0xA0;
42746 }
42747 if (NewLHS || NewRHS)
42748 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42749 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42750 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42751 DAG.getTargetConstant(Mask, DL, MVT::i8));
42752 }
42753 return SDValue();
42754 }
42755 case X86ISD::VPERM2X128: {
42756 SDValue LHS = N->getOperand(0);
42757 SDValue RHS = N->getOperand(1);
42758 unsigned Imm = N.getConstantOperandVal(2) & 255;
42759
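// Immediate layout (per VPERM2F128/VPERM2I128): each nibble's low two bits
// pick one of the four 128-bit source halves (0-1 from LHS, 2-3 from RHS) and
// bit 3 zeroes the lane, so XORing with 0x22 retargets both selectors to the
// other operand and masking with ~0x22 points them both at LHS.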
42760 // Canonicalize unary/repeated operands to LHS.
42761 if (LHS.isUndef() && !RHS.isUndef())
42762 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42763 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42764 if (LHS == RHS)
42765 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42766 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42767
42768 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42769 if (LHS.getOpcode() == ISD::BITCAST &&
42770 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42771 EVT SrcVT = LHS.getOperand(0).getValueType();
42772 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42773 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42774 DAG.getBitcast(SrcVT, LHS),
42775 DAG.getBitcast(SrcVT, RHS),
42776 N->getOperand(2)));
42777 }
42778 }
42779
42780 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42781 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42782 return Res;
42783
42784 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42785 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42786 auto FindSubVector128 = [&](unsigned Idx) {
42787 if (Idx > 3)
42788 return SDValue();
42789 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42790 SmallVector<SDValue> SubOps;
42791 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42792 return SubOps[Idx & 1];
42793 unsigned NumElts = Src.getValueType().getVectorNumElements();
42794 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42795 Src.getOperand(1).getValueSizeInBits() == 128 &&
42796 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42797 return Src.getOperand(1);
42798 }
42799 return SDValue();
42800 };
42801 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42802 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42803 MVT SubVT = VT.getHalfNumVectorElementsVT();
42804 SubLo = DAG.getBitcast(SubVT, SubLo);
42805 SubHi = DAG.getBitcast(SubVT, SubHi);
42806 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42807 }
42808 }
42809
42810 // Attempt to match VBROADCAST*128 subvector broadcast load.
42811 if (RHS.isUndef()) {
42813 DecodeVPERM2X128Mask(4, Imm, Mask);
42814 if (isUndefOrInRange(Mask, 0, 4)) {
42815 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42816 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42817 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42818 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42819 MVT MemVT = VT.getHalfNumVectorElementsVT();
42820 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42821 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42822 cast<LoadSDNode>(LHS), Ofs, DAG);
42823 }
42824 }
42825 }
42826
42827 return SDValue();
42828 }
42829 case X86ISD::PSHUFD:
42830 case X86ISD::PSHUFLW:
42831 case X86ISD::PSHUFHW: {
42832 SDValue N0 = N.getOperand(0);
42833 SDValue N1 = N.getOperand(1);
42834 if (N0->hasOneUse()) {
42835 SDValue V = peekThroughOneUseBitcasts(N0);
42836 switch (V.getOpcode()) {
42837 case X86ISD::VSHL:
42838 case X86ISD::VSRL:
42839 case X86ISD::VSRA:
42840 case X86ISD::VSHLI:
42841 case X86ISD::VSRLI:
42842 case X86ISD::VSRAI:
42843 case X86ISD::VROTLI:
42844 case X86ISD::VROTRI: {
42845 MVT InnerVT = V.getSimpleValueType();
42846 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42847 SDValue Res = DAG.getNode(Opcode, DL, VT,
42848 DAG.getBitcast(VT, V.getOperand(0)), N1);
42849 Res = DAG.getBitcast(InnerVT, Res);
42850 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42851 return DAG.getBitcast(VT, Res);
42852 }
42853 break;
42854 }
42855 }
42856 }
42857
42858 Mask = getPSHUFShuffleMask(N);
42859 assert(Mask.size() == 4);
42860 break;
42861 }
42862 case X86ISD::MOVSD:
42863 case X86ISD::MOVSH:
42864 case X86ISD::MOVSS: {
42865 SDValue N0 = N.getOperand(0);
42866 SDValue N1 = N.getOperand(1);
42867
42868 // Canonicalize scalar FPOps:
42869 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42870 // If commutable, allow OP(N1[0], N0[0]).
42871 unsigned Opcode1 = N1.getOpcode();
42872 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42873 Opcode1 == ISD::FDIV) {
42874 SDValue N10 = N1.getOperand(0);
42875 SDValue N11 = N1.getOperand(1);
42876 if (N10 == N0 ||
42877 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42878 if (N10 != N0)
42879 std::swap(N10, N11);
42880 MVT SVT = VT.getVectorElementType();
42881 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42882 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42883 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42884 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42885 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42886 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42887 }
42888 }
42889
42890 return SDValue();
42891 }
42892 case X86ISD::INSERTPS: {
42893 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42894 SDValue Op0 = N.getOperand(0);
42895 SDValue Op1 = N.getOperand(1);
42896 unsigned InsertPSMask = N.getConstantOperandVal(2);
42897 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42898 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42899 unsigned ZeroMask = InsertPSMask & 0xF;
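// INSERTPS immediate: bits [7:6] select the source element of Op1, bits [5:4]
// the destination slot in Op0, and bits [3:0] are a zero mask applied to the
// result.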
42900
42901 // If we zero out all elements from Op0 then we don't need to reference it.
42902 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42903 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42904 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42905
42906 // If we zero out the element from Op1 then we don't need to reference it.
42907 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42908 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42909 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42910
42911 // Attempt to merge insertps Op1 with an inner target shuffle node.
42912 SmallVector<int, 8> TargetMask1;
42913 SmallVector<SDValue, 2> Ops1;
42914 APInt KnownUndef1, KnownZero1;
42915 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42916 KnownZero1)) {
42917 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42918 // Zero/UNDEF insertion - zero out element and remove dependency.
42919 InsertPSMask |= (1u << DstIdx);
42920 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42921 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42922 }
42923 // Update insertps mask srcidx and reference the source input directly.
42924 int M = TargetMask1[SrcIdx];
42925 assert(0 <= M && M < 8 && "Shuffle index out of range");
42926 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42927 Op1 = Ops1[M < 4 ? 0 : 1];
42928 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42929 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42930 }
42931
42932 // Attempt to merge insertps Op0 with an inner target shuffle node.
42933 SmallVector<int, 8> TargetMask0;
42934 SmallVector<SDValue, 2> Ops0;
42935 APInt KnownUndef0, KnownZero0;
42936 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42937 KnownZero0)) {
42938 bool Updated = false;
42939 bool UseInput00 = false;
42940 bool UseInput01 = false;
42941 for (int i = 0; i != 4; ++i) {
42942 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42943 // No change if element is already zero or the inserted element.
42944 continue;
42945 }
42946
42947 if (KnownUndef0[i] || KnownZero0[i]) {
42948 // If the target mask is undef/zero then we must zero the element.
42949 InsertPSMask |= (1u << i);
42950 Updated = true;
42951 continue;
42952 }
42953
42954 // The input vector element must be inline.
42955 int M = TargetMask0[i];
42956 if (M != i && M != (i + 4))
42957 return SDValue();
42958
42959 // Determine which inputs of the target shuffle we're using.
42960 UseInput00 |= (0 <= M && M < 4);
42961 UseInput01 |= (4 <= M);
42962 }
42963
42964 // If we're not using both inputs of the target shuffle then use the
42965 // referenced input directly.
42966 if (UseInput00 && !UseInput01) {
42967 Updated = true;
42968 Op0 = Ops0[0];
42969 } else if (!UseInput00 && UseInput01) {
42970 Updated = true;
42971 Op0 = Ops0[1];
42972 }
42973
42974 if (Updated)
42975 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42976 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42977 }
42978
42979 // If we're inserting an element from a vbroadcast load, fold the
42980 // load into the X86insertps instruction. We need to convert the scalar
42981 // load to a vector and clear the source lane of the INSERTPS control.
42982 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42983 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42984 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42985 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42986 MemIntr->getBasePtr(),
42987 MemIntr->getMemOperand());
42988 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42989 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42990 Load),
42991 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42992 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42993 return Insert;
42994 }
42995 }
42996
42997 return SDValue();
42998 }
42999 case X86ISD::VPERMV: {
43000 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43001 SmallVector<int, 32> Mask;
43002 SmallVector<SDValue, 2> SrcOps, SubOps;
43003 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43004 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43005 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43006 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43007 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43008 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43009 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43010 "Unexpected split ops");
43011 // Bail if we were permuting a widened vector.
43012 if (SubOps[1].isUndef() &&
43013 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43014 return SDValue();
43015 // Bail if any subops would have folded into the concat.
43016 if (any_of(SubOps, isShuffleFoldableLoad))
43017 return SDValue();
43018 // Concat 4x128 back to 2x256.
43019 if (SubOps.size() == 4) {
43020 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43021 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43022 }
43023 // Convert mask to 2 operand shuffle.
43024 int HalfElts = NumElts / 2;
43025 for (int &M : Mask)
43026 M += M >= HalfElts ? HalfElts : 0;
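// e.g. with NumElts == 8 the original source was split into Lo = elts 0-3 and
// Hi = elts 4-7; an old index of 5 becomes 9, i.e. element 1 of the second
// VPERMV3 source, which is exactly where that value lives after the split.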
43027 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43028 VT.getSizeInBits());
43029 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43030 VT.getSizeInBits());
43031 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43032 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43033 }
43034 return SDValue();
43035 }
43036 case X86ISD::VPERMV3: {
43037 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43038 bool CanConcat = VT.is128BitVector() ||
43039 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43040 SmallVector<SDValue, 2> SrcOps;
43041 SmallVector<int, 32> Mask;
43042 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43043 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43044 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43045 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43046 // Canonicalize to VPERMV if both sources are the same.
43047 if (V1 == V2) {
43048 for (int &M : Mask)
43049 M = (M < 0 ? M : (M & (NumElts - 1)));
43050 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43051 DAG.getUNDEF(VT), Subtarget, DAG);
43052 }
43053 // If sources are half width, then concat and use VPERMV with adjusted
43054 // mask.
43055 SDValue Ops[2];
43056 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43057 if (sd_match(V1,
43058 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43059 sd_match(V2,
43060 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43061 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43062 if (SDValue ConcatSrc =
43063 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43064 for (int &M : Mask)
43065 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
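// e.g. with 4-element half sources concatenated into ConcatSrc, indices that
// referred to the second source (>= NumElts) now live NumElts/2 positions
// earlier, so index NumElts+i is rewritten to NumElts/2+i.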
43066 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43067 DAG.getUNDEF(VT), Subtarget, DAG);
43068 }
43069 }
43070 // Commute foldable source to the RHS.
43071 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43072 !isShuffleFoldableLoad(N.getOperand(2))) {
43073 ShuffleVectorSDNode::commuteMask(Mask);
43074 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43075 N.getOperand(0), Subtarget, DAG);
43076 }
43077 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43078 // freely concatenated, with a commuted shuffle mask.
43079 if (CanConcat) {
43080 if (SDValue ConcatSrc = combineConcatVectorOps(
43081 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43082 Subtarget)) {
43083 ShuffleVectorSDNode::commuteMask(Mask);
43084 Mask.append(NumElts, SM_SentinelUndef);
43085 SDValue Perm =
43086 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43087 DAG.getUNDEF(WideVT), Subtarget, DAG);
43088 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43089 DAG.getVectorIdxConstant(0, DL));
43090 }
43091 }
43092 }
43093 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43094 // freely concatenated.
43095 if (CanConcat) {
43096 if (SDValue ConcatSrc = combineConcatVectorOps(
43097 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43098 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43099 DL, WideVT.getSizeInBits());
43100 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43101 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43102 DAG.getVectorIdxConstant(0, DL));
43103 }
43104 }
43105 return SDValue();
43106 }
43107 default:
43108 return SDValue();
43109 }
43110
43111 // Nuke no-op shuffles that show up after combining.
43112 if (isNoopShuffleMask(Mask))
43113 return N.getOperand(0);
43114
43115 // Look for simplifications involving one or two shuffle instructions.
43116 SDValue V = N.getOperand(0);
43117 switch (N.getOpcode()) {
43118 default:
43119 break;
43120 case X86ISD::PSHUFLW:
43121 case X86ISD::PSHUFHW:
43122 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43123
43124 // See if this reduces to a PSHUFD which is no more expensive and can
43125 // combine with more operations. Note that it has to at least flip the
43126 // dwords as otherwise it would have been removed as a no-op.
43127 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43128 int DMask[] = {0, 1, 2, 3};
43129 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43130 DMask[DOffset + 0] = DOffset + 1;
43131 DMask[DOffset + 1] = DOffset + 0;
43132 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43133 V = DAG.getBitcast(DVT, V);
43134 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43135 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43136 return DAG.getBitcast(VT, V);
43137 }
43138
43139 // Look for shuffle patterns which can be implemented as a single unpack.
43140 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43141 // only works when we have a PSHUFD followed by two half-shuffles.
43142 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43143 (V.getOpcode() == X86ISD::PSHUFLW ||
43144 V.getOpcode() == X86ISD::PSHUFHW) &&
43145 V.getOpcode() != N.getOpcode() &&
43146 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43147 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43148 if (D.getOpcode() == X86ISD::PSHUFD) {
43149 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43150 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43151 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43152 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43153 int WordMask[8];
43154 for (int i = 0; i < 4; ++i) {
43155 WordMask[i + NOffset] = Mask[i] + NOffset;
43156 WordMask[i + VOffset] = VMask[i] + VOffset;
43157 }
43158 // Map the word mask through the DWord mask.
43159 int MappedMask[8];
43160 for (int i = 0; i < 8; ++i)
43161 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43162 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43163 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43164 // We can replace all three shuffles with an unpack.
43165 V = DAG.getBitcast(VT, D.getOperand(0));
43166 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43168 DL, VT, V, V);
43169 }
43170 }
43171 }
43172
43173 break;
43174
43175 case X86ISD::PSHUFD:
43176 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43177 return NewN;
43178
43179 break;
43180 }
43181
43182 return SDValue();
43183}
43184
43185/// Checks if the shuffle mask takes subsequent elements
43186/// alternately from two vectors.
43187/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43188static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43189
43190 int ParitySrc[2] = {-1, -1};
43191 unsigned Size = Mask.size();
43192 for (unsigned i = 0; i != Size; ++i) {
43193 int M = Mask[i];
43194 if (M < 0)
43195 continue;
43196
43197 // Make sure we are using the matching element from the input.
43198 if ((M % Size) != i)
43199 return false;
43200
43201 // Make sure we use the same input for all elements of the same parity.
43202 int Src = M / Size;
43203 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43204 return false;
43205 ParitySrc[i % 2] = Src;
43206 }
43207
43208 // Make sure each input is used.
43209 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43210 return false;
43211
43212 Op0Even = ParitySrc[0] == 0;
43213 return true;
43214}
43215
43216/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43217/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
43218/// are written to the parameters \p Opnd0 and \p Opnd1.
43219///
43220/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43221/// so it is easier to generically match. We also insert dummy vector shuffle
43222/// nodes for the operands which explicitly discard the lanes which are unused
43223/// by this operation to try to flow through the rest of the combiner
43224/// the fact that they're unused.
43225static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43226 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43227 bool &IsSubAdd, bool &HasAllowContract) {
43228
43229 EVT VT = N->getValueType(0);
43230 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43231 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43232 !VT.getSimpleVT().isFloatingPoint())
43233 return false;
43234
43235 // We only handle target-independent shuffles.
43236 // FIXME: It would be easy and harmless to use the target shuffle mask
43237 // extraction tool to support more.
43238 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43239 return false;
43240
43241 SDValue V1 = N->getOperand(0);
43242 SDValue V2 = N->getOperand(1);
43243
43244 // Make sure we have an FADD and an FSUB.
43245 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43246 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43247 V1.getOpcode() == V2.getOpcode())
43248 return false;
43249
43250 // If there are other uses of these operations we can't fold them.
43251 if (!V1->hasOneUse() || !V2->hasOneUse())
43252 return false;
43253
43254 // Ensure that both operations have the same operands. Note that we can
43255 // commute the FADD operands.
43256 SDValue LHS, RHS;
43257 if (V1.getOpcode() == ISD::FSUB) {
43258 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43259 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43260 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43261 return false;
43262 } else {
43263 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43264 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43265 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43266 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43267 return false;
43268 }
43269
43270 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43271 bool Op0Even;
43272 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43273 return false;
43274
43275 // It's a subadd if the vector in the even parity is an FADD.
43276 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43277 : V2->getOpcode() == ISD::FADD;
43278 HasAllowContract =
43279 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43280
43281 Opnd0 = LHS;
43282 Opnd1 = RHS;
43283 return true;
43284}
43285
43286/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43287 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43288 const X86Subtarget &Subtarget,
43289 SelectionDAG &DAG) {
43290 // We only handle target-independent shuffles.
43291 // FIXME: It would be easy and harmless to use the target shuffle mask
43292 // extraction tool to support more.
43293 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43294 return SDValue();
43295
43296 MVT VT = N->getSimpleValueType(0);
43297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43298 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43299 return SDValue();
43300
43301 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43302 SDValue Op0 = N->getOperand(0);
43303 SDValue Op1 = N->getOperand(1);
43304 SDValue FMAdd = Op0, FMSub = Op1;
43305 if (FMSub.getOpcode() != X86ISD::FMSUB)
43306 std::swap(FMAdd, FMSub);
43307
43308 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43309 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43310 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43311 FMAdd.getOperand(2) != FMSub.getOperand(2))
43312 return SDValue();
43313
43314 // Check for correct shuffle mask.
43315 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43316 bool Op0Even;
43317 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43318 return SDValue();
43319
43320 // FMAddSub takes zeroth operand from FMSub node.
43321 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43322 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43323 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43324 FMAdd.getOperand(2));
43325}
43326
43327/// Try to combine a shuffle into a target-specific add-sub or
43328/// mul-add-sub node.
43329static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43330 const X86Subtarget &Subtarget,
43331 SelectionDAG &DAG) {
43332 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43333 return V;
43334
43335 SDValue Opnd0, Opnd1;
43336 bool IsSubAdd;
43337 bool HasAllowContract;
43338 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43339 HasAllowContract))
43340 return SDValue();
43341
43342 MVT VT = N->getSimpleValueType(0);
43343
43344 // Try to generate X86ISD::FMADDSUB node here.
43345 SDValue Opnd2;
43346 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43347 HasAllowContract)) {
43348 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43349 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43350 }
43351
43352 if (IsSubAdd)
43353 return SDValue();
43354
43355 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43356 // the ADDSUB idiom has been successfully recognized. There are no known
43357 // X86 targets with 512-bit ADDSUB instructions!
43358 if (VT.is512BitVector())
43359 return SDValue();
43360
43361 // Do not generate X86ISD::ADDSUB node for FP16 vector types even though
43362 // the ADDSUB idiom has been successfully recognized. There are no known
43363 // X86 targets with FP16 ADDSUB instructions!
43364 if (VT.getVectorElementType() == MVT::f16)
43365 return SDValue();
43366
43367 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43368}
43369
43370/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43371/// low half of each source vector and does not set any high half elements in
43372/// the destination vector, narrow the shuffle to half its original size.
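///
/// Illustrative example (not from the original source): a shuffle such as
///   shuffle v8f32 A, B, <0, 8, 1, 9, u, u, u, u>
/// only reads the low halves of A and B and leaves the upper half of the
/// result undef, so it can be rebuilt as a v4f32 shuffle of the low
/// subvectors and reinserted into a wide undef vector.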
43373static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43374 EVT VT = Shuf->getValueType(0);
43375 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43376 return SDValue();
43377 if (!VT.is256BitVector() && !VT.is512BitVector())
43378 return SDValue();
43379
43380 // See if we can ignore all of the high elements of the shuffle.
43381 ArrayRef<int> Mask = Shuf->getMask();
43382 if (!isUndefUpperHalf(Mask))
43383 return SDValue();
43384
43385 // Check if the shuffle mask accesses only the low half of each input vector
43386 // (half-index output is 0 or 2).
43387 int HalfIdx1, HalfIdx2;
43388 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43389 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43390 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43391 return SDValue();
43392
43393 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43394 // The trick is knowing that all of the insert/extract are actually free
43395 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43396 // of narrow inputs into a narrow output, and that is always cheaper than
43397 // the wide shuffle that we started with.
43398 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43399 Shuf->getOperand(1), HalfMask, HalfIdx1,
43400 HalfIdx2, false, DAG, /*UseConcat*/ true);
43401}
43402
43403static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43404 TargetLowering::DAGCombinerInfo &DCI,
43405 const X86Subtarget &Subtarget) {
43406 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43407 if (SDValue V = narrowShuffle(Shuf, DAG))
43408 return V;
43409
43410 // If we have legalized the vector types, look for blends of FADD and FSUB
43411 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43412 SDLoc dl(N);
43413 EVT VT = N->getValueType(0);
43414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43415 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43416 if (SDValue AddSub =
43417 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43418 return AddSub;
43419
43420 // Attempt to combine into a vector load/broadcast.
43421 if (SDValue LD = combineToConsecutiveLoads(
43422 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43423 return LD;
43424
43425 if (isTargetShuffle(N->getOpcode())) {
43426 SDValue Op(N, 0);
43427 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43428 return Shuffle;
43429
43430 // Try recursively combining arbitrary sequences of x86 shuffle
43431 // instructions into higher-order shuffles. We do this after combining
43432 // specific PSHUF instruction sequences into their minimal form so that we
43433 // can evaluate how many specialized shuffle instructions are involved in
43434 // a particular chain.
43435 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43436 return Res;
43437
43438 // Simplify source operands based on shuffle mask.
43439 // TODO - merge this into combineX86ShufflesRecursively.
43440 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43441 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43442 return SDValue(N, 0);
43443
43444 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43445 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43446 // Perform this after other shuffle combines to allow inner shuffles to be
43447 // combined away first.
43448 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43449 return BinOp;
43450 }
43451
43452 return SDValue();
43453}
43454
43455// Simplify variable target shuffle masks based on the demanded elements.
43456// TODO: Handle DemandedBits in mask indices as well?
43457bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43458 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43459 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43460 // If we're demanding all elements don't bother trying to simplify the mask.
43461 unsigned NumElts = DemandedElts.getBitWidth();
43462 if (DemandedElts.isAllOnes())
43463 return false;
43464
43465 SDValue Mask = Op.getOperand(MaskIndex);
43466 if (!Mask.hasOneUse())
43467 return false;
43468
43469 // Attempt to generically simplify the variable shuffle mask.
43470 APInt MaskUndef, MaskZero;
43471 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43472 Depth + 1))
43473 return true;
43474
43475 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43476 // TODO: Support other types from getTargetShuffleMaskIndices?
43477 SDValue BC = peekThroughOneUseBitcasts(Mask);
43478 EVT BCVT = BC.getValueType();
43479 auto *Load = dyn_cast<LoadSDNode>(BC);
43480 if (!Load || !Load->getBasePtr().hasOneUse())
43481 return false;
43482
43483 const Constant *C = getTargetConstantFromNode(Load);
43484 if (!C)
43485 return false;
43486
43487 Type *CTy = C->getType();
43488 if (!CTy->isVectorTy() ||
43489 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43490 return false;
43491
43492 // Handle scaling for i64 elements on 32-bit targets.
43493 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43494 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43495 return false;
43496 unsigned Scale = NumCstElts / NumElts;
43497
43498 // Simplify mask if we have an undemanded element that is not undef.
43499 bool Simplified = false;
43500 SmallVector<Constant *, 32> ConstVecOps;
43501 for (unsigned i = 0; i != NumCstElts; ++i) {
43502 Constant *Elt = C->getAggregateElement(i);
43503 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43504 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43505 Simplified = true;
43506 continue;
43507 }
43508 ConstVecOps.push_back(Elt);
43509 }
43510 if (!Simplified)
43511 return false;
43512
43513 // Generate new constant pool entry + legalize immediately for the load.
43514 SDLoc DL(Op);
43515 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43516 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43517 SDValue NewMask = TLO.DAG.getLoad(
43518 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43519 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43520 Load->getAlign());
43521 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43522}
43523
43524bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43525 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43526 TargetLoweringOpt &TLO, unsigned Depth) const {
43527 int NumElts = DemandedElts.getBitWidth();
43528 unsigned Opc = Op.getOpcode();
43529 EVT VT = Op.getValueType();
43530
43531 // Handle special case opcodes.
43532 switch (Opc) {
43533 case X86ISD::PMULDQ:
43534 case X86ISD::PMULUDQ: {
43535 APInt LHSUndef, LHSZero;
43536 APInt RHSUndef, RHSZero;
43537 SDValue LHS = Op.getOperand(0);
43538 SDValue RHS = Op.getOperand(1);
43539 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43540 Depth + 1))
43541 return true;
43542 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43543 Depth + 1))
43544 return true;
43545 // Multiply by zero.
43546 KnownZero = LHSZero | RHSZero;
43547 break;
43548 }
43549 case X86ISD::VPMADDUBSW:
43550 case X86ISD::VPMADDWD: {
43551 APInt LHSUndef, LHSZero;
43552 APInt RHSUndef, RHSZero;
43553 SDValue LHS = Op.getOperand(0);
43554 SDValue RHS = Op.getOperand(1);
43555 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43556
43557 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43558 Depth + 1))
43559 return true;
43560 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43561 Depth + 1))
43562 return true;
43563
43564 // TODO: Multiply by zero.
43565
43566 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43567 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43568 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43569 Depth + 1))
43570 return true;
43571 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43572 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43573 Depth + 1))
43574 return true;
43575 break;
43576 }
43577 case X86ISD::PSADBW: {
43578 SDValue LHS = Op.getOperand(0);
43579 SDValue RHS = Op.getOperand(1);
43580 assert(VT.getScalarType() == MVT::i64 &&
43581 LHS.getValueType() == RHS.getValueType() &&
43582 LHS.getValueType().getScalarType() == MVT::i8 &&
43583 "Unexpected PSADBW types");
43584
43585 // Aggressively peek through ops to get at the demanded elts.
43586 if (!DemandedElts.isAllOnes()) {
43587 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43588 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43589 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43590 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43591 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43592 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43593 if (NewLHS || NewRHS) {
43594 NewLHS = NewLHS ? NewLHS : LHS;
43595 NewRHS = NewRHS ? NewRHS : RHS;
43596 return TLO.CombineTo(
43597 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43598 }
43599 }
43600 break;
43601 }
43602 case X86ISD::VSHL:
43603 case X86ISD::VSRL:
43604 case X86ISD::VSRA: {
43605 // We only need the bottom 64-bits of the (128-bit) shift amount.
43606 SDValue Amt = Op.getOperand(1);
43607 MVT AmtVT = Amt.getSimpleValueType();
43608 assert(AmtVT.is128BitVector() && "Unexpected value type");
43609
43610 // If the shift amount is only reused as an sse shift amount then we know
43611 // that only the bottom 64-bits are ever used.
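// (Illustrative note: even if Amt has other uses, as long as every user is
// another VSHL/VSRL/VSRA consuming it as the shift amount, the upper 64 bits
// of Amt are irrelevant to all of them, so AssumeSingleUse is safe.)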
43612 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43613 unsigned UseOpc = Use->getOpcode();
43614 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43615 UseOpc == X86ISD::VSRA) &&
43616 Use->getOperand(0) != Amt;
43617 });
43618
43619 APInt AmtUndef, AmtZero;
43620 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43621 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43622 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43623 Depth + 1, AssumeSingleUse))
43624 return true;
43625 [[fallthrough]];
43626 }
43627 case X86ISD::VSHLI:
43628 case X86ISD::VSRLI:
43629 case X86ISD::VSRAI: {
43630 SDValue Src = Op.getOperand(0);
43631 APInt SrcUndef;
43632 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43633 Depth + 1))
43634 return true;
43635
43636 // Fold shift(0,x) -> 0
43637 if (DemandedElts.isSubsetOf(KnownZero))
43638 return TLO.CombineTo(
43639 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43640
43641 // Aggressively peek through ops to get at the demanded elts.
43642 if (!DemandedElts.isAllOnes())
43643 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43644 Src, DemandedElts, TLO.DAG, Depth + 1))
43645 return TLO.CombineTo(
43646 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43647 break;
43648 }
43649 case X86ISD::VPSHA:
43650 case X86ISD::VPSHL:
43651 case X86ISD::VSHLV:
43652 case X86ISD::VSRLV:
43653 case X86ISD::VSRAV: {
43654 APInt LHSUndef, LHSZero;
43655 APInt RHSUndef, RHSZero;
43656 SDValue LHS = Op.getOperand(0);
43657 SDValue RHS = Op.getOperand(1);
43658 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43659 Depth + 1))
43660 return true;
43661
43662 // Fold shift(0,x) -> 0
43663 if (DemandedElts.isSubsetOf(LHSZero))
43664 return TLO.CombineTo(
43665 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43666
43667 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43668 Depth + 1))
43669 return true;
43670
43671 KnownZero = LHSZero;
43672 break;
43673 }
43674 case X86ISD::CMPM:
43675 case X86ISD::CMPP: {
43676 // Scalarize packed fp comparison if we only require element 0.
43677 if (DemandedElts == 1) {
43678 SDLoc dl(Op);
43679 MVT VT = Op.getSimpleValueType();
43680 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43681 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43682 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43683 SDValue CC = Op.getOperand(2);
43684 if (Opc == X86ISD::CMPM) {
43685 SDValue Cmp =
43686 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43687 return TLO.CombineTo(
43688 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43689 }
43690 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43691 return TLO.CombineTo(Op,
43692 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43693 }
43694 break;
43695 }
43696 case X86ISD::PCMPEQ:
43697 case X86ISD::PCMPGT: {
43698 APInt LHSUndef, LHSZero;
43699 APInt RHSUndef, RHSZero;
43700 SDValue LHS = Op.getOperand(0);
43701 SDValue RHS = Op.getOperand(1);
43702 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43703 Depth + 1))
43704 return true;
43705 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43706 Depth + 1))
43707 return true;
43708 break;
43709 }
43710 case X86ISD::KSHIFTL: {
43711 SDValue Src = Op.getOperand(0);
43712 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43713 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43714 unsigned ShiftAmt = Amt->getZExtValue();
43715
43716 if (ShiftAmt == 0)
43717 return TLO.CombineTo(Op, Src);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
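// e.g. kshiftl(kshiftr(X, 2), 5) -> kshiftl(X, 3) when none of the low 5
// result elements are demanded.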
43722 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43723 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43724 unsigned C1 = Src.getConstantOperandVal(1);
43725 unsigned NewOpc = X86ISD::KSHIFTL;
43726 int Diff = ShiftAmt - C1;
43727 if (Diff < 0) {
43728 Diff = -Diff;
43729 NewOpc = X86ISD::KSHIFTR;
43730 }
43731
43732 SDLoc dl(Op);
43733 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43734 return TLO.CombineTo(
43735 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43736 }
43737 }
43738
43739 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43740 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43741 Depth + 1))
43742 return true;
43743
43744 KnownUndef <<= ShiftAmt;
43745 KnownZero <<= ShiftAmt;
43746 KnownZero.setLowBits(ShiftAmt);
43747 break;
43748 }
43749 case X86ISD::KSHIFTR: {
43750 SDValue Src = Op.getOperand(0);
43751 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43752 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43753 unsigned ShiftAmt = Amt->getZExtValue();
43754
43755 if (ShiftAmt == 0)
43756 return TLO.CombineTo(Op, Src);
43757
43758 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43759 // single shift. We can do this if the top bits (which are shifted
43760 // out) are never demanded.
43761 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43762 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43763 unsigned C1 = Src.getConstantOperandVal(1);
43764 unsigned NewOpc = X86ISD::KSHIFTR;
43765 int Diff = ShiftAmt - C1;
43766 if (Diff < 0) {
43767 Diff = -Diff;
43768 NewOpc = X86ISD::KSHIFTL;
43769 }
43770
43771 SDLoc dl(Op);
43772 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43773 return TLO.CombineTo(
43774 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43775 }
43776 }
43777
43778 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43779 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43780 Depth + 1))
43781 return true;
43782
43783 KnownUndef.lshrInPlace(ShiftAmt);
43784 KnownZero.lshrInPlace(ShiftAmt);
43785 KnownZero.setHighBits(ShiftAmt);
43786 break;
43787 }
43788 case X86ISD::ANDNP: {
43789 // ANDNP = (~LHS & RHS);
43790 SDValue LHS = Op.getOperand(0);
43791 SDValue RHS = Op.getOperand(1);
43792
43793 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43794 APInt UndefElts;
43795 SmallVector<APInt> EltBits;
43796 int NumElts = VT.getVectorNumElements();
43797 int EltSizeInBits = VT.getScalarSizeInBits();
43798 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43799 APInt OpElts = DemandedElts;
43800 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43801 EltBits)) {
43802 OpBits.clearAllBits();
43803 OpElts.clearAllBits();
43804 for (int I = 0; I != NumElts; ++I) {
43805 if (!DemandedElts[I])
43806 continue;
43807 if (UndefElts[I]) {
43808 // We can't assume an undef src element gives an undef dst - the
43809 // other src might be zero.
43810 OpBits.setAllBits();
43811 OpElts.setBit(I);
43812 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43813 (!Invert && !EltBits[I].isZero())) {
43814 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43815 OpElts.setBit(I);
43816 }
43817 }
43818 }
43819 return std::make_pair(OpBits, OpElts);
43820 };
43821 APInt BitsLHS, EltsLHS;
43822 APInt BitsRHS, EltsRHS;
43823 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43824 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43825
43826 APInt LHSUndef, LHSZero;
43827 APInt RHSUndef, RHSZero;
43828 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43829 Depth + 1))
43830 return true;
43831 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43832 Depth + 1))
43833 return true;
43834
43835 if (!DemandedElts.isAllOnes()) {
43836 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43837 TLO.DAG, Depth + 1);
43838 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43839 TLO.DAG, Depth + 1);
43840 if (NewLHS || NewRHS) {
43841 NewLHS = NewLHS ? NewLHS : LHS;
43842 NewRHS = NewRHS ? NewRHS : RHS;
43843 return TLO.CombineTo(
43844 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43845 }
43846 }
43847 break;
43848 }
43849 case X86ISD::CVTSI2P:
43850 case X86ISD::CVTUI2P:
43851 case X86ISD::CVTPH2PS:
43852 case X86ISD::CVTPS2PH: {
43853 SDValue Src = Op.getOperand(0);
43854 EVT SrcVT = Src.getValueType();
43855 APInt SrcUndef, SrcZero;
43856 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43857 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43858 Depth + 1))
43859 return true;
43860 break;
43861 }
43862 case X86ISD::PACKSS:
43863 case X86ISD::PACKUS: {
43864 SDValue N0 = Op.getOperand(0);
43865 SDValue N1 = Op.getOperand(1);
43866
43867 APInt DemandedLHS, DemandedRHS;
43868 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43869
43870 APInt LHSUndef, LHSZero;
43871 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43872 Depth + 1))
43873 return true;
43874 APInt RHSUndef, RHSZero;
43875 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43876 Depth + 1))
43877 return true;
43878
43879 // TODO - pass on known zero/undef.
43880
43881 // Aggressively peek through ops to get at the demanded elts.
43882 // TODO - we should do this for all target/faux shuffles ops.
43883 if (!DemandedElts.isAllOnes()) {
43884 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43885 TLO.DAG, Depth + 1);
43886 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43887 TLO.DAG, Depth + 1);
43888 if (NewN0 || NewN1) {
43889 NewN0 = NewN0 ? NewN0 : N0;
43890 NewN1 = NewN1 ? NewN1 : N1;
43891 return TLO.CombineTo(Op,
43892 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43893 }
43894 }
43895 break;
43896 }
43897 case X86ISD::HADD:
43898 case X86ISD::HSUB:
43899 case X86ISD::FHADD:
43900 case X86ISD::FHSUB: {
43901 SDValue N0 = Op.getOperand(0);
43902 SDValue N1 = Op.getOperand(1);
43903
43904 APInt DemandedLHS, DemandedRHS;
43905 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43906
43907 APInt LHSUndef, LHSZero;
43908 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43909 Depth + 1))
43910 return true;
43911 APInt RHSUndef, RHSZero;
43912 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43913 Depth + 1))
43914 return true;
43915
43916 // TODO - pass on known zero/undef.
43917
43918 // Aggressively peek through ops to get at the demanded elts.
43919 // TODO: Handle repeated operands.
43920 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43921 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43922 TLO.DAG, Depth + 1);
43923 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43924 TLO.DAG, Depth + 1);
43925 if (NewN0 || NewN1) {
43926 NewN0 = NewN0 ? NewN0 : N0;
43927 NewN1 = NewN1 ? NewN1 : N1;
43928 return TLO.CombineTo(Op,
43929 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43930 }
43931 }
43932 break;
43933 }
43934 case X86ISD::VTRUNC:
43935 case X86ISD::VTRUNCS:
43936 case X86ISD::VTRUNCUS: {
43937 SDValue Src = Op.getOperand(0);
43938 MVT SrcVT = Src.getSimpleValueType();
43939 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43940 APInt SrcUndef, SrcZero;
43941 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43942 Depth + 1))
43943 return true;
43944 KnownZero = SrcZero.zextOrTrunc(NumElts);
43945 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43946 break;
43947 }
43948 case X86ISD::BLENDI: {
43949 SmallVector<int, 16> BlendMask;
43950 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43951 if (SDValue R = combineBlendOfPermutes(
43952 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43953 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43954 return TLO.CombineTo(Op, R);
43955 break;
43956 }
43957 case X86ISD::BLENDV: {
43958 APInt SelUndef, SelZero;
43959 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43960 SelZero, TLO, Depth + 1))
43961 return true;
43962
43963 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43964 APInt LHSUndef, LHSZero;
43965 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43966 LHSZero, TLO, Depth + 1))
43967 return true;
43968
43969 APInt RHSUndef, RHSZero;
43970 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43971 RHSZero, TLO, Depth + 1))
43972 return true;
43973
43974 KnownZero = LHSZero & RHSZero;
43975 KnownUndef = LHSUndef & RHSUndef;
43976 break;
43977 }
43978 case X86ISD::VZEXT_MOVL: {
43979 // If upper demanded elements are already zero then we have nothing to do.
43980 SDValue Src = Op.getOperand(0);
43981 APInt DemandedUpperElts = DemandedElts;
43982 DemandedUpperElts.clearLowBits(1);
43983 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43984 return TLO.CombineTo(Op, Src);
43985 break;
43986 }
43987 case X86ISD::VZEXT_LOAD: {
43988 // If the upper elements are not demanded then simplify to a
43989 // scalar_to_vector(load()).
43990 MVT SVT = VT.getSimpleVT().getVectorElementType();
43991 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43992 SDLoc DL(Op);
43993 auto *Mem = cast<MemSDNode>(Op);
43994 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43995 Mem->getMemOperand());
43996 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43997 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43998 }
43999 break;
44000 }
44001 case X86ISD::VBROADCAST: {
44002 SDValue Src = Op.getOperand(0);
44003 MVT SrcVT = Src.getSimpleValueType();
44004 // Don't bother broadcasting if we just need the 0'th element.
44005 if (DemandedElts == 1) {
44006 if (!SrcVT.isVector())
44007 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44008 else if (Src.getValueType() != VT)
44009 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44010 SDLoc(Op));
44011 return TLO.CombineTo(Op, Src);
44012 }
44013 if (!SrcVT.isVector())
44014 break;
44015 APInt SrcUndef, SrcZero;
44016 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44017 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44018 Depth + 1))
44019 return true;
44020 // Aggressively peek through src to get at the demanded elt.
44021 // TODO - we should do this for all target/faux shuffles ops.
44022 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44023 Src, SrcElts, TLO.DAG, Depth + 1))
44024 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44025 break;
44026 }
44027 case X86ISD::VPERMV:
44028 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44029 Depth))
44030 return true;
44031 break;
44032 case X86ISD::PSHUFB:
44033 case X86ISD::VPERMV3:
44034 case X86ISD::VPERMILPV:
44035 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44036 Depth))
44037 return true;
44038 break;
44039 case X86ISD::VPPERM:
44040 case X86ISD::VPERMIL2:
44041 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44042 Depth))
44043 return true;
44044 break;
44045 }
44046
44047 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44048 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44049 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44050 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44051 DemandedElts.lshr(NumElts / 2) == 0) {
44052 unsigned SizeInBits = VT.getSizeInBits();
44053 unsigned ExtSizeInBits = SizeInBits / 2;
44054
44055 // See if 512-bit ops only use the bottom 128-bits.
44056 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44057 ExtSizeInBits = SizeInBits / 4;
44058
44059 switch (Opc) {
44060 // Scalar broadcast.
44061 case X86ISD::VBROADCAST: {
44062 SDLoc DL(Op);
44063 SDValue Src = Op.getOperand(0);
44064 if (Src.getValueSizeInBits() > ExtSizeInBits)
44065 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44066 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44067 ExtSizeInBits / VT.getScalarSizeInBits());
44068 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44069 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44070 TLO.DAG, DL, ExtSizeInBits));
44071 }
44072 case X86ISD::VBROADCAST_LOAD: {
44073 SDLoc DL(Op);
44074 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44075 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44076 ExtSizeInBits / VT.getScalarSizeInBits());
44077 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44078 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44079 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44080 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44081 MemIntr->getMemOperand());
44082 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44083 Bcst.getValue(1));
44084 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44085 TLO.DAG, DL, ExtSizeInBits));
44086 }
44087 // Subvector broadcast.
44088 case X86ISD::SUBV_BROADCAST_LOAD: {
44089 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44090 EVT MemVT = MemIntr->getMemoryVT();
44091 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44092 SDLoc DL(Op);
44093 SDValue Ld =
44094 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44095 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44096 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44097 Ld.getValue(1));
44098 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44099 TLO.DAG, DL, ExtSizeInBits));
44100 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44101 SDLoc DL(Op);
44102 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44103 ExtSizeInBits / VT.getScalarSizeInBits());
44104 if (SDValue BcstLd =
44105 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44106 return TLO.CombineTo(Op,
44107 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44108 TLO.DAG, DL, ExtSizeInBits));
44109 }
44110 break;
44111 }
44112 // Byte shifts by immediate.
44113 case X86ISD::VSHLDQ:
44114 case X86ISD::VSRLDQ:
44115 // Shift by uniform.
44116 case X86ISD::VSHL:
44117 case X86ISD::VSRL:
44118 case X86ISD::VSRA:
44119 // Shift by immediate.
44120 case X86ISD::VSHLI:
44121 case X86ISD::VSRLI:
44122 case X86ISD::VSRAI: {
44123 SDLoc DL(Op);
44124 SDValue Ext0 =
44125 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44126 SDValue ExtOp =
44127 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44128 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44129 SDValue Insert =
44130 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44131 return TLO.CombineTo(Op, Insert);
44132 }
44133 case X86ISD::VPERMI: {
44134 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44135 // TODO: This should be done in shuffle combining.
44136 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44137 SmallVector<int, 4> Mask;
44138 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44139 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44140 SDLoc DL(Op);
44141 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44142 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44143 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44144 return TLO.CombineTo(Op, Insert);
44145 }
44146 }
44147 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44148 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44149 SDLoc DL(Op);
44150 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44151 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44152 Op.getOperand(1));
44153 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44154 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44155 return TLO.CombineTo(Op, Insert);
44156 }
44157 break;
44158 }
44159 case X86ISD::VPERMV: {
44160 SmallVector<SDValue, 2> Ops;
44161 SmallVector<int, 16> Mask;
44162 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44163 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44164 VT == MVT::v16f32) &&
44165 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44166 // For lane-crossing shuffles, only split in half in case we're still
44167 // referencing higher elements.
44168 unsigned HalfElts = NumElts / 2;
44169 unsigned HalfSize = SizeInBits / 2;
44170 Mask.resize(HalfElts);
44171 if (all_of(Mask,
44172 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44173 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44174 SDLoc DL(Op);
44175 SDValue Ext;
44176 SDValue M =
44177 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44178 SDValue V =
44179 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44180 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44181 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44182 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44183 else {
44184 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44185 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44186 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44187 TLO.DAG.getBitcast(ShufVT, V), M);
44188 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44189 }
44190 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44191 Subtarget, TLO.DAG, DL, SizeInBits);
44192 return TLO.CombineTo(Op, Insert);
44193 }
44194 }
44195 break;
44196 }
44197 case X86ISD::VPERMV3: {
44198 SmallVector<SDValue, 2> Ops;
44199 SmallVector<int, 16> Mask;
44200 if (Subtarget.hasVLX() &&
44201 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44202 // For lane-crossing shuffles, only split in half in case we're still
44203 // referencing higher elements.
44204 unsigned HalfElts = NumElts / 2;
44205 unsigned HalfSize = SizeInBits / 2;
44206 Mask.resize(HalfElts);
44207 if (all_of(Mask, [&](int M) {
44208 return isUndefOrInRange(M, 0, HalfElts) ||
44209 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44210 })) {
44211 // Adjust mask elements for 2nd operand to point to half width.
44212 for (int &M : Mask)
44213 M = (M < NumElts) ? M : (M - HalfElts);
44214 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44215 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44216 SDLoc DL(Op);
44217 SDValue Ext = TLO.DAG.getNode(
44218 Opc, DL, HalfVT,
44219 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44220 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44221 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44222 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44223 Subtarget, TLO.DAG, DL, SizeInBits);
44224 return TLO.CombineTo(Op, Insert);
44225 }
44226 }
44227 break;
44228 }
44229 case X86ISD::VPERM2X128: {
44230 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44231 SDLoc DL(Op);
44232 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44233 if (LoMask & 0x8)
44234 return TLO.CombineTo(
44235 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44236 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44237 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44238 SDValue ExtOp =
44239 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44240 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44241 SDValue Insert =
44242 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44243 return TLO.CombineTo(Op, Insert);
44244 }
44245 // Conversions.
44246 // TODO: Add more CVT opcodes when we have test coverage.
44247 case X86ISD::CVTTP2UI: {
44248 if (!Subtarget.hasVLX())
44249 break;
44250 [[fallthrough]];
44251 }
44252 case X86ISD::CVTTP2SI: {
44253 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44254 !Subtarget.hasVLX())
44255 break;
44256 [[fallthrough]];
44257 }
44258 case X86ISD::CVTPH2PS: {
44259 SDLoc DL(Op);
44260 unsigned Scale = SizeInBits / ExtSizeInBits;
44261 SDValue SrcOp = Op.getOperand(0);
44262 MVT SrcVT = SrcOp.getSimpleValueType();
44263 unsigned SrcExtSize =
44264 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44265 MVT ExtVT = MVT::getVectorVT(VT.getScalarType().getSimpleVT(),
44266 ExtSizeInBits / VT.getScalarSizeInBits());
44267 SDValue ExtOp = TLO.DAG.getNode(
44268 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44269 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44270 SDValue Insert =
44271 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44272 return TLO.CombineTo(Op, Insert);
44273 }
44274 // Zero upper elements.
44275 case X86ISD::VZEXT_MOVL:
44276 // Variable blend.
44277 case X86ISD::BLENDV:
44278 // Target unary shuffles:
44279 case X86ISD::MOVDDUP:
44280 // Target unary shuffles by immediate:
44281 case X86ISD::PSHUFD:
44282 case X86ISD::PSHUFLW:
44283 case X86ISD::PSHUFHW:
44284 case X86ISD::VPERMILPI:
44285 // (Non-Lane Crossing) Target Shuffles.
44286 case X86ISD::VPERMILPV:
44287 case X86ISD::VPERMIL2:
44288 case X86ISD::PSHUFB:
44289 case X86ISD::UNPCKL:
44290 case X86ISD::UNPCKH:
44291 case X86ISD::BLENDI:
44292 // Integer ops.
44293 case X86ISD::PACKSS:
44294 case X86ISD::PACKUS:
44295 case X86ISD::PCMPEQ:
44296 case X86ISD::PCMPGT:
44297 case X86ISD::PMULUDQ:
44298 case X86ISD::PMULDQ:
44299 case X86ISD::VSHLV:
44300 case X86ISD::VSRLV:
44301 case X86ISD::VSRAV:
44302 // Float ops.
44303 case X86ISD::FMAX:
44304 case X86ISD::FMIN:
44305 case X86ISD::FMAXC:
44306 case X86ISD::FMINC:
44307 case X86ISD::FRSQRT:
44308 case X86ISD::FRCP:
44309 // Horizontal Ops.
44310 case X86ISD::HADD:
44311 case X86ISD::HSUB:
44312 case X86ISD::FHADD:
44313 case X86ISD::FHSUB: {
44314 SDLoc DL(Op);
44315 SmallVector<SDValue, 4> Ops;
44316 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44317 SDValue SrcOp = Op.getOperand(i);
44318 EVT SrcVT = SrcOp.getValueType();
44319 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44320 "Unsupported vector size");
44321 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44322 ExtSizeInBits)
44323 : SrcOp);
44324 }
44325 MVT ExtVT = VT.getSimpleVT();
44326 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44327 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44328 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44329 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44330 SDValue Insert =
44331 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44332 return TLO.CombineTo(Op, Insert);
44333 }
44334 }
44335 }
44336
44337 // For splats, unless we *only* demand the 0'th element, stop attempts at
44338 // simplification here; we aren't going to improve things and the splat is
44339 // better than any potential shuffle.
44340 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44341 return false;
44342
44343 // Get target/faux shuffle mask.
44344 APInt OpUndef, OpZero;
44345 SmallVector<int, 64> OpMask;
44346 SmallVector<SDValue, 2> OpInputs;
44347 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44348 OpZero, TLO.DAG, Depth, false))
44349 return false;
44350
44351 // Shuffle inputs must be the same size as the result.
44352 if (OpMask.size() != (unsigned)NumElts ||
44353 llvm::any_of(OpInputs, [VT](SDValue V) {
44354 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44355 !V.getValueType().isVector();
44356 }))
44357 return false;
44358
44359 KnownZero = OpZero;
44360 KnownUndef = OpUndef;
44361
44362 // Check if shuffle mask can be simplified to undef/zero/identity.
44363 int NumSrcs = OpInputs.size();
44364 for (int i = 0; i != NumElts; ++i)
44365 if (!DemandedElts[i])
44366 OpMask[i] = SM_SentinelUndef;
44367
44368 if (isUndefInRange(OpMask, 0, NumElts)) {
44369 KnownUndef.setAllBits();
44370 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44371 }
44372 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44373 KnownZero.setAllBits();
44374 return TLO.CombineTo(
44375 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44376 }
44377 for (int Src = 0; Src != NumSrcs; ++Src)
44378 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44379 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44380
44381 // Attempt to simplify inputs.
44382 for (int Src = 0; Src != NumSrcs; ++Src) {
44383 // TODO: Support inputs of different types.
44384 if (OpInputs[Src].getValueType() != VT)
44385 continue;
44386
44387 int Lo = Src * NumElts;
44388 APInt SrcElts = APInt::getZero(NumElts);
44389 for (int i = 0; i != NumElts; ++i)
44390 if (DemandedElts[i]) {
44391 int M = OpMask[i] - Lo;
44392 if (0 <= M && M < NumElts)
44393 SrcElts.setBit(M);
44394 }
44395
44396 // TODO - Propagate input undef/zero elts.
44397 APInt SrcUndef, SrcZero;
44398 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44399 TLO, Depth + 1))
44400 return true;
44401 }
44402
44403 // If we don't demand all elements, then attempt to combine to a simpler
44404 // shuffle.
44405 // We need to convert the depth to something combineX86ShufflesRecursively
44406 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44407 // to match. This prevents combineX86ShuffleChain from returning a
44408 // combined shuffle that's the same as the original root, causing an
44409 // infinite loop.
44410 if (!DemandedElts.isAllOnes()) {
44411 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44412
44413 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44414 for (int i = 0; i != NumElts; ++i)
44415 if (DemandedElts[i])
44416 DemandedMask[i] = i;
44417
44418 SDValue NewShuffle = combineX86ShufflesRecursively(
44419 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44420 X86::MaxShuffleCombineDepth - Depth,
44421 /*AllowVariableCrossLaneMask=*/true,
44422 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44423 TLO.DAG, SDLoc(Op), Subtarget);
44424 if (NewShuffle)
44425 return TLO.CombineTo(Op, NewShuffle);
44426 }
44427
44428 return false;
44429}
44430
44431bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44432 SDValue Op, const APInt &OriginalDemandedBits,
44433 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44434 unsigned Depth) const {
44435 EVT VT = Op.getValueType();
44436 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44437 unsigned Opc = Op.getOpcode();
44438 switch(Opc) {
44439 case X86ISD::VTRUNC: {
44440 KnownBits KnownOp;
44441 SDValue Src = Op.getOperand(0);
44442 MVT SrcVT = Src.getSimpleValueType();
44443
44444 // Simplify the input, using demanded bit information.
44445 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44446 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44447 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44448 return true;
44449 break;
44450 }
44451 case X86ISD::PMULDQ:
44452 case X86ISD::PMULUDQ: {
44453 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
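// e.g. for v2i64 PMULUDQ only bits [31:0] of each 64-bit lane of both
// operands are read, so the upper halves of the inputs may be simplified
// freely (subject to the AVX512 splat caveat below).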
44454 KnownBits KnownLHS, KnownRHS;
44455 SDValue LHS = Op.getOperand(0);
44456 SDValue RHS = Op.getOperand(1);
44457
44458 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44459 // FIXME: Can we bound this better?
44460 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44461 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44462 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44463
44464 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44465 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44466 DemandedMaskLHS = DemandedMask;
44467 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44468 DemandedMaskRHS = DemandedMask;
44469
44470 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44471 KnownLHS, TLO, Depth + 1))
44472 return true;
44473 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44474 KnownRHS, TLO, Depth + 1))
44475 return true;
44476
44477 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44478 KnownRHS = KnownRHS.trunc(32);
44479 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44480 KnownRHS.getConstant().isOne()) {
44481 SDLoc DL(Op);
44482 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44483 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44484 }
44485
44486 // Aggressively peek through ops to get at the demanded low bits.
44487 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44488 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44489 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44490 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44491 if (DemandedLHS || DemandedRHS) {
44492 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44493 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44494 return TLO.CombineTo(
44495 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44496 }
44497 break;
44498 }
44499 case X86ISD::ANDNP: {
44500 KnownBits Known2;
44501 SDValue Op0 = Op.getOperand(0);
44502 SDValue Op1 = Op.getOperand(1);
44503
44504 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44505 Known, TLO, Depth + 1))
44506 return true;
44507
44508 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44509 OriginalDemandedElts, Known2, TLO, Depth + 1))
44510 return true;
44511
44512 // If the RHS is a constant, see if we can simplify it.
44513 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44514 OriginalDemandedElts, TLO))
44515 return true;
44516
44517 // ANDNP = (~Op0 & Op1);
44518 Known.One &= Known2.Zero;
44519 Known.Zero |= Known2.One;
44520 break;
44521 }
44522 case X86ISD::VSHLI: {
44523 SDValue Op0 = Op.getOperand(0);
44524 SDValue Op1 = Op.getOperand(1);
44525
44526 unsigned ShAmt = Op1->getAsZExtVal();
44527 if (ShAmt >= BitWidth)
44528 break;
44529
44530 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44531
44532 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44533 // single shift. We can do this if the bottom bits (which are shifted
44534 // out) are never demanded.
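// e.g. vshli(vsrli(X, 4), 4) -> X when the low 4 bits of the result are not
// demanded (the Diff == 0 case below); otherwise a single shift by |Diff| in
// the appropriate direction is used.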
44535 if (Op0.getOpcode() == X86ISD::VSRLI &&
44536 OriginalDemandedBits.countr_zero() >= ShAmt) {
44537 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44538 if (Shift2Amt < BitWidth) {
44539 int Diff = ShAmt - Shift2Amt;
44540 if (Diff == 0)
44541 return TLO.CombineTo(Op, Op0.getOperand(0));
44542
44543 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44544 SDValue NewShift = TLO.DAG.getNode(
44545 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44546 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44547 return TLO.CombineTo(Op, NewShift);
44548 }
44549 }
44550
44551 // If we are only demanding sign bits then we can use the shift source directly.
44552 unsigned NumSignBits =
44553 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44554 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44555 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44556 return TLO.CombineTo(Op, Op0);
44557
44558 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44559 TLO, Depth + 1))
44560 return true;
44561
44562 Known <<= ShAmt;
44563
44564 // Low bits known zero.
44565 Known.Zero.setLowBits(ShAmt);
44566
44567 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44568 // Attempt to avoid multi-use ops if we don't need anything from them.
44569 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44570 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44571 SDValue NewOp =
44572 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44573 return TLO.CombineTo(Op, NewOp);
44574 }
44575 }
44576 return false;
44577 }
44578 case X86ISD::VSRLI: {
44579 SDValue Op0 = Op.getOperand(0);
44580 SDValue Op1 = Op.getOperand(1);
44581
44582 unsigned ShAmt = Op1->getAsZExtVal();
44583 if (ShAmt >= BitWidth)
44584 break;
44585
44586 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44587
44588 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44589 TLO, Depth + 1))
44590 return true;
44591
44592 Known >>= ShAmt;
44593
44594 // High bits known zero.
44595 Known.Zero.setHighBits(ShAmt);
44596
44597 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44598 // Attempt to avoid multi-use ops if we don't need anything from them.
44599 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44600 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44601 SDValue NewOp =
44602 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44603 return TLO.CombineTo(Op, NewOp);
44604 }
44605 }
44606 return false;
44607 }
44608 case X86ISD::VSRAI: {
44609 SDValue Op0 = Op.getOperand(0);
44610 SDValue Op1 = Op.getOperand(1);
44611
44612 unsigned ShAmt = Op1->getAsZExtVal();
44613 if (ShAmt >= BitWidth)
44614 break;
44615
44616 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44617
44618 // If we just want the sign bit then we don't need to shift it.
44619 if (OriginalDemandedBits.isSignMask())
44620 return TLO.CombineTo(Op, Op0);
44621
44622 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44623 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44624 SDValue Op00 = Op0.getOperand(0);
44625 unsigned NumSignBits =
44626 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44627 if (ShAmt < NumSignBits)
44628 return TLO.CombineTo(Op, Op00);
44629 }
44630
44631 // If any of the demanded bits are produced by the sign extension, we also
44632 // demand the input sign bit.
44633 if (OriginalDemandedBits.countl_zero() < ShAmt)
44634 DemandedMask.setSignBit();
44635
44636 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44637 TLO, Depth + 1))
44638 return true;
44639
44640 Known >>= ShAmt;
44641
44642 // If the input sign bit is known to be zero, or if none of the top bits
44643 // are demanded, turn this into an unsigned shift right.
44644 if (Known.Zero[BitWidth - ShAmt - 1] ||
44645 OriginalDemandedBits.countl_zero() >= ShAmt)
44646 return TLO.CombineTo(
44647 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44648
44649 // High bits are known one.
44650 if (Known.One[BitWidth - ShAmt - 1])
44651 Known.One.setHighBits(ShAmt);
44652
44653 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44654 // Attempt to avoid multi-use ops if we don't need anything from them.
44655 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44656 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44657 SDValue NewOp =
44658 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44659 return TLO.CombineTo(Op, NewOp);
44660 }
44661 }
44662 return false;
44663 }
44664 case X86ISD::BLENDI: {
44665 SDValue LHS = Op.getOperand(0);
44666 SDValue RHS = Op.getOperand(1);
44667 APInt Mask = getBLENDIBlendMask(Op);
44668
44669 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44670 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44671 TLO, Depth + 1))
44672 return true;
44673
44674 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44675 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44676 TLO, Depth + 1))
44677 return true;
44678
44679 // Attempt to avoid multi-use ops if we don't need anything from them.
44680 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44681 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44682 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44683 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44684 if (NewLHS || NewRHS) {
44685 NewLHS = NewLHS ? NewLHS : LHS;
44686 NewRHS = NewRHS ? NewRHS : RHS;
44687 return TLO.CombineTo(Op,
44688 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44689 NewLHS, NewRHS, Op.getOperand(2)));
44690 }
44691 break;
44692 }
44693 case X86ISD::BLENDV: {
44694 SDValue Sel = Op.getOperand(0);
44695 SDValue LHS = Op.getOperand(1);
44696 SDValue RHS = Op.getOperand(2);
44697
44698 APInt SignMask = APInt::getSignMask(BitWidth);
44699 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44700 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44701 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44702 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44703 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44704 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44705
44706 if (NewSel || NewLHS || NewRHS) {
44707 NewSel = NewSel ? NewSel : Sel;
44708 NewLHS = NewLHS ? NewLHS : LHS;
44709 NewRHS = NewRHS ? NewRHS : RHS;
44710 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44711 NewSel, NewLHS, NewRHS));
44712 }
44713 break;
44714 }
44715 case X86ISD::PEXTRB:
44716 case X86ISD::PEXTRW: {
44717 SDValue Vec = Op.getOperand(0);
44718 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44719 MVT VecVT = Vec.getSimpleValueType();
44720 unsigned NumVecElts = VecVT.getVectorNumElements();
44721
44722 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44723 unsigned Idx = CIdx->getZExtValue();
44724 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44725
44726 // If we demand no bits from the vector then we must have demanded
44727 // bits from the implicit zext - simplify to zero.
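// e.g. PEXTRB implicitly zero-extends the extracted i8 into the wider scalar
// result, so if only bits 8 and above are demanded the node is known zero.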
44728 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44729 if (DemandedVecBits == 0)
44730 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44731
44732 APInt KnownUndef, KnownZero;
44733 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44734 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44735 KnownZero, TLO, Depth + 1))
44736 return true;
44737
44738 KnownBits KnownVec;
44739 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44740 KnownVec, TLO, Depth + 1))
44741 return true;
44742
44743 if (SDValue V = SimplifyMultipleUseDemandedBits(
44744 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44745 return TLO.CombineTo(
44746 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44747
44748 Known = KnownVec.zext(BitWidth);
44749 return false;
44750 }
44751 break;
44752 }
44753 case X86ISD::PINSRB:
44754 case X86ISD::PINSRW: {
44755 SDValue Vec = Op.getOperand(0);
44756 SDValue Scl = Op.getOperand(1);
44757 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44758 MVT VecVT = Vec.getSimpleValueType();
44759
44760 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44761 unsigned Idx = CIdx->getZExtValue();
44762 if (!OriginalDemandedElts[Idx])
44763 return TLO.CombineTo(Op, Vec);
44764
44765 KnownBits KnownVec;
44766 APInt DemandedVecElts(OriginalDemandedElts);
44767 DemandedVecElts.clearBit(Idx);
44768 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44769 KnownVec, TLO, Depth + 1))
44770 return true;
44771
44772 KnownBits KnownScl;
44773 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44774 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44775 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44776 return true;
44777
44778 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44779 Known = KnownVec.intersectWith(KnownScl);
44780 return false;
44781 }
44782 break;
44783 }
44784 case X86ISD::PACKSS:
44785 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44786 // sign bit then we can just ask for the source operand's sign bit.
44787 // TODO - add known bits handling.
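// e.g. PACKSSDW saturates each i32 lane to [-32768, 32767]; the sign of the
// saturated i16 always matches the sign of the i32 source, so only the
// source's sign bit needs to be demanded.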
44788 if (OriginalDemandedBits.isSignMask()) {
44789 APInt DemandedLHS, DemandedRHS;
44790 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44791
44792 KnownBits KnownLHS, KnownRHS;
44793 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44794 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44795 KnownLHS, TLO, Depth + 1))
44796 return true;
44797 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44798 KnownRHS, TLO, Depth + 1))
44799 return true;
44800
44801 // Attempt to avoid multi-use ops if we don't need anything from them.
44802 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44803 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44804 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44805 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44806 if (DemandedOp0 || DemandedOp1) {
44807 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44808 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44809 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44810 }
44811 }
44812 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44813 break;
44814 case X86ISD::VBROADCAST: {
44815 SDValue Src = Op.getOperand(0);
44816 MVT SrcVT = Src.getSimpleValueType();
44817 APInt DemandedElts = APInt::getOneBitSet(
44818 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44819 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44820 TLO, Depth + 1))
44821 return true;
44822 // If we don't need the upper bits, attempt to narrow the broadcast source.
44823 // Don't attempt this on AVX512 as it might affect broadcast folding.
44824 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44825 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44826 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44827 Src->hasOneUse()) {
44828 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44829 SDValue NewSrc =
44830 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44831 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44832 SDValue NewBcst =
44833 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44834 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44835 }
44836 break;
44837 }
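  // For illustration: if only the low 32 bits of a v2i64 broadcast of a scalar
  // i64 X are demanded, the code above rewrites it as
  //   bitcast v2i64 (VBROADCAST v4i32 (trunc i32 X))
  // which keeps the demanded bits intact while narrowing the broadcast source
  // (skipped on AVX512 so broadcast-from-memory folding is not disturbed).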
44838 case X86ISD::PCMPGT:
44839 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44840 // iff we only need the sign bit then we can use R directly.
44841 if (OriginalDemandedBits.isSignMask() &&
44842 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44843 return TLO.CombineTo(Op, Op.getOperand(1));
44844 break;
44845 case X86ISD::MOVMSK: {
44846 SDValue Src = Op.getOperand(0);
44847 MVT SrcVT = Src.getSimpleValueType();
44848 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44849 unsigned NumElts = SrcVT.getVectorNumElements();
44850
44851 // If we don't need the sign bits at all just return zero.
44852 if (OriginalDemandedBits.countr_zero() >= NumElts)
44853 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44854
44855 // See if we only demand bits from the lower 128-bit vector.
44856 if (SrcVT.is256BitVector() &&
44857 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44858 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44859 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44860 }
44861
44862 // Only demand the vector elements of the sign bits we need.
44863 APInt KnownUndef, KnownZero;
44864 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44865 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44866 TLO, Depth + 1))
44867 return true;
44868
44869 Known.Zero = KnownZero.zext(BitWidth);
44870 Known.Zero.setHighBits(BitWidth - NumElts);
44871
44872 // MOVMSK only uses the MSB from each vector element.
44873 KnownBits KnownSrc;
44874 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44875 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44876 Depth + 1))
44877 return true;
44878
44879 if (KnownSrc.One[SrcBits - 1])
44880 Known.One.setLowBits(NumElts);
44881 else if (KnownSrc.Zero[SrcBits - 1])
44882 Known.Zero.setLowBits(NumElts);
44883
44884     // Attempt to avoid multi-use ops if we don't need anything from them.
44885     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44886             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44887 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44888 return false;
44889 }
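  // For illustration: MOVMSK on v4i32 produces an i32 whose bits [3:0] are the
  // four element sign bits and whose upper 28 bits are zero. Hence demanding no
  // bits below bit 4 folds the node to 0, and demanding only bits [1:0] of a
  // 256-bit source lets it be shrunk to its low 128-bit half.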
44890 case X86ISD::TESTP: {
44891 SDValue Op0 = Op.getOperand(0);
44892 SDValue Op1 = Op.getOperand(1);
44893 MVT OpVT = Op0.getSimpleValueType();
44894 assert((OpVT.getVectorElementType() == MVT::f32 ||
44895 OpVT.getVectorElementType() == MVT::f64) &&
44896 "Illegal vector type for X86ISD::TESTP");
44897
44898 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44899 KnownBits KnownSrc;
44900 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44901 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44902 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44903 AssumeSingleUse) ||
44904 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44905 AssumeSingleUse);
44906 }
44907 case X86ISD::CMOV: {
44908 KnownBits Known2;
44909 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44910 OriginalDemandedElts, Known2, TLO, Depth + 1))
44911 return true;
44912 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44913 OriginalDemandedElts, Known, TLO, Depth + 1))
44914 return true;
44915
44916 // Only known if known in both the LHS and RHS.
44917 Known = Known.intersectWith(Known2);
44918 return false;
44919 }
44920 case X86ISD::BEXTR:
44921 case X86ISD::BEXTRI: {
44922 SDValue Op0 = Op.getOperand(0);
44923 SDValue Op1 = Op.getOperand(1);
44924
44925 // Only bottom 16-bits of the control bits are required.
44926 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44927 // NOTE: SimplifyDemandedBits won't do this for constants.
44928 uint64_t Val1 = Cst1->getZExtValue();
44929 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44930 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44931 SDLoc DL(Op);
44932 return TLO.CombineTo(
44933 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44934 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44935 }
44936
44937 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44938 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44939
44940 // If the length is 0, the result is 0.
44941 if (Length == 0) {
44942 Known.setAllZero();
44943 return false;
44944 }
44945
44946 if ((Shift + Length) <= BitWidth) {
44947 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44948 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44949 return true;
44950
44951 Known = Known.extractBits(Length, Shift);
44952 Known = Known.zextOrTrunc(BitWidth);
44953 return false;
44954 }
44955 } else {
44956 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44957 KnownBits Known1;
44958 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44959 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44960 return true;
44961
44962 // If the length is 0, replace with 0.
44963 KnownBits LengthBits = Known1.extractBits(8, 8);
44964 if (LengthBits.isZero())
44965 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44966 }
44967
44968 break;
44969 }
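  // For illustration: the BEXTR control word packs the start bit in bits [7:0]
  // and the field length in bits [15:8], so only its low 16 bits matter. A
  // constant control of 0x0804 extracts 8 bits starting at bit 4, i.e.
  // (Src >> 4) & 0xFF, which is why the demanded mask above is
  // getBitsSet(BitWidth, Shift, Shift + Length).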
44970 case X86ISD::PDEP: {
44971 SDValue Op0 = Op.getOperand(0);
44972 SDValue Op1 = Op.getOperand(1);
44973
44974 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44975 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44976
44977     // If the demanded bits have leading zeroes, we don't demand those from the
44978 // mask.
44979 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44980 return true;
44981
44982 // The number of possible 1s in the mask determines the number of LSBs of
44983 // operand 0 used. Undemanded bits from the mask don't matter so filter
44984 // them before counting.
44985 KnownBits Known2;
44986 uint64_t Count = (~Known.Zero & LoMask).popcount();
44987 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44988 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44989 return true;
44990
44991 // Zeroes are retained from the mask, but not ones.
44992 Known.One.clearAllBits();
44993 // The result will have at least as many trailing zeros as the non-mask
44994 // operand since bits can only map to the same or higher bit position.
44995 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44996 return false;
44997 }
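  // For illustration: PDEP(Src, Mask) scatters the low bits of Src to the set
  // bit positions of Mask, e.g. PDEP(0b101, 0b11010) = 0b10010. Only
  // popcount(Mask) low bits of Src can ever be consumed, and every result bit
  // sits at or above the position of its source bit, which justifies the
  // trailing-zero propagation above.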
44998 case X86ISD::VPMADD52L:
44999 case X86ISD::VPMADD52H: {
45000 KnownBits KnownOp0, KnownOp1, KnownOp2;
45001 SDValue Op0 = Op.getOperand(0);
45002 SDValue Op1 = Op.getOperand(1);
45003 SDValue Op2 = Op.getOperand(2);
45004 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45005 // operand 2).
45006 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45007 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45008 TLO, Depth + 1))
45009 return true;
45010
45011 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45012 TLO, Depth + 1))
45013 return true;
45014
45015 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45016 KnownOp2, TLO, Depth + 1))
45017 return true;
45018
45019 KnownBits KnownMul;
45020 KnownOp0 = KnownOp0.trunc(52);
45021 KnownOp1 = KnownOp1.trunc(52);
45022 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45023 : KnownBits::mulhu(KnownOp0, KnownOp1);
45024 KnownMul = KnownMul.zext(64);
45025
45026 // lo/hi(X * Y) + Z --> C + Z
45027 if (KnownMul.isConstant()) {
45028 SDLoc DL(Op);
45029 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45030 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45031 }
45032
45033 Known = KnownBits::add(KnownMul, KnownOp2);
45034 return false;
45035 }
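  // For illustration: VPMADD52L/H compute lo52(Op0) * lo52(Op1) per 64-bit
  // lane, then add either the low 52 bits (L) or bits [103:52] (H) of the
  // 104-bit product to Op2. That is why only the low 52 bits of Op0/Op1 are
  // demanded above, and why a fully known product folds the node to
  // ADD(C, Op2).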
45036 }
45037
45038   return TargetLowering::SimplifyDemandedBitsForTargetNode(
45039       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45040}
45041
45042 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45043     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45044 SelectionDAG &DAG, unsigned Depth) const {
45045 int NumElts = DemandedElts.getBitWidth();
45046 unsigned Opc = Op.getOpcode();
45047 EVT VT = Op.getValueType();
45048
45049 switch (Opc) {
45050 case X86ISD::PINSRB:
45051 case X86ISD::PINSRW: {
45052 // If we don't demand the inserted element, return the base vector.
45053 SDValue Vec = Op.getOperand(0);
45054 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45055 MVT VecVT = Vec.getSimpleValueType();
45056 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45057 !DemandedElts[CIdx->getZExtValue()])
45058 return Vec;
45059 break;
45060 }
45061 case X86ISD::VSHLI: {
45062 // If we are only demanding sign bits then we can use the shift source
45063 // directly.
45064 SDValue Op0 = Op.getOperand(0);
45065 unsigned ShAmt = Op.getConstantOperandVal(1);
45066 unsigned BitWidth = DemandedBits.getBitWidth();
45067 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45068 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45069 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45070 return Op0;
45071 break;
45072 }
45073 case X86ISD::VSRAI:
45074 // iff we only need the sign bit then we can use the source directly.
45075 // TODO: generalize where we only demand extended signbits.
45076 if (DemandedBits.isSignMask())
45077 return Op.getOperand(0);
45078 break;
45079 case X86ISD::PCMPGT:
45080 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45081 // iff we only need the sign bit then we can use R directly.
45082 if (DemandedBits.isSignMask() &&
45083 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45084 return Op.getOperand(1);
45085 break;
45086 case X86ISD::BLENDV: {
45087 // BLENDV: Cond (MSB) ? LHS : RHS
45088 SDValue Cond = Op.getOperand(0);
45089 SDValue LHS = Op.getOperand(1);
45090 SDValue RHS = Op.getOperand(2);
45091
45092 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45093 if (CondKnown.isNegative())
45094 return LHS;
45095 if (CondKnown.isNonNegative())
45096 return RHS;
45097 break;
45098 }
45099 case X86ISD::ANDNP: {
45100 // ANDNP = (~LHS & RHS);
45101 SDValue LHS = Op.getOperand(0);
45102 SDValue RHS = Op.getOperand(1);
45103
45104 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45105 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45106
45107 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45108 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45109 // this context, so return RHS.
45110 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45111 return RHS;
45112 break;
45113 }
45114 }
45115
45116 APInt ShuffleUndef, ShuffleZero;
45117 SmallVector<int, 16> ShuffleMask;
45118   SmallVector<SDValue, 16> ShuffleOps;
45119   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45120 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45121 // If all the demanded elts are from one operand and are inline,
45122 // then we can use the operand directly.
45123 int NumOps = ShuffleOps.size();
45124 if (ShuffleMask.size() == (unsigned)NumElts &&
45125         llvm::all_of(ShuffleOps, [VT](SDValue V) {
45126           return VT.getSizeInBits() == V.getValueSizeInBits();
45127 })) {
45128
45129 if (DemandedElts.isSubsetOf(ShuffleUndef))
45130 return DAG.getUNDEF(VT);
45131 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45132 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45133
45134 // Bitmask that indicates which ops have only been accessed 'inline'.
45135 APInt IdentityOp = APInt::getAllOnes(NumOps);
45136 for (int i = 0; i != NumElts; ++i) {
45137 int M = ShuffleMask[i];
45138 if (!DemandedElts[i] || ShuffleUndef[i])
45139 continue;
45140 int OpIdx = M / NumElts;
45141 int EltIdx = M % NumElts;
45142 if (M < 0 || EltIdx != i) {
45143 IdentityOp.clearAllBits();
45144 break;
45145 }
45146 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45147 if (IdentityOp == 0)
45148 break;
45149 }
45150 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45151 "Multiple identity shuffles detected");
45152
45153 if (IdentityOp != 0)
45154 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45155 }
45156 }
45157
45158   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45159       Op, DemandedBits, DemandedElts, DAG, Depth);
45160}
45161
45162 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45163     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45164 bool PoisonOnly, unsigned Depth) const {
45165 unsigned NumElts = DemandedElts.getBitWidth();
45166
45167 switch (Op.getOpcode()) {
45168   case X86ISD::GlobalBaseReg:
45169   case X86ISD::Wrapper:
45170 case X86ISD::WrapperRIP:
45171 return true;
45172 case X86ISD::INSERTPS:
45173 case X86ISD::BLENDI:
45174 case X86ISD::PSHUFB:
45175 case X86ISD::PSHUFD:
45176 case X86ISD::UNPCKL:
45177 case X86ISD::UNPCKH:
45178 case X86ISD::VPERMILPV:
45179 case X86ISD::VPERMILPI:
45180 case X86ISD::VPERMV:
45181 case X86ISD::VPERMV3: {
45182     SmallVector<int, 8> Mask;
45183     SmallVector<SDValue, 2> Ops;
45184     if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45185 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45186 APInt::getZero(NumElts));
45187 for (auto M : enumerate(Mask)) {
45188 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45189 continue;
45190 if (M.value() == SM_SentinelUndef)
45191 return false;
45192 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45193 "Shuffle mask index out of range");
45194 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45195 }
45196 for (auto Op : enumerate(Ops))
45197 if (!DemandedSrcElts[Op.index()].isZero() &&
45198           !DAG.isGuaranteedNotToBeUndefOrPoison(
45199               Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45200 return false;
45201 return true;
45202 }
45203 break;
45204 }
45205 }
45206   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45207       Op, DemandedElts, DAG, PoisonOnly, Depth);
45208}
45209
45210 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45211     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45212 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45213
45214 switch (Op.getOpcode()) {
45215 // SSE bit logic.
45216 case X86ISD::FAND:
45217 case X86ISD::FOR:
45218 case X86ISD::FXOR:
45219 case X86ISD::FANDN:
45220 case X86ISD::ANDNP:
45221 case X86ISD::VPTERNLOG:
45222 return false;
45223 // SSE vector insert/extracts use modulo indices.
45224 case X86ISD::PINSRB:
45225 case X86ISD::PINSRW:
45226 case X86ISD::PEXTRB:
45227 case X86ISD::PEXTRW:
45228 return false;
45229 // SSE vector multiplies are either inbounds or saturate.
45230 case X86ISD::VPMADDUBSW:
45231 case X86ISD::VPMADDWD:
45232 return false;
45233 // SSE vector shifts handle out of bounds shift amounts.
45234 case X86ISD::VSHLI:
45235 case X86ISD::VSRLI:
45236 case X86ISD::VSRAI:
45237 return false;
45238 // SSE blends.
45239 case X86ISD::BLENDI:
45240 case X86ISD::BLENDV:
45241 return false;
45242 // SSE target shuffles.
45243 case X86ISD::INSERTPS:
45244 case X86ISD::PSHUFB:
45245 case X86ISD::PSHUFD:
45246 case X86ISD::UNPCKL:
45247 case X86ISD::UNPCKH:
45248 case X86ISD::VPERMILPV:
45249 case X86ISD::VPERMILPI:
45250 case X86ISD::VPERMV:
45251 case X86ISD::VPERMV3:
45252 return false;
45253 // SSE comparisons handle all icmp/fcmp cases.
45254 // TODO: Add CMPM/MM with test coverage.
45255 case X86ISD::CMPP:
45256 case X86ISD::PCMPEQ:
45257 case X86ISD::PCMPGT:
45258 return false;
45259 // SSE signbit extraction.
45260 case X86ISD::MOVMSK:
45261 return false;
45262 // GFNI instructions.
45263   case X86ISD::GF2P8AFFINEINVQB:
45264   case X86ISD::GF2P8AFFINEQB:
45265   case X86ISD::GF2P8MULB:
45266     return false;
45267   case ISD::INTRINSIC_WO_CHAIN:
45268     switch (Op->getConstantOperandVal(0)) {
45269 case Intrinsic::x86_sse2_pmadd_wd:
45270 case Intrinsic::x86_avx2_pmadd_wd:
45271 case Intrinsic::x86_avx512_pmaddw_d_512:
45272 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45273 case Intrinsic::x86_avx2_pmadd_ub_sw:
45274 case Intrinsic::x86_avx512_pmaddubs_w_512:
45275 return false;
45276 case Intrinsic::x86_avx512_vpermi2var_d_128:
45277 case Intrinsic::x86_avx512_vpermi2var_d_256:
45278 case Intrinsic::x86_avx512_vpermi2var_d_512:
45279 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45280 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45281 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45282 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45283 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45284 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45285 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45286 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45287 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45288 case Intrinsic::x86_avx512_vpermi2var_q_128:
45289 case Intrinsic::x86_avx512_vpermi2var_q_256:
45290 case Intrinsic::x86_avx512_vpermi2var_q_512:
45291 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45292 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45293 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45294 return false;
45295 }
45296 }
45297   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45298       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45299}
45300
45301 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45302                                                   const APInt &DemandedElts,
45303 APInt &UndefElts,
45304 const SelectionDAG &DAG,
45305 unsigned Depth) const {
45306 unsigned NumElts = DemandedElts.getBitWidth();
45307 unsigned Opc = Op.getOpcode();
45308
45309 switch (Opc) {
45310 case X86ISD::VBROADCAST:
45311   case X86ISD::VBROADCAST_LOAD:
45312     UndefElts = APInt::getZero(NumElts);
45313 return true;
45314 }
45315
45316 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45317 DAG, Depth);
45318}
45319
45320// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45321// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45322static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45323 bool AllowTruncate, unsigned Depth) {
45324 // Limit recursion.
45325   if (Depth >= SelectionDAG::MaxRecursionDepth)
45326     return false;
45327 switch (Src.getOpcode()) {
45328 case ISD::TRUNCATE:
45329 if (!AllowTruncate)
45330 return false;
45331 [[fallthrough]];
45332 case ISD::SETCC:
45333 return Src.getOperand(0).getValueSizeInBits() == Size;
45334 case ISD::FREEZE:
45335 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45336 Depth + 1);
45337 case ISD::AND:
45338 case ISD::XOR:
45339 case ISD::OR:
45340 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45341 Depth + 1) &&
45342 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45343 Depth + 1);
45344 case ISD::SELECT:
45345 case ISD::VSELECT:
45346 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45347 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45348 Depth + 1) &&
45349 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45350 Depth + 1);
45351 case ISD::BUILD_VECTOR:
45352 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45353 ISD::isBuildVectorAllOnes(Src.getNode());
45354 }
45355 return false;
45356}
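// For illustration: checkBitcastSrcVectorSize(Src, 256, ...) answers whether a
// vXi1 value was ultimately produced by comparisons of 256-bit vectors, looking
// through FREEZE, bitwise logic, selects and (optionally) truncates. It lets
// combineBitcastvxi1 below pick a 256-bit sign-extension type so the MOVMSK
// source matches the original compare width.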
45357
45358// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45359static unsigned getAltBitOpcode(unsigned Opcode) {
45360 switch(Opcode) {
45361 // clang-format off
45362 case ISD::AND: return X86ISD::FAND;
45363 case ISD::OR: return X86ISD::FOR;
45364 case ISD::XOR: return X86ISD::FXOR;
45365 case X86ISD::ANDNP: return X86ISD::FANDN;
45366 // clang-format on
45367 }
45368 llvm_unreachable("Unknown bitwise opcode");
45369}
45370
45371// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45372 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45373                                           const SDLoc &DL) {
45374 EVT SrcVT = Src.getValueType();
45375 if (SrcVT != MVT::v4i1)
45376 return SDValue();
45377
45378 switch (Src.getOpcode()) {
45379 case ISD::SETCC:
45380 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45381 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45382 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45383 SDValue Op0 = Src.getOperand(0);
45384 if (ISD::isNormalLoad(Op0.getNode()))
45385 return DAG.getBitcast(MVT::v4f32, Op0);
45386 if (Op0.getOpcode() == ISD::BITCAST &&
45387 Op0.getOperand(0).getValueType() == MVT::v4f32)
45388 return Op0.getOperand(0);
45389 }
45390 break;
45391 case ISD::AND:
45392 case ISD::XOR:
45393 case ISD::OR: {
45394 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45395 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45396 if (Op0 && Op1)
45397 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45398 Op1);
45399 break;
45400 }
45401 }
45402 return SDValue();
45403}
45404
45405// Helper to push sign extension of vXi1 SETCC result through bitops.
45406 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45407                                           SDValue Src, const SDLoc &DL) {
45408 switch (Src.getOpcode()) {
45409 case ISD::SETCC:
45410 case ISD::FREEZE:
45411 case ISD::TRUNCATE:
45412 case ISD::BUILD_VECTOR:
45413 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45414 case ISD::AND:
45415 case ISD::XOR:
45416 case ISD::OR:
45417 return DAG.getNode(
45418 Src.getOpcode(), DL, SExtVT,
45419 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45420 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45421 case ISD::SELECT:
45422 case ISD::VSELECT:
45423 return DAG.getSelect(
45424 DL, SExtVT, Src.getOperand(0),
45425 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45426 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45427 }
45428 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45429}
45430
45431// Try to match patterns such as
45432// (i16 bitcast (v16i1 x))
45433// ->
45434// (i16 movmsk (16i8 sext (v16i1 x)))
45435// before the illegal vector is scalarized on subtargets that don't have legal
45436// vxi1 types.
45437 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45438                                   const SDLoc &DL,
45439 const X86Subtarget &Subtarget) {
45440 EVT SrcVT = Src.getValueType();
45441 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45442 return SDValue();
45443
45444 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45445 // legalization destroys the v4i32 type.
45446 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45447 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45448 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45449 DAG.getBitcast(MVT::v4f32, V));
45450 return DAG.getZExtOrTrunc(V, DL, VT);
45451 }
45452 }
45453
45454 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45455 // movmskb even with avx512. This will be better than truncating to vXi1 and
45456 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45457 // vpcmpeqb/vpcmpgtb.
45458 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45459 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45460 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45461 Src.getOperand(0).getValueType() == MVT::v64i8);
45462
45463 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45464 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45465 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45466 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45467 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45468 EVT CmpVT = Src.getOperand(0).getValueType();
45469 EVT EltVT = CmpVT.getVectorElementType();
45470 if (CmpVT.getSizeInBits() <= 256 &&
45471 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45472 PreferMovMsk = true;
45473 }
45474
45475 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45476 // MOVMSK is supported in SSE2 or later.
45477 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45478 return SDValue();
45479
45480 // If the upper ops of a concatenation are undef, then try to bitcast the
45481 // lower op and extend.
45482 SmallVector<SDValue, 4> SubSrcOps;
45483 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45484 SubSrcOps.size() >= 2) {
45485 SDValue LowerOp = SubSrcOps[0];
45486 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45487 if (LowerOp.getOpcode() == ISD::SETCC &&
45488 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45489 EVT SubVT = VT.getIntegerVT(
45490 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45491 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45492 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45493 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45494 }
45495 }
45496 }
45497
45498 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45499 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45500 // v8i16 and v16i16.
45501 // For these two cases, we can shuffle the upper element bytes to a
45502 // consecutive sequence at the start of the vector and treat the results as
45503 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45504 // for v16i16 this is not the case, because the shuffle is expensive, so we
45505 // avoid sign-extending to this type entirely.
45506 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45507 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45508 MVT SExtVT;
45509 bool PropagateSExt = false;
45510 switch (SrcVT.getSimpleVT().SimpleTy) {
45511 default:
45512 return SDValue();
45513 case MVT::v2i1:
45514 SExtVT = MVT::v2i64;
45515 break;
45516 case MVT::v4i1:
45517 SExtVT = MVT::v4i32;
45518 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45519 // sign-extend to a 256-bit operation to avoid truncation.
45520 if (Subtarget.hasAVX() &&
45521 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45522 SExtVT = MVT::v4i64;
45523 PropagateSExt = true;
45524 }
45525 break;
45526 case MVT::v8i1:
45527 SExtVT = MVT::v8i16;
45528 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45529 // sign-extend to a 256-bit operation to match the compare.
45530 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45531 // 256-bit because the shuffle is cheaper than sign extending the result of
45532 // the compare.
45533 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45534 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45535 SExtVT = MVT::v8i32;
45536 PropagateSExt = true;
45537 }
45538 break;
45539 case MVT::v16i1:
45540 SExtVT = MVT::v16i8;
45541 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45542 // it is not profitable to sign-extend to 256-bit because this will
45543 // require an extra cross-lane shuffle which is more expensive than
45544 // truncating the result of the compare to 128-bits.
45545 break;
45546 case MVT::v32i1:
45547 SExtVT = MVT::v32i8;
45548 break;
45549 case MVT::v64i1:
45550     // If we have AVX512F but not AVX512BW, and the input was truncated from
45551     // v64i8 (checked earlier), split the input and make two pmovmskbs.
45552 if (Subtarget.hasAVX512()) {
45553 if (Subtarget.hasBWI())
45554 return SDValue();
45555 SExtVT = MVT::v64i8;
45556 break;
45557 }
45558 // Split if this is a <64 x i8> comparison result.
45559 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45560 SExtVT = MVT::v64i8;
45561 break;
45562 }
45563 return SDValue();
45564 };
45565
45566 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45567 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45568
45569 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45570 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45571 } else {
45572 if (SExtVT == MVT::v8i16) {
45573 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45574 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45575 }
45576 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45577 }
45578
45579   EVT IntVT =
45580       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45581 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45582 return DAG.getBitcast(VT, V);
45583}
45584
45585// Convert a vXi1 constant build vector to the same width scalar integer.
45586 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45587   EVT SrcVT = Op.getValueType();
45588 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45589 "Expected a vXi1 vector");
45590   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45591          "Expected a constant build vector");
45592
45593 APInt Imm(SrcVT.getVectorNumElements(), 0);
45594 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45595 SDValue In = Op.getOperand(Idx);
45596 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45597 Imm.setBit(Idx);
45598 }
45599 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45600 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45601}
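// For illustration: a constant v4i1 build vector <1, 0, 1, 1> becomes the i4
// immediate 0b1101 (bit Idx of the result is element Idx of the vector), with
// undef elements mapping to 0.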
45602
45603 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45604                                            TargetLowering::DAGCombinerInfo &DCI,
45605                                            const X86Subtarget &Subtarget) {
45606 using namespace SDPatternMatch;
45607 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45608
45609 if (!DCI.isBeforeLegalizeOps())
45610 return SDValue();
45611
45612 // Only do this if we have k-registers.
45613 if (!Subtarget.hasAVX512())
45614 return SDValue();
45615
45616 EVT DstVT = N->getValueType(0);
45617 SDValue Op = N->getOperand(0);
45618 EVT SrcVT = Op.getValueType();
45619
45620 // Make sure we have a bitcast between mask registers and a scalar type.
45621 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45622 DstVT.isScalarInteger()) &&
45623 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45624 SrcVT.isScalarInteger()))
45625 return SDValue();
45626
45627 SDValue LHS, RHS;
45628
45629 // Look for logic ops.
45631 return SDValue();
45632
45633 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45634 // least one of the getBitcast() will fold away).
45635 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45636       sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45637     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45638 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45639
45640 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45641 // Most of these have to move a constant from the scalar domain anyway.
45642   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45643     RHS = combinevXi1ConstantToInteger(RHS, DAG);
45644     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45645 DAG.getBitcast(DstVT, LHS), RHS);
45646 }
45647
45648 return SDValue();
45649}
45650
45651 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45652                                     const X86Subtarget &Subtarget) {
45653 SDLoc DL(BV);
45654 unsigned NumElts = BV->getNumOperands();
45655 SDValue Splat = BV->getSplatValue();
45656
45657 // Build MMX element from integer GPR or SSE float values.
45658 auto CreateMMXElement = [&](SDValue V) {
45659 if (V.isUndef())
45660 return DAG.getUNDEF(MVT::x86mmx);
45661 if (V.getValueType().isFloatingPoint()) {
45662 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45663 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45664 V = DAG.getBitcast(MVT::v2i64, V);
45665 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45666 }
45667 V = DAG.getBitcast(MVT::i32, V);
45668 } else {
45669 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45670 }
45671 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45672 };
45673
45674 // Convert build vector ops to MMX data in the bottom elements.
45675   SmallVector<SDValue, 8> Ops;
45676
45677 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45678
45679 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45680 if (Splat) {
45681 if (Splat.isUndef())
45682 return DAG.getUNDEF(MVT::x86mmx);
45683
45684 Splat = CreateMMXElement(Splat);
45685
45686 if (Subtarget.hasSSE1()) {
45687 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45688 if (NumElts == 8)
45689 Splat = DAG.getNode(
45690 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45691 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45692 TLI.getPointerTy(DAG.getDataLayout())),
45693 Splat, Splat);
45694
45695 // Use PSHUFW to repeat 16-bit elements.
45696 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45697 return DAG.getNode(
45698 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45699 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45700 TLI.getPointerTy(DAG.getDataLayout())),
45701 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45702 }
45703 Ops.append(NumElts, Splat);
45704 } else {
45705 for (unsigned i = 0; i != NumElts; ++i)
45706 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45707 }
45708
45709 // Use tree of PUNPCKLs to build up general MMX vector.
45710 while (Ops.size() > 1) {
45711 unsigned NumOps = Ops.size();
45712 unsigned IntrinOp =
45713 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45714 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45715 : Intrinsic::x86_mmx_punpcklbw));
45716 SDValue Intrin = DAG.getTargetConstant(
45717 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45718 for (unsigned i = 0; i != NumOps; i += 2)
45719 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45720 Ops[i], Ops[i + 1]);
45721 Ops.resize(NumOps / 2);
45722 }
45723
45724 return Ops[0];
45725}
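// For illustration: building an MMX v8i8 from scalars first places each byte in
// the low lane of its own x86mmx value, then the PUNPCKL tree interleaves them
// pairwise: 8 values -> 4 punpcklbw -> 2 punpcklwd -> 1 punpckldq, leaving the
// fully assembled 64-bit vector in Ops[0].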
45726
45727// Recursive function that attempts to find if a bool vector node was originally
45728// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45729// integer. If so, replace the scalar ops with bool vector equivalents back down
45730// the chain.
45731 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45732                                           SelectionDAG &DAG,
45733 const X86Subtarget &Subtarget,
45734 unsigned Depth = 0) {
45735   if (Depth >= SelectionDAG::MaxRecursionDepth)
45736     return SDValue(); // Limit search depth.
45737
45738 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45739 unsigned Opc = V.getOpcode();
45740 switch (Opc) {
45741 case ISD::BITCAST: {
45742 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45743 SDValue Src = V.getOperand(0);
45744 EVT SrcVT = Src.getValueType();
45745 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45746 return DAG.getBitcast(VT, Src);
45747 break;
45748 }
45749 case ISD::Constant: {
45750 auto *C = cast<ConstantSDNode>(V);
45751 if (C->isZero())
45752 return DAG.getConstant(0, DL, VT);
45753 if (C->isAllOnes())
45754 return DAG.getAllOnesConstant(DL, VT);
45755 break;
45756 }
45757 case ISD::TRUNCATE: {
45758 // If we find a suitable source, a truncated scalar becomes a subvector.
45759 SDValue Src = V.getOperand(0);
45760 EVT NewSrcVT =
45761 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45762 if (TLI.isTypeLegal(NewSrcVT))
45763 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45764 Subtarget, Depth + 1))
45765 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45766 DAG.getVectorIdxConstant(0, DL));
45767 break;
45768 }
45769 case ISD::ANY_EXTEND:
45770 case ISD::ZERO_EXTEND: {
45771 // If we find a suitable source, an extended scalar becomes a subvector.
45772 SDValue Src = V.getOperand(0);
45773 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45774 Src.getScalarValueSizeInBits());
45775 if (TLI.isTypeLegal(NewSrcVT))
45776 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45777 Subtarget, Depth + 1))
45778 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45779 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45780 : DAG.getConstant(0, DL, VT),
45781 N0, DAG.getVectorIdxConstant(0, DL));
45782 break;
45783 }
45784 case ISD::OR:
45785 case ISD::XOR: {
45786 // If we find suitable sources, we can just move the op to the vector
45787 // domain.
45788 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45789 Subtarget, Depth + 1))
45790 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45791 Subtarget, Depth + 1))
45792 return DAG.getNode(Opc, DL, VT, N0, N1);
45793 break;
45794 }
45795 case ISD::SHL: {
45796 // If we find a suitable source, a SHL becomes a KSHIFTL.
45797 SDValue Src0 = V.getOperand(0);
45798 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45799 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45800 break;
45801
45802 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45803 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45804 Depth + 1))
45805 return DAG.getNode(
45806 X86ISD::KSHIFTL, DL, VT, N0,
45807 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45808 break;
45809 }
45810 }
45811
45812 // Does the inner bitcast already exist?
45813 if (Depth > 0)
45814 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45815 return SDValue(Alt, 0);
45816
45817 return SDValue();
45818}
45819
45820 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45821                               TargetLowering::DAGCombinerInfo &DCI,
45822                               const X86Subtarget &Subtarget) {
45823 SDValue N0 = N->getOperand(0);
45824 EVT VT = N->getValueType(0);
45825 EVT SrcVT = N0.getValueType();
45826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45827
45828 // Try to match patterns such as
45829 // (i16 bitcast (v16i1 x))
45830 // ->
45831 // (i16 movmsk (16i8 sext (v16i1 x)))
45832 // before the setcc result is scalarized on subtargets that don't have legal
45833 // vxi1 types.
45834 if (DCI.isBeforeLegalize()) {
45835 SDLoc dl(N);
45836 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45837 return V;
45838
45839 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45840 // type, widen both sides to avoid a trip through memory.
45841 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45842 Subtarget.hasAVX512()) {
45843 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45844 N0 = DAG.getBitcast(MVT::v8i1, N0);
45845 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45846 DAG.getVectorIdxConstant(0, dl));
45847 }
45848
45849 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45850 // type, widen both sides to avoid a trip through memory.
45851 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45852 Subtarget.hasAVX512()) {
45853 // Use zeros for the widening if we already have some zeroes. This can
45854 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45855 // stream of this.
45856 // FIXME: It might make sense to detect a concat_vectors with a mix of
45857 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45858 // a separate combine. What we can't do is canonicalize the operands of
45859 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45860 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45861 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45862 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45863 SrcVT = LastOp.getValueType();
45864 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45865           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
45866           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45867 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45868 N0 = DAG.getBitcast(MVT::i8, N0);
45869 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45870 }
45871 }
45872
45873 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45874 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45875 Ops[0] = N0;
45876 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45877 N0 = DAG.getBitcast(MVT::i8, N0);
45878 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45879 }
45880 } else if (DCI.isAfterLegalizeDAG()) {
45881 // If we're bitcasting from iX to vXi1, see if the integer originally
45882 // began as a vXi1 and whether we can remove the bitcast entirely.
45883 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45884 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45885 if (SDValue V =
45886 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45887 return V;
45888 }
45889 }
45890
45891 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45892 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45893 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45894 // we can help with known bits propagation from the vXi1 domain to the
45895 // scalar domain.
45896 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45897 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45898 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45900 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45901 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45902
45903 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45904 // and the vbroadcast_load are both integer or both fp. In some cases this
45905 // will remove the bitcast entirely.
45906 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45907 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45908 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45909 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45910 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45911 // Don't swap i8/i16 since don't have fp types that size.
45912 if (MemSize >= 32) {
45913 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45914 : MVT::getIntegerVT(MemSize);
45915 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45916 : MVT::getIntegerVT(SrcVTSize);
45917 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45918
45919 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45920 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45921 SDValue ResNode =
45922           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45923                                   MemVT, BCast->getMemOperand());
45924 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45925 return DAG.getBitcast(VT, ResNode);
45926 }
45927 }
45928
45929 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45930 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45931 SDValue Src = peekThroughTruncates(N0);
45932 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45933 Src.getOperand(0).getValueSizeInBits() == 128 &&
45934 isNullConstant(Src.getOperand(1))) {
45935 SDLoc DL(N);
45936 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45937 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45938 DAG.getVectorIdxConstant(0, DL));
45939 }
45940 }
45941
45942 // Since MMX types are special and don't usually play with other vector types,
45943 // it's better to handle them early to be sure we emit efficient code by
45944 // avoiding store-load conversions.
45945 if (VT == MVT::x86mmx) {
45946 // Detect MMX constant vectors.
45947 APInt UndefElts;
45948 SmallVector<APInt, 1> EltBits;
45949 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45950 /*AllowWholeUndefs*/ true,
45951 /*AllowPartialUndefs*/ true)) {
45952 SDLoc DL(N0);
45953 // Handle zero-extension of i32 with MOVD.
45954 if (EltBits[0].countl_zero() >= 32)
45955 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45956 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45957 // Else, bitcast to a double.
45958 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45959 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45960 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45961 }
45962
45963 // Detect bitcasts to x86mmx low word.
45964 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45965 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45966 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45967 bool LowUndef = true, AllUndefOrZero = true;
45968 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45969 SDValue Op = N0.getOperand(i);
45970 LowUndef &= Op.isUndef() || (i >= e/2);
45971 AllUndefOrZero &= isNullConstantOrUndef(Op);
45972 }
45973 if (AllUndefOrZero) {
45974 SDValue N00 = N0.getOperand(0);
45975 SDLoc dl(N00);
45976 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45977 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45978 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45979 }
45980 }
45981
45982 // Detect bitcasts of 64-bit build vectors and convert to a
45983 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45984 // lowest element.
45985 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45986 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45987 SrcVT == MVT::v8i8))
45988 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45989
45990 // Detect bitcasts between element or subvector extraction to x86mmx.
45991 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45992          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45993         isNullConstant(N0.getOperand(1))) {
45994 SDValue N00 = N0.getOperand(0);
45995 if (N00.getValueType().is128BitVector())
45996 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45997 DAG.getBitcast(MVT::v2i64, N00));
45998 }
45999
46000 // Detect bitcasts from FP_TO_SINT to x86mmx.
46001 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46002 SDLoc DL(N0);
46003 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46004 DAG.getUNDEF(MVT::v2i32));
46005 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46006 DAG.getBitcast(MVT::v2i64, Res));
46007 }
46008 }
46009
46010 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46011 // most of these to scalar anyway.
46012 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46013 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46014       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46015     return combinevXi1ConstantToInteger(N0, DAG);
46016 }
46017
46018 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46019 VT.getVectorElementType() == MVT::i1) {
46020 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46021 if (C->isAllOnes())
46022 return DAG.getConstant(1, SDLoc(N0), VT);
46023 if (C->isZero())
46024 return DAG.getConstant(0, SDLoc(N0), VT);
46025 }
46026 }
46027
46028 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46029 // Turn it into a sign bit compare that produces a k-register. This avoids
46030 // a trip through a GPR.
46031 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46032 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46033       isPowerOf2_32(VT.getVectorNumElements())) {
46034     unsigned NumElts = VT.getVectorNumElements();
46035 SDValue Src = N0;
46036
46037 // Peek through truncate.
46038 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46039 Src = N0.getOperand(0);
46040
46041 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46042 SDValue MovmskIn = Src.getOperand(0);
46043 MVT MovmskVT = MovmskIn.getSimpleValueType();
46044 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46045
46046 // We allow extra bits of the movmsk to be used since they are known zero.
46047 // We can't convert a VPMOVMSKB without avx512bw.
46048 if (MovMskElts <= NumElts &&
46049 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46050 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46051 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46052 SDLoc dl(N);
46053 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46054 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46055 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46056 if (EVT(CmpVT) == VT)
46057 return Cmp;
46058
46059 // Pad with zeroes up to original VT to replace the zeroes that were
46060 // being used from the MOVMSK.
46061 unsigned NumConcats = NumElts / MovMskElts;
46062 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46063 Ops[0] = Cmp;
46064 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46065 }
46066 }
46067 }
46068
46069 // Try to remove bitcasts from input and output of mask arithmetic to
46070 // remove GPR<->K-register crossings.
46071 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46072 return V;
46073
46074 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46075 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46076 SrcVT.getVectorNumElements() == 1)
46077 return N0.getOperand(1);
46078
46079 // Convert a bitcasted integer logic operation that has one bitcasted
46080 // floating-point operand into a floating-point logic operation. This may
46081 // create a load of a constant, but that is cheaper than materializing the
46082 // constant in an integer register and transferring it to an SSE register or
46083 // transferring the SSE operand to integer register and back.
46084 unsigned FPOpcode;
46085 switch (N0.getOpcode()) {
46086 // clang-format off
46087 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46088 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46089 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46090 default: return SDValue();
46091 // clang-format on
46092 }
46093
46094 // Check if we have a bitcast from another integer type as well.
46095 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46096 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46097 (Subtarget.hasFP16() && VT == MVT::f16) ||
46098 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46099 TLI.isTypeLegal(VT))))
46100 return SDValue();
46101
46102 SDValue LogicOp0 = N0.getOperand(0);
46103 SDValue LogicOp1 = N0.getOperand(1);
46104 SDLoc DL0(N0);
46105
46106 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46107 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46108 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46109 LogicOp0.getOperand(0).getValueType() == VT &&
46110 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46111 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46112 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46113 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46114 }
46115 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46116 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46117 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46118 LogicOp1.getOperand(0).getValueType() == VT &&
46119 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46120 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46121 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46122 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46123 }
46124
46125 return SDValue();
46126}
46127
46128 // (mul (zext a), (sext b))
46129static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46130 SDValue &Op1) {
46131 Op0 = Mul.getOperand(0);
46132 Op1 = Mul.getOperand(1);
46133
46134   // Operand 1 should be the sign-extended value; swap if needed.
46135 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46136 std::swap(Op0, Op1);
46137
46138 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46139 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46140 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46141 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46142 return true;
46143
46144 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46145 return (BV && BV->isConstant());
46146 };
46147
46148   // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46149   // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46150   // signed value, so we just check that it has no more than 8 significant bits.
46151 if ((IsFreeTruncation(Op0) &&
46152 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46153 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46154 return true;
46155
46156 return false;
46157}
46158
46159 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46160                               unsigned &LogBias, const SDLoc &DL,
46161 const X86Subtarget &Subtarget) {
46162 // Extend or truncate to MVT::i8 first.
46163 MVT Vi8VT =
46164 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46165 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46166 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46167
46168 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46169 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46170 // The src A, B element type is i8, but the dst C element type is i32.
46171 // When we calculate the reduce stage, we use src vector type vXi8 for it
46172 // so we need logbias 2 to avoid extra 2 stages.
46173 LogBias = 2;
46174
46175 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46176 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46177 RegSize = std::max(512u, RegSize);
46178
46179 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46180 // fill in the missing vector elements with 0.
46181 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46182 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46183 Ops[0] = LHS;
46184 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46185 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46186 Ops[0] = RHS;
46187 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46188
46189 // Actually build the DotProduct, split as 256/512 bits for
46190 // AVXVNNI/AVX512VNNI.
46191 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46192                       ArrayRef<SDValue> Ops) {
46193     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46194 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46195 };
46196 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46197 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46198
46199 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46200 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46201}
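// For illustration: VPDPBUSD computes, per i32 lane,
//   C[i] += A[4i]*B[4i] + A[4i+1]*B[4i+1] + A[4i+2]*B[4i+2] + A[4i+3]*B[4i+3]
// with A treated as unsigned i8 and B as signed i8. Because each instruction
// already folds 4 adjacent byte products into one i32, the later reduction can
// skip two halving stages; that is what LogBias = 2 communicates to the caller.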
46202
46203// Create a PSADBW given two sources representable as zexts of vXi8.
46204 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46205                             const SDLoc &DL, const X86Subtarget &Subtarget) {
46206 // Find the appropriate width for the PSADBW.
46207 EVT DstVT = N0.getValueType();
46208 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46209 DstVT.getVectorElementCount());
46210 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46211
46212 // Widen the vXi8 vectors, padding with zero vector elements.
46213 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46214 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46215 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46216 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46217 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46218 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46219 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46220
46221 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46222 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46223                           ArrayRef<SDValue> Ops) {
46224     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46225 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46226 };
46227 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46228 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46229 PSADBWBuilder);
46230}
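// For illustration: PSADBW sums |a[j] - b[j]| over each group of 8 bytes into a
// 64-bit lane, so with one operand zero it acts as a horizontal byte sum. The
// zero-padding above widens the inputs to a legal 128/256/512-bit width without
// affecting the sums, since the extra byte pairs each contribute 0.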
46231
46232 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46233// PHMINPOSUW.
46234 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46235                                       const X86Subtarget &Subtarget) {
46236 // Bail without SSE41.
46237 if (!Subtarget.hasSSE41())
46238 return SDValue();
46239
46240 EVT ExtractVT = Extract->getValueType(0);
46241 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46242 return SDValue();
46243
46244 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46245 ISD::NodeType BinOp;
46246 SDValue Src = DAG.matchBinOpReduction(
46247 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46248 if (!Src)
46249 return SDValue();
46250
46251 EVT SrcVT = Src.getValueType();
46252 EVT SrcSVT = SrcVT.getScalarType();
46253 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46254 return SDValue();
46255
46256 SDLoc DL(Extract);
46257 SDValue MinPos = Src;
46258
46259 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46260 while (SrcVT.getSizeInBits() > 128) {
46261 SDValue Lo, Hi;
46262 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46263 SrcVT = Lo.getValueType();
46264 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46265 }
46266 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46267 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46268 "Unexpected value type");
46269
46270 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46271 // to flip the value accordingly.
46272 SDValue Mask;
46273 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46274 if (BinOp == ISD::SMAX)
46275 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46276 else if (BinOp == ISD::SMIN)
46277 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46278 else if (BinOp == ISD::UMAX)
46279 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46280
46281 if (Mask)
46282 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46283
46284 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46285 // shuffling each upper element down and insert zeros. This means that the
46286 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46287 // ready for the PHMINPOS.
46288 if (ExtractVT == MVT::i8) {
46289     SDValue Upper = DAG.getVectorShuffle(
46290         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46291 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46292 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46293 }
46294
46295 // Perform the PHMINPOS on a v8i16 vector,
46296 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46297 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46298 MinPos = DAG.getBitcast(SrcVT, MinPos);
46299
46300 if (Mask)
46301 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46302
46303 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46304 DAG.getVectorIdxConstant(0, DL));
46305}
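// For illustration: PHMINPOSUW only computes an unsigned v8i16 minimum, so the
// other reductions are mapped onto it by an order-reversing XOR. E.g. for SMAX
// the mask is 0x7FFF: x ^ 0x7FFF turns "largest signed" into "smallest
// unsigned", and XORing the result with the same mask afterwards recovers the
// original value.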
46306
46307// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46308 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46309                                          const X86Subtarget &Subtarget) {
46310 // Bail without SSE2.
46311 if (!Subtarget.hasSSE2())
46312 return SDValue();
46313
46314 EVT ExtractVT = Extract->getValueType(0);
46315 unsigned BitWidth = ExtractVT.getSizeInBits();
46316 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46317 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46318 return SDValue();
46319
46320 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46321 ISD::NodeType BinOp;
46322 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46323 if (!Match && ExtractVT == MVT::i1)
46324 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46325 if (!Match)
46326 return SDValue();
46327
46328 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46329 // which we can't support here for now.
46330 if (Match.getScalarValueSizeInBits() != BitWidth)
46331 return SDValue();
46332
46333 SDValue Movmsk;
46334 SDLoc DL(Extract);
46335 EVT MatchVT = Match.getValueType();
46336 unsigned NumElts = MatchVT.getVectorNumElements();
46337 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46339 LLVMContext &Ctx = *DAG.getContext();
46340
46341 if (ExtractVT == MVT::i1) {
46342 // Special case for (pre-legalization) vXi1 reductions.
46343 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46344 return SDValue();
46345 if (Match.getOpcode() == ISD::SETCC) {
46346 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46347 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46348 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46349 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46350 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
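// E.g. all_of(setcc(v2i64 X, v2i64 Y, eq)) holds iff the two vectors are
// bit-identical, so it can be lowered as a single wide equality test of the
// bitcast inputs.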
46351 X86::CondCode X86CC;
46352 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46353 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46354 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46355 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46356 DAG, X86CC))
46357 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46358 getSETCC(X86CC, V, DL, DAG));
46359 }
46360 }
46361 if (TLI.isTypeLegal(MatchVT)) {
46362 // If this is a legal AVX512 predicate type then we can just bitcast.
46363 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46364 Movmsk = DAG.getBitcast(MovmskVT, Match);
46365 } else {
46366 // Use combineBitcastvxi1 to create the MOVMSK.
46367 while (NumElts > MaxElts) {
46368 SDValue Lo, Hi;
46369 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46370 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46371 NumElts /= 2;
46372 }
46373 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46374 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46375 }
46376 if (!Movmsk)
46377 return SDValue();
46378 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46379 } else {
46380 // FIXME: Better handling of k-registers or 512-bit vectors?
46381 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46382 if (!(MatchSizeInBits == 128 ||
46383 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46384 return SDValue();
46385
46386 // Make sure this isn't a vector of 1 element. The perf win from using
46387 // MOVMSK diminishes with fewer elements in the reduction, but it is
46388 // generally better to get the comparison over to the GPRs as soon as
46389 // possible to reduce the number of vector ops.
46390 if (Match.getValueType().getVectorNumElements() < 2)
46391 return SDValue();
46392
46393 // Check that we are extracting a reduction of all sign bits.
46394 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46395 return SDValue();
46396
46397 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46398 SDValue Lo, Hi;
46399 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46400 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46401 MatchSizeInBits = Match.getValueSizeInBits();
46402 }
46403
46404 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46405 MVT MaskSrcVT;
46406 if (64 == BitWidth || 32 == BitWidth)
46407 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46408 MatchSizeInBits / BitWidth);
46409 else
46410 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46411
46412 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46413 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46414 NumElts = MaskSrcVT.getVectorNumElements();
46415 }
46416 assert((NumElts <= 32 || NumElts == 64) &&
46417 "Not expecting more than 64 elements");
46418
46419 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46420 if (BinOp == ISD::XOR) {
46421 // parity -> (PARITY(MOVMSK X))
46422 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46423 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46424 }
46425
46426 SDValue CmpC;
46427 ISD::CondCode CondCode;
46428 if (BinOp == ISD::OR) {
46429 // any_of -> MOVMSK != 0
46430 CmpC = DAG.getConstant(0, DL, CmpVT);
46431 CondCode = ISD::CondCode::SETNE;
46432 } else {
46433 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46434 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46435 DL, CmpVT);
46436 CondCode = ISD::CondCode::SETEQ;
46437 }
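// E.g. for a v4i32 reduction MOVMSKPS produces 4 sign bits, so any_of tests
// MOVMSK != 0 and all_of tests MOVMSK == 0xF.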
46438
46439 // The setcc produces an i8 of 0/1, so extend that to the result width and
46440 // negate to get the final 0/-1 mask value.
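// E.g. an i8 any_of result becomes zext(MOVMSK != 0) negated, i.e. 0x00 when
// no element is set and 0xFF when at least one element is set.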
46441 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46442 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46443 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46444 return DAG.getNegative(Zext, DL, ExtractVT);
46445}
46446
46447 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46448 const X86Subtarget &Subtarget) {
46449 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46450 return SDValue();
46451
46452 EVT ExtractVT = Extract->getValueType(0);
46453 // Verify the type we're extracting is i32, as the output element type of
46454 // vpdpbusd is i32.
46455 if (ExtractVT != MVT::i32)
46456 return SDValue();
46457
46458 EVT VT = Extract->getOperand(0).getValueType();
46459 if (!isPowerOf2_32(VT.getVectorNumElements()))
46460 return SDValue();
46461
46462 // Match shuffle + add pyramid.
46463 ISD::NodeType BinOp;
46464 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46465
46466 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46467 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46468 // before adding into the accumulator.
46469 // TODO:
46470 // We also need to verify that the multiply has at least 2x the number of bits
46471 // of the input. We shouldn't match
46472 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
46473 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46474 // Root = Root.getOperand(0);
46475
46476 // If there was a match, we want Root to be a mul.
46477 if (!Root || Root.getOpcode() != ISD::MUL)
46478 return SDValue();
46479
46480 // Check whether we have an extend and mul pattern
46481 SDValue LHS, RHS;
46482 if (!detectExtMul(DAG, Root, LHS, RHS))
46483 return SDValue();
46484
46485 // Create the dot product instruction.
46486 SDLoc DL(Extract);
46487 unsigned StageBias;
46488 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46489
46490 // If the original vector was wider than 4 elements, sum over the results
46491 // in the DP vector.
46492 unsigned Stages = Log2_32(VT.getVectorNumElements());
46493 EVT DpVT = DP.getValueType();
46494
46495 if (Stages > StageBias) {
46496 unsigned DpElems = DpVT.getVectorNumElements();
46497
46498 for (unsigned i = Stages - StageBias; i > 0; --i) {
46499 SmallVector<int, 16> Mask(DpElems, -1);
46500 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46501 Mask[j] = MaskEnd + j;
46502
46503 SDValue Shuffle =
46504 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46505 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46506 }
46507 }
46508
46509 // Return the lowest ExtractSizeInBits bits.
46510 EVT ResVT =
46511 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46512 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46513 DP = DAG.getBitcast(ResVT, DP);
46514 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46515 Extract->getOperand(1));
46516}
46517
46518 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46519 const X86Subtarget &Subtarget) {
46520 using namespace SDPatternMatch;
46521
46522 // PSADBW is only supported on SSE2 and up.
46523 if (!Subtarget.hasSSE2())
46524 return SDValue();
46525
46526 EVT ExtractVT = Extract->getValueType(0);
46527 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46528 ExtractVT != MVT::i64)
46529 return SDValue();
46530
46531 EVT VT = Extract->getOperand(0).getValueType();
46532 if (!isPowerOf2_32(VT.getVectorNumElements()))
46533 return SDValue();
46534
46535 // Match shuffle + add pyramid.
46536 ISD::NodeType BinOp;
46537 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46538 if (!Root)
46539 return SDValue();
46540
46541 // The operand is expected to be zero-extended from i8.
46542 // In order to convert to i64 and above, an additional any/zero/sign
46543 // extend is expected.
46544 // The zero extend from 32 bits has no mathematical effect on the result.
46545 // Also, the sign extend here is effectively a zero extend
46546 // (it extends the sign bit, which is zero).
46547 // So it is correct to skip the sign/zero extend instruction.
46548 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46549 Root.getOpcode() == ISD::ZERO_EXTEND ||
46550 Root.getOpcode() == ISD::ANY_EXTEND)
46551 Root = Root.getOperand(0);
46552
46553 // Check whether we have a vXi8 abdu pattern.
46554 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46555 SDValue Src0, Src1;
46556 if (!sd_match(
46557 Root,
46558 m_AnyOf(
46559 m_SpecificVectorElementVT(
46560 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46561 m_SpecificVectorElementVT(
46562 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46563 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46564 m_Abs(
46565 m_Sub(m_AllOf(m_Value(Src0),
46566 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46567 m_AllOf(m_Value(Src1),
46568 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46569 return SDValue();
46570
46571 // Create the SAD instruction.
46572 SDLoc DL(Extract);
46573 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46574
46575 // If the original vector was wider than 8 elements, sum over the results
46576 // in the SAD vector.
46577 unsigned Stages = Log2_32(VT.getVectorNumElements());
46578 EVT SadVT = SAD.getValueType();
46579 if (Stages > 3) {
46580 unsigned SadElems = SadVT.getVectorNumElements();
46581
46582 for (unsigned i = Stages - 3; i > 0; --i) {
46583 SmallVector<int, 16> Mask(SadElems, -1);
46584 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46585 Mask[j] = MaskEnd + j;
46586
46587 SDValue Shuffle =
46588 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46589 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46590 }
46591 }
46592
46593 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46594 // Return the lowest ExtractSizeInBits bits.
46595 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46596 SadVT.getSizeInBits() / ExtractSizeInBits);
46597 SAD = DAG.getBitcast(ResVT, SAD);
46598 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46599 Extract->getOperand(1));
46600}
46601
46602// If this extract is from a loaded vector value and will be used as an
46603// integer, that requires a potentially expensive XMM -> GPR transfer.
46604// Additionally, if we can convert to a scalar integer load, that will likely
46605// be folded into a subsequent integer op.
46606// Note: SrcVec might not have a VecVT type, but it must be the same size.
46607// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46608// to a single-use of the loaded vector. For the reasons above, we
46609 // expect this to be profitable even if it creates an extra load.
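// E.g. (i32 (extract_elt (load v4i32 p), 2)) can become (i32 (load p+8)),
// reusing the original load's chain and offset-adjusted pointer info and
// alignment.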
46610static SDValue
46611 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46612 const SDLoc &dl, SelectionDAG &DAG,
46613 TargetLowering::DAGCombinerInfo &DCI) {
46614 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46615 "Only EXTRACT_VECTOR_ELT supported so far");
46616
46617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46618 EVT VT = N->getValueType(0);
46619
46620 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46621 return Use->getOpcode() == ISD::STORE ||
46622 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46623 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46624 });
46625
46626 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46627 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46628 VecVT.getVectorElementType() == VT &&
46629 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46630 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46631 SDValue NewPtr = TLI.getVectorElementPointer(
46632 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46633 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46634 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46635 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46636 SDValue Load =
46637 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46638 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46639 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46640 return Load;
46641 }
46642
46643 return SDValue();
46644}
46645
46646// Attempt to peek through a target shuffle and extract the scalar from the
46647// source.
46648 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46649 TargetLowering::DAGCombinerInfo &DCI,
46650 const X86Subtarget &Subtarget) {
46651 if (DCI.isBeforeLegalizeOps())
46652 return SDValue();
46653
46654 SDLoc dl(N);
46655 SDValue Src = N->getOperand(0);
46656 SDValue Idx = N->getOperand(1);
46657
46658 EVT VT = N->getValueType(0);
46659 EVT SrcVT = Src.getValueType();
46660 EVT SrcSVT = SrcVT.getVectorElementType();
46661 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46662 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46663
46664 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46665 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46666 return SDValue();
46667
46668 const APInt &IdxC = N->getConstantOperandAPInt(1);
46669 if (IdxC.uge(NumSrcElts))
46670 return SDValue();
46671
46672 SDValue SrcBC = peekThroughBitcasts(Src);
46673
46674 // Handle extract(bitcast(broadcast(scalar_value))).
46675 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46676 SDValue SrcOp = SrcBC.getOperand(0);
46677 EVT SrcOpVT = SrcOp.getValueType();
46678 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46679 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46680 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46681 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46682 // TODO support non-zero offsets.
46683 if (Offset == 0) {
46684 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46685 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46686 return SrcOp;
46687 }
46688 }
46689 }
46690
46691 // If we're extracting a single element from a broadcast load and there are
46692 // no other users, just create a single load.
46693 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46694 SrcBC.hasOneUse()) {
46695 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46696 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46697 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46698 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46699 SDValue Load =
46700 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46701 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46702 MemIntr->getMemOperand()->getFlags());
46703 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46704 return Load;
46705 }
46706 }
46707
46708 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46709 // TODO: Move to DAGCombine?
46710 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46711 SrcBC.getValueType().isInteger() &&
46712 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46713 SrcBC.getScalarValueSizeInBits() ==
46714 SrcBC.getOperand(0).getValueSizeInBits()) {
46715 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46716 if (IdxC.ult(Scale)) {
46717 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46718 SDValue Scl = SrcBC.getOperand(0);
46719 EVT SclVT = Scl.getValueType();
46720 if (Offset) {
46721 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46722 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46723 }
46724 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46725 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46726 return Scl;
46727 }
46728 }
46729
46730 // Handle extract(truncate(x)) for 0'th index.
46731 // TODO: Treat this as a faux shuffle?
46732 // TODO: When can we use this for general indices?
46733 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46734 (SrcVT.getSizeInBits() % 128) == 0) {
46735 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46736 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46737 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46738 Idx);
46739 }
46740
46741 // We can only legally extract other elements from 128-bit vectors and in
46742 // certain circumstances, depending on SSE-level.
46743 // TODO: Investigate float/double extraction if it will be just stored.
46744 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46745 unsigned Idx) {
46746 EVT VecSVT = VecVT.getScalarType();
46747 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46748 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46749 VecSVT == MVT::i64)) {
46750 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46751 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46752 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46753 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46754 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46755 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46756 Idx &= (NumEltsPerLane - 1);
46757 }
46758 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46759 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46760 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46761 DAG.getBitcast(VecVT, Vec),
46762 DAG.getVectorIdxConstant(Idx, dl));
46763 }
46764 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46765 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46766 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46767 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46768 DAG.getTargetConstant(Idx, dl, MVT::i8));
46769 }
46770 return SDValue();
46771 };
46772
46773 // Resolve the target shuffle inputs and mask.
46774 SmallVector<int, 16> Mask;
46775 SmallVector<SDValue, 2> Ops;
46776 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46777 return SDValue();
46778
46779 // Shuffle inputs must be the same size as the result.
46780 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46781 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46782 }))
46783 return SDValue();
46784
46785 // Attempt to narrow/widen the shuffle mask to the correct size.
46786 if (Mask.size() != NumSrcElts) {
46787 if ((NumSrcElts % Mask.size()) == 0) {
46788 SmallVector<int, 16> ScaledMask;
46789 int Scale = NumSrcElts / Mask.size();
46790 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46791 Mask = std::move(ScaledMask);
46792 } else if ((Mask.size() % NumSrcElts) == 0) {
46793 // Simplify Mask based on demanded element.
46794 int ExtractIdx = (int)IdxC.getZExtValue();
46795 int Scale = Mask.size() / NumSrcElts;
46796 int Lo = Scale * ExtractIdx;
46797 int Hi = Scale * (ExtractIdx + 1);
46798 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46799 if (i < Lo || Hi <= i)
46800 Mask[i] = SM_SentinelUndef;
46801
46802 SmallVector<int, 16> WidenedMask;
46803 while (Mask.size() > NumSrcElts &&
46804 canWidenShuffleElements(Mask, WidenedMask))
46805 Mask = std::move(WidenedMask);
46806 }
46807 }
46808
46809 // If narrowing/widening failed, see if we can extract+zero-extend.
46810 int ExtractIdx;
46811 EVT ExtractVT;
46812 if (Mask.size() == NumSrcElts) {
46813 ExtractIdx = Mask[IdxC.getZExtValue()];
46814 ExtractVT = SrcVT;
46815 } else {
46816 unsigned Scale = Mask.size() / NumSrcElts;
46817 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46818 return SDValue();
46819 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46820 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46821 return SDValue();
46822 ExtractIdx = Mask[ScaledIdx];
46823 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46824 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46825 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46826 "Failed to widen vector type");
46827 }
46828
46829 // If the shuffle source element is undef/zero then we can just accept it.
46830 if (ExtractIdx == SM_SentinelUndef)
46831 return DAG.getUNDEF(VT);
46832
46833 if (ExtractIdx == SM_SentinelZero)
46834 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46835 : DAG.getConstant(0, dl, VT);
46836
46837 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46838 ExtractIdx = ExtractIdx % Mask.size();
46839 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46840 return DAG.getZExtOrTrunc(V, dl, VT);
46841
46842 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46843 if (SDValue V = combineExtractFromVectorLoad(
46844 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46845 return V;
46846
46847 return SDValue();
46848}
46849
46850/// Extracting a scalar FP value from vector element 0 is free, so extract each
46851/// operand first, then perform the math as a scalar op.
46852 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46853 const X86Subtarget &Subtarget,
46854 TargetLowering::DAGCombinerInfo &DCI) {
46855 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46856 SDValue Vec = ExtElt->getOperand(0);
46857 SDValue Index = ExtElt->getOperand(1);
46858 EVT VT = ExtElt->getValueType(0);
46859 EVT VecVT = Vec.getValueType();
46860
46861 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46862 // non-zero element because the shuffle+scalar op will be cheaper?
46863 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46864 return SDValue();
46865
46866 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46867 // extract, the condition code), so deal with those as a special-case.
46868 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46869 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46870 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46871 return SDValue();
46872
46873 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46874 SDLoc DL(ExtElt);
46875 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46876 Vec.getOperand(0), Index);
46877 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46878 Vec.getOperand(1), Index);
46879 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46880 }
46881
46882 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46883 VT != MVT::f64)
46884 return SDValue();
46885
46886 // Vector FP selects don't fit the pattern of FP math ops (because the
46887 // condition has a different type and we have to change the opcode), so deal
46888 // with those here.
46889 // FIXME: This is restricted to pre type legalization. If we loosen this we
46890 // need to convert vector bool to a scalar bool.
46891 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46892 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46893 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46894 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46895 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46896 SDLoc DL(ExtElt);
46899 Vec.getOperand(0), Index);
46900 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46901 Vec.getOperand(1), Index);
46902 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46903 Vec.getOperand(2), Index);
46904 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46905 }
46906
46907 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46908 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46909 // missed load folding and fma+fneg combining.
46910 switch (Vec.getOpcode()) {
46911 case ISD::FMA: // Begin 3 operands
46912 case ISD::FMAD:
46913 case ISD::FADD: // Begin 2 operands
46914 case ISD::FSUB:
46915 case ISD::FMUL:
46916 case ISD::FDIV:
46917 case ISD::FREM:
46918 case ISD::FCOPYSIGN:
46919 case ISD::FMINNUM:
46920 case ISD::FMAXNUM:
46921 case ISD::FMINNUM_IEEE:
46922 case ISD::FMAXNUM_IEEE:
46923 case ISD::FMAXIMUM:
46924 case ISD::FMINIMUM:
46925 case ISD::FMAXIMUMNUM:
46926 case ISD::FMINIMUMNUM:
46927 case X86ISD::FMAX:
46928 case X86ISD::FMIN:
46929 case ISD::FABS: // Begin 1 operand
46930 case ISD::FSQRT:
46931 case ISD::FRINT:
46932 case ISD::FCEIL:
46933 case ISD::FTRUNC:
46934 case ISD::FNEARBYINT:
46935 case ISD::FROUNDEVEN:
46936 case ISD::FROUND:
46937 case ISD::FFLOOR:
46938 case X86ISD::FRCP:
46939 case X86ISD::FRSQRT: {
46940 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46941 SDLoc DL(ExtElt);
46942 SmallVector<SDValue, 4> ExtOps;
46943 for (SDValue Op : Vec->ops())
46944 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46945 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46946 }
46947 default:
46948 return SDValue();
46949 }
46950 llvm_unreachable("All opcodes should return within switch");
46951}
46952
46953/// Try to convert a vector reduction sequence composed of binops and shuffles
46954/// into horizontal ops.
46955 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46956 const X86Subtarget &Subtarget) {
46957 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46958
46959 // We need at least SSE2 to do anything here.
46960 if (!Subtarget.hasSSE2())
46961 return SDValue();
46962
46963 ISD::NodeType Opc;
46964 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46965 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46966 if (!Rdx)
46967 return SDValue();
46968
46969 SDValue Index = ExtElt->getOperand(1);
46970 assert(isNullConstant(Index) &&
46971 "Reduction doesn't end in an extract from index 0");
46972
46973 EVT VT = ExtElt->getValueType(0);
46974 EVT VecVT = Rdx.getValueType();
46975 if (VecVT.getScalarType() != VT)
46976 return SDValue();
46977
46978 SDLoc DL(ExtElt);
46979 unsigned NumElts = VecVT.getVectorNumElements();
46980 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46981
46982 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46983 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46984 if (V.getValueType() == MVT::v4i8) {
46985 if (ZeroExtend && Subtarget.hasSSE41()) {
46986 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46987 DAG.getConstant(0, DL, MVT::v4i32),
46988 DAG.getBitcast(MVT::i32, V),
46989 DAG.getVectorIdxConstant(0, DL));
46990 return DAG.getBitcast(MVT::v16i8, V);
46991 }
46992 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46993 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46994 : DAG.getUNDEF(MVT::v4i8));
46995 }
46996 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46997 DAG.getUNDEF(MVT::v8i8));
46998 };
46999
47000 // vXi8 mul reduction - promote to vXi16 mul reduction.
47001 if (Opc == ISD::MUL) {
47002 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47003 return SDValue();
47004 if (VecVT.getSizeInBits() >= 128) {
47005 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47006 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47007 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47008 Lo = DAG.getBitcast(WideVT, Lo);
47009 Hi = DAG.getBitcast(WideVT, Hi);
47010 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47011 while (Rdx.getValueSizeInBits() > 128) {
47012 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47013 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47014 }
47015 } else {
47016 Rdx = WidenToV16I8(Rdx, false);
47017 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47018 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47019 }
47020 if (NumElts >= 8)
47021 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47022 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47023 {4, 5, 6, 7, -1, -1, -1, -1}));
47024 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47025 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47026 {2, 3, -1, -1, -1, -1, -1, -1}));
47027 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47028 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47029 {1, -1, -1, -1, -1, -1, -1, -1}));
47030 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47031 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47032 }
47033
47034 // vXi8 add reduction - sub-128-bit vector.
47035 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47036 Rdx = WidenToV16I8(Rdx, true);
47037 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47038 DAG.getConstant(0, DL, MVT::v16i8));
47039 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47040 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47041 }
47042
47043 // Must be a >=128-bit vector with pow2 elements.
47044 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47045 return SDValue();
47046
47047 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47048 if (VT == MVT::i8) {
47049 while (Rdx.getValueSizeInBits() > 128) {
47050 SDValue Lo, Hi;
47051 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47052 VecVT = Lo.getValueType();
47053 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47054 }
47055 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47056
47057 SDValue Hi = DAG.getVectorShuffle(
47058 MVT::v16i8, DL, Rdx, Rdx,
47059 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47060 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47061 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47062 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47063 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47065 }
47066
47067 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47068 // If the source vector values are 0-255, then we can use PSADBW to
47069 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47070 // TODO: See if it's worth avoiding vXi16/i32 truncations?
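// PSADBW against an all-zeroes vector adds each group of 8 bytes into the
// corresponding i64 lane, so it sums and zero-extends in a single step.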
47071 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47072 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47073 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47074 Subtarget.hasAVX512())) {
47075 if (Rdx.getValueType() == MVT::v8i16) {
47076 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47077 DAG.getUNDEF(MVT::v8i16));
47078 } else {
47079 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47080 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47081 if (ByteVT.getSizeInBits() < 128)
47082 Rdx = WidenToV16I8(Rdx, true);
47083 }
47084
47085 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47086 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47087 ArrayRef<SDValue> Ops) {
47088 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47089 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47090 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47091 };
47092 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47093 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47094
47095 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47096 while (Rdx.getValueSizeInBits() > 128) {
47097 SDValue Lo, Hi;
47098 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47099 VecVT = Lo.getValueType();
47100 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47101 }
47102 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47103
47104 if (NumElts > 8) {
47105 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47106 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47107 }
47108
47109 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47110 Rdx = DAG.getBitcast(VecVT, Rdx);
47111 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47112 }
47113
47114 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
47115 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47116 return SDValue();
47117
47118 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47119
47120 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47121 // across the whole vector, so we need an extract + hop preliminary stage.
47122 // This is the only step where the operands of the hop are not the same value.
47123 // TODO: We could extend this to handle 512-bit or even longer vectors.
47124 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47125 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47126 unsigned NumElts = VecVT.getVectorNumElements();
47127 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47128 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47129 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47130 VecVT = Rdx.getValueType();
47131 }
47132 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47133 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47134 return SDValue();
47135
47136 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47137 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47138 for (unsigned i = 0; i != ReductionSteps; ++i)
47139 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47140
47141 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47142}
47143
47144/// Detect vector gather/scatter index generation and convert it from being a
47145/// bunch of shuffles and extracts into a somewhat faster sequence.
47146/// For i686, the best sequence is apparently storing the value and loading
47147/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47148 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47149 TargetLowering::DAGCombinerInfo &DCI,
47150 const X86Subtarget &Subtarget) {
47151 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47152 return NewOp;
47153
47154 SDValue InputVector = N->getOperand(0);
47155 SDValue EltIdx = N->getOperand(1);
47156 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47157
47158 EVT SrcVT = InputVector.getValueType();
47159 EVT VT = N->getValueType(0);
47160 SDLoc dl(InputVector);
47161 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47162 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47163 unsigned NumEltBits = VT.getScalarSizeInBits();
47164 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47165
47166 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47167 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47168
47169 // Integer Constant Folding.
47170 if (CIdx && VT.isInteger()) {
47171 APInt UndefVecElts;
47172 SmallVector<APInt, 16> EltBits;
47173 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47174 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47175 EltBits, /*AllowWholeUndefs*/ true,
47176 /*AllowPartialUndefs*/ false)) {
47177 uint64_t Idx = CIdx->getZExtValue();
47178 if (UndefVecElts[Idx])
47179 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47180 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47181 }
47182
47183 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47184 // Improves lowering of bool masks on Rust, which splits them into a byte array.
47185 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47186 SDValue Src = peekThroughBitcasts(InputVector);
47187 if (Src.getValueType().getScalarType() == MVT::i1 &&
47188 TLI.isTypeLegal(Src.getValueType())) {
47189 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47190 SDValue Sub = DAG.getNode(
47191 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47192 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47193 return DAG.getBitcast(VT, Sub);
47194 }
47195 }
47196 }
47197
47198 if (IsPextr) {
47199 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47200 DCI))
47201 return SDValue(N, 0);
47202
47203 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47204 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47205 InputVector.getOpcode() == X86ISD::PINSRW) &&
47206 InputVector.getOperand(2) == EltIdx) {
47207 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47208 "Vector type mismatch");
47209 SDValue Scl = InputVector.getOperand(1);
47210 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47211 return DAG.getZExtOrTrunc(Scl, dl, VT);
47212 }
47213
47214 // TODO - Remove this once we can handle the implicit zero-extension of
47215 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47216 // combineBasicSADPattern.
47217 return SDValue();
47218 }
47219
47220 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47221 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47222 InputVector.getOpcode() == ISD::BITCAST &&
47223 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47224 isNullConstant(EltIdx) && InputVector.hasOneUse())
47225 return DAG.getBitcast(VT, InputVector);
47226
47227 // Detect mmx to i32 conversion through a v2i32 elt extract.
47228 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47229 InputVector.getOpcode() == ISD::BITCAST &&
47230 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47231 isNullConstant(EltIdx) && InputVector.hasOneUse())
47232 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47233 InputVector.getOperand(0));
47234
47235 // Check whether this extract is the root of a sum of absolute differences
47236 // pattern. This has to be done here because we really want it to happen
47237 // pre-legalization.
47238 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47239 return SAD;
47240
47241 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47242 return VPDPBUSD;
47243
47244 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47245 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47246 return Cmp;
47247
47248 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47249 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47250 return MinMax;
47251
47252 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc.
47253 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47254 return V;
47255
47256 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47257 return V;
47258
47259 if (CIdx)
47260 if (SDValue V = combineExtractFromVectorLoad(
47261 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47262 dl, DAG, DCI))
47263 return V;
47264
47265 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47266 // and then testing the relevant element.
47267 //
47268 // Note that we only combine extracts on the *same* result number, i.e.
47269 // t0 = merge_values a0, a1, a2, a3
47270 // i1 = extract_vector_elt t0, Constant:i64<2>
47271 // i1 = extract_vector_elt t0, Constant:i64<3>
47272 // but not
47273 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47274 // since the latter would need its own MOVMSK.
47275 if (SrcVT.getScalarType() == MVT::i1) {
47276 bool IsVar = !CIdx;
47277 SmallVector<SDNode *, 16> BoolExtracts;
47278 unsigned ResNo = InputVector.getResNo();
47279 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47280 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47281 Use->getOperand(0).getResNo() == ResNo &&
47282 Use->getValueType(0) == MVT::i1) {
47283 BoolExtracts.push_back(Use);
47284 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47285 return true;
47286 }
47287 return false;
47288 };
47289 // TODO: Can we drop the oneuse check for constant extracts?
47290 if (all_of(InputVector->users(), IsBoolExtract) &&
47291 (IsVar || BoolExtracts.size() > 1)) {
47292 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47293 if (SDValue BC =
47294 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47295 for (SDNode *Use : BoolExtracts) {
47296 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47297 // Mask = 1 << MaskIdx
47298 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47299 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47300 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47301 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47302 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47303 DCI.CombineTo(Use, Res);
47304 }
47305 return SDValue(N, 0);
47306 }
47307 }
47308 }
47309
47310 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47311 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47312 SDValue TruncSrc = InputVector.getOperand(0);
47313 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47314 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47315 SDValue NewExt =
47316 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47317 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47318 }
47319 }
47320
47321 return SDValue();
47322}
47323
47324// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47325// This is more or less the reverse of combineBitcastvxi1.
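// E.g. (v8i16 (sext (v8i1 (bitcast (i8 X))))) can be lowered by broadcasting
// X, AND'ing each lane with its bit (1 << lane), comparing the result for
// equality against that same bit mask, and sign-extending the compare result.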
47326 static SDValue combineToExtendBoolVectorInReg(
47327 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47328 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47329 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47330 Opcode != ISD::ANY_EXTEND)
47331 return SDValue();
47332 if (!DCI.isBeforeLegalizeOps())
47333 return SDValue();
47334 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47335 return SDValue();
47336
47337 EVT SVT = VT.getScalarType();
47338 EVT InSVT = N0.getValueType().getScalarType();
47339 unsigned EltSizeInBits = SVT.getSizeInBits();
47340
47341 // Input type must be extending a bool vector (bit-casted from a scalar
47342 // integer) to legal integer types.
47343 if (!VT.isVector())
47344 return SDValue();
47345 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47346 return SDValue();
47347 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47348 return SDValue();
47349
47350 SDValue N00 = N0.getOperand(0);
47351 EVT SclVT = N00.getValueType();
47352 if (!SclVT.isScalarInteger())
47353 return SDValue();
47354
47355 SDValue Vec;
47356 SmallVector<int> ShuffleMask;
47357 unsigned NumElts = VT.getVectorNumElements();
47358 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47359
47360 // Broadcast the scalar integer to the vector elements.
47361 if (NumElts > EltSizeInBits) {
47362 // If the scalar integer is greater than the vector element size, then we
47363 // must split it down into sub-sections for broadcasting. For example:
47364 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47365 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47366 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47367 unsigned Scale = NumElts / EltSizeInBits;
47368 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47369 bool UseBroadcast = Subtarget.hasInt256() &&
47370 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47371 Vec = UseBroadcast
47372 ? DAG.getSplat(BroadcastVT, DL, N00)
47373 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47374 Vec = DAG.getBitcast(VT, Vec);
47375
47376 for (unsigned i = 0; i != Scale; ++i) {
47377 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47378 ShuffleMask.append(EltSizeInBits, i + Offset);
47379 }
47380 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47381 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47382 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47383 // If we have register broadcast instructions, use the scalar size as the
47384 // element type for the shuffle. Then cast to the wider element type. The
47385 // widened bits won't be used, and this might allow the use of a broadcast
47386 // load.
47387 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47388 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47389 (NumElts * EltSizeInBits) / NumElts);
47390 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47391 } else {
47392 // For smaller scalar integers, we can simply any-extend it to the vector
47393 // element size (we don't care about the upper bits) and broadcast it to all
47394 // elements.
47395 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47396 }
47397
47398 // Now, mask the relevant bit in each element.
47399 SmallVector<SDValue, 32> Bits;
47400 for (unsigned i = 0; i != NumElts; ++i) {
47401 int BitIdx = (i % EltSizeInBits);
47402 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47403 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47404 }
47405 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47406 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47407
47408 // Compare against the bitmask and extend the result.
47409 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47410 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47411 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47412
47413 // For SEXT, this is now done, otherwise shift the result down for
47414 // zero-extension.
47415 if (Opcode == ISD::SIGN_EXTEND)
47416 return Vec;
47417 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47418 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47419}
47420
47421/// If both arms of a vector select are concatenated vectors, split the select,
47422/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47423/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47424/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47425 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47426 const X86Subtarget &Subtarget) {
47427 unsigned Opcode = N->getOpcode();
47428 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47429 return SDValue();
47430
47431 // TODO: Split 512-bit vectors too?
47432 EVT VT = N->getValueType(0);
47433 if (!VT.is256BitVector())
47434 return SDValue();
47435
47436 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47437 SDValue Cond = N->getOperand(0);
47438 SDValue TVal = N->getOperand(1);
47439 SDValue FVal = N->getOperand(2);
47440 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47441 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47442 return SDValue();
47443
47444 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47446 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47447 };
47448 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47449 /*CheckBWI*/ false);
47450}
47451
47452 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47453 const SDLoc &DL) {
47454 SDValue Cond = N->getOperand(0);
47455 SDValue LHS = N->getOperand(1);
47456 SDValue RHS = N->getOperand(2);
47457
47458 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47459 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47460 if (!TrueC || !FalseC)
47461 return SDValue();
47462
47463 // Don't do this for crazy integer types.
47464 EVT VT = N->getValueType(0);
47465 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47466 return SDValue();
47467
47468 // We're going to use the condition bit in math or logic ops. We could allow
47469 // this with a wider condition value (post-legalization it becomes an i8),
47470 // but if nothing is creating selects that late, it doesn't matter.
47471 if (Cond.getValueType() != MVT::i1)
47472 return SDValue();
47473
47474 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47475 // 3, 5, or 9 with i32/i64, so those get transformed too.
47476 // TODO: For constants that overflow or do not differ by power-of-2 or small
47477 // multiplier, convert to 'and' + 'add'.
47478 const APInt &TrueVal = TrueC->getAPIntValue();
47479 const APInt &FalseVal = FalseC->getAPIntValue();
47480
47481 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47482 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47483 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47484 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47485 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47486 return SDValue();
47487 }
47488
47489 bool OV;
47490 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47491 if (OV)
47492 return SDValue();
47493
47494 APInt AbsDiff = Diff.abs();
47495 if (AbsDiff.isPowerOf2() ||
47496 ((VT == MVT::i32 || VT == MVT::i64) &&
47497 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47498
47499 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47500 // of the condition can usually be folded into a compare predicate, but even
47501 // without that, the sequence should be cheaper than a CMOV alternative.
47502 if (TrueVal.slt(FalseVal)) {
47503 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47504 std::swap(TrueC, FalseC);
47505 }
47506
47507 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
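// E.g. select Cond, 6, 2 --> (zext(Cond) * 4) + 2, which lowers to a shift
// and an add.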
47508 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47509
47510 // Multiply condition by the difference if non-one.
47511 if (!AbsDiff.isOne())
47512 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47513
47514 // Add the base if non-zero.
47515 if (!FalseC->isZero())
47516 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47517
47518 return R;
47519 }
47520
47521 return SDValue();
47522}
47523
47524/// If this is a *dynamic* select (non-constant condition) and we can match
47525/// this node with one of the variable blend instructions, restructure the
47526/// condition so that blends can use the high (sign) bit of each element.
47527/// This function will also call SimplifyDemandedBits on already created
47528/// BLENDV to perform additional simplifications.
47529 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47530 const SDLoc &DL,
47531 TargetLowering::DAGCombinerInfo &DCI,
47532 const X86Subtarget &Subtarget) {
47533 SDValue Cond = N->getOperand(0);
47534 if ((N->getOpcode() != ISD::VSELECT &&
47535 N->getOpcode() != X86ISD::BLENDV) ||
47536 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47537 return SDValue();
47538
47539 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47540 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47541 EVT VT = N->getValueType(0);
47542
47543 // We can only handle the cases where VSELECT is directly legal on the
47544 // subtarget. We custom lower VSELECT nodes with constant conditions and
47545 // this makes it hard to see whether a dynamic VSELECT will correctly
47546 // lower, so we both check the operation's status and explicitly handle the
47547 // cases where a *dynamic* blend will fail even though a constant-condition
47548 // blend could be custom lowered.
47549 // FIXME: We should find a better way to handle this class of problems.
47550 // Potentially, we should combine constant-condition vselect nodes
47551 // pre-legalization into shuffles and not mark as many types as custom
47552 // lowered.
47553 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47554 return SDValue();
47555 // FIXME: We don't support i16-element blends currently. We could and
47556 // should support them by making *all* the bits in the condition be set
47557 // rather than just the high bit and using an i8-element blend.
47558 if (VT.getVectorElementType() == MVT::i16)
47559 return SDValue();
47560 // Dynamic blending was only available from SSE4.1 onward.
47561 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47562 return SDValue();
47563 // Byte blends are only available in AVX2.
47564 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47565 return SDValue();
47566 // There are no 512-bit blend instructions that use sign bits.
47567 if (VT.is512BitVector())
47568 return SDValue();
47569
47570 // Don't optimize before the condition has been transformed to a legal type
47571 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47572 if (BitWidth < 8 || BitWidth > 64)
47573 return SDValue();
47574
47575 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47576 for (SDUse &Use : Cond->uses())
47577 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47578 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47579 Use.getOperandNo() != 0)
47580 return false;
47581
47582 return true;
47583 };
47584
47585 APInt DemandedBits(APInt::getSignMask(BitWidth));
47586 
47587 if (OnlyUsedAsSelectCond(Cond)) {
47588 KnownBits Known;
47589 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47590 !DCI.isBeforeLegalizeOps());
47591 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47592 return SDValue();
47593
47594 // If we changed the computation somewhere in the DAG, this change will
47595 // affect all users of Cond. Update all the nodes so that we do not use
47596 // the generic VSELECT anymore. Otherwise, we may perform wrong
47597 // optimizations as we messed with the actual expectation for the vector
47598 // boolean values.
47599 for (SDNode *U : Cond->users()) {
47600 if (U->getOpcode() == X86ISD::BLENDV)
47601 continue;
47602
47603 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47604 Cond, U->getOperand(1), U->getOperand(2));
47605 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47606 DCI.AddToWorklist(U);
47607 }
47608 DCI.CommitTargetLoweringOpt(TLO);
47609 return SDValue(N, 0);
47610 }
47611
47612 // Otherwise we can still at least try to simplify multiple use bits.
47613 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47614 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47615 N->getOperand(1), N->getOperand(2));
47616
47617 return SDValue();
47618}
47619
47620// Try to match:
47621// (or (and (M, (sub 0, X)), (pandn M, X)))
47622// which is a special case of:
47623// (select M, (sub 0, X), X)
47624// Per:
47625// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47626// We know that, if fNegate is 0 or 1:
47627// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47628//
47629// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47630// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47631// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47632// This lets us transform our vselect to:
47633// (add (xor X, M), (and M, 1))
47634// And further to:
47635// (sub (xor X, M), M)
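// E.g. with M == 0 this yields (X ^ 0) - 0 == X, and with M == -1 it yields
// (X ^ -1) - (-1) == ~X + 1 == -X, matching the select semantics.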
47636 static SDValue combineLogicBlendIntoConditionalNegate(
47637 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47638 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47639 using namespace SDPatternMatch;
47640 EVT MaskVT = Mask.getValueType();
47641 assert(MaskVT.isInteger() &&
47642 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47643 "Mask must be zero/all-bits");
47644
47645 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47647 return SDValue();
47648
47649 SDValue V;
47650 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47651 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47652 return SDValue();
47653
47654 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47655 SDValue SubOp2 = Mask;
47656
47657 // If the negate was on the false side of the select, then
47658 // the operands of the SUB need to be swapped. PR 27251.
47659 // This is because the pattern being matched above is
47660 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
47661 // but if the pattern matched was
47662 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
47663 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47664 // pattern also needs to be a negation of the replacement pattern above.
47665 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47666 // sub accomplishes the negation of the replacement pattern.
47667 if (V == Y)
47668 std::swap(SubOp1, SubOp2);
47669
47670 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47671 return DAG.getBitcast(VT, Res);
47672}
47673
47674 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47675 const X86Subtarget &Subtarget) {
47676 using namespace SDPatternMatch;
47677 if (!Subtarget.hasAVX512())
47678 return SDValue();
47679
47680 ISD::CondCode CC;
47681 SDValue Cond, X, Y, LHS, RHS;
47684 m_CondCode(CC)))),
47685 m_Value(LHS), m_Value(RHS))))
47686 return SDValue();
47687
47688 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47689 !canCombineAsMaskOperation(RHS, Subtarget))
47690 return SDValue();
47691
47692 // Commute LHS and RHS to create opportunity to select mask instruction.
47693 // (vselect M, L, R) -> (vselect ~M, R, L)
47694 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47695 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47696 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47697}
47698
47699/// Do target-specific dag combines on SELECT and VSELECT nodes.
47700 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47701 TargetLowering::DAGCombinerInfo &DCI,
47702 const X86Subtarget &Subtarget) {
47703 SDLoc DL(N);
47704 SDValue Cond = N->getOperand(0);
47705 SDValue LHS = N->getOperand(1);
47706 SDValue RHS = N->getOperand(2);
47707
47708 // Try simplification again because we use this function to optimize
47709 // BLENDV nodes that are not handled by the generic combiner.
47710 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47711 return V;
47712
47713 // When AVX512 is available, the LHS operand of a select instruction can be
47714 // folded with a mask instruction, while the RHS operand can't. Commute the
47715 // LHS and RHS of the select instruction to create the opportunity for
47716 // folding.
47717 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47718 return V;
47719
47720 EVT VT = LHS.getValueType();
47721 EVT CondVT = Cond.getValueType();
47722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47723 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47724
47725 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47726 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47727 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47728 if (CondVT.isVector() && CondVT.isInteger() &&
47729 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47730 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47733 DL, DAG, Subtarget))
47734 return V;
47735
47736 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47737 SmallVector<int, 64> CondMask;
47738 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47739 N->getOpcode() == X86ISD::BLENDV)) {
47740 // Convert vselects with constant condition into shuffles.
47741 if (DCI.isBeforeLegalizeOps())
47742 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47743
47744 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47745 // by forcing the unselected elements to zero.
47746 // TODO: Can we handle more shuffles with this?
47747 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47748 SmallVector<SDValue, 1> LHSOps, RHSOps;
47749 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47752 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47753 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47754 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47755 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47756 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47757 assert(ByteMask.size() == LHSMask.size() &&
47758 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47759 for (auto [I, M] : enumerate(ByteMask)) {
47760 // getConstVector sets negative shuffle mask values as undef, so
47761 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47762 if (M < (int)ByteMask.size()) {
47763 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47764 RHSMask[I] = 0x80;
47765 } else {
47766 LHSMask[I] = 0x80;
47767 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47768 }
47769 }
47770 MVT ByteVT = LHSShuf.getSimpleValueType();
47771 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47772 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47773 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47774 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47775 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47776 }
47777 }
47778
47779 // Attempt to combine as shuffle.
47780 SDValue Op(N, 0);
47781 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47782 return Res;
47783 }
47784 }
47785
47786 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47787 // instructions match the semantics of the common C idiom x<y?x:y but not
47788 // x<=y?x:y, because of how they handle negative zero (which can be
47789 // ignored in unsafe-math mode).
47790 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
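// For example, with x = +0.0 and y = -0.0, x <= y ? x : y yields +0.0, but
// MINPS returns its second operand (here -0.0) when both inputs are zero, and
// likewise returns the second operand if either input is NaN.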
47791 if ((Cond.getOpcode() == ISD::SETCC ||
47792 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47793 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47794 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47795 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47796 (Subtarget.hasSSE2() ||
47797 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47798 bool IsStrict = Cond->isStrictFPOpcode();
47799 ISD::CondCode CC =
47800 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47801 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47802 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47803
47804 unsigned Opcode = 0;
47805 // Check for x CC y ? x : y.
47806 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47807 switch (CC) {
47808 default: break;
47809 case ISD::SETULT:
47810 // Converting this to a min would handle NaNs incorrectly, and swapping
47811 // the operands would cause it to handle comparisons between positive
47812 // and negative zero incorrectly.
47813 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47815 !(DAG.isKnownNeverZeroFloat(LHS) ||
47817 break;
47818 std::swap(LHS, RHS);
47819 }
47820 Opcode = X86ISD::FMIN;
47821 break;
47822 case ISD::SETOLE:
47823 // Converting this to a min would handle comparisons between positive
47824 // and negative zero incorrectly.
47827 break;
47828 Opcode = X86ISD::FMIN;
47829 break;
47830 case ISD::SETULE:
47831 // Converting this to a min would handle both negative zeros and NaNs
47832 // incorrectly, but we can swap the operands to fix both.
47833 std::swap(LHS, RHS);
47834 [[fallthrough]];
47835 case ISD::SETOLT:
47836 case ISD::SETLT:
47837 case ISD::SETLE:
47838 Opcode = X86ISD::FMIN;
47839 break;
47840
47841 case ISD::SETOGE:
47842 // Converting this to a max would handle comparisons between positive
47843 // and negative zero incorrectly.
47846 break;
47847 Opcode = X86ISD::FMAX;
47848 break;
47849 case ISD::SETUGT:
47850 // Converting this to a max would handle NaNs incorrectly, and swapping
47851 // the operands would cause it to handle comparisons between positive
47852 // and negative zero incorrectly.
47853 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47855 !(DAG.isKnownNeverZeroFloat(LHS) ||
47857 break;
47858 std::swap(LHS, RHS);
47859 }
47860 Opcode = X86ISD::FMAX;
47861 break;
47862 case ISD::SETUGE:
47863 // Converting this to a max would handle both negative zeros and NaNs
47864 // incorrectly, but we can swap the operands to fix both.
47865 std::swap(LHS, RHS);
47866 [[fallthrough]];
47867 case ISD::SETOGT:
47868 case ISD::SETGT:
47869 case ISD::SETGE:
47870 Opcode = X86ISD::FMAX;
47871 break;
47872 }
47873 // Check for x CC y ? y : x -- a min/max with reversed arms.
47874 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47875 switch (CC) {
47876 default: break;
47877 case ISD::SETOGE:
47878 // Converting this to a min would handle comparisons between positive
47879 // and negative zero incorrectly, and swapping the operands would
47880 // cause it to handle NaNs incorrectly.
47882 !(DAG.isKnownNeverZeroFloat(LHS) ||
47883 DAG.isKnownNeverZeroFloat(RHS))) {
47884 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47885 break;
47886 std::swap(LHS, RHS);
47887 }
47888 Opcode = X86ISD::FMIN;
47889 break;
47890 case ISD::SETUGT:
47891 // Converting this to a min would handle NaNs incorrectly.
47892 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47893 break;
47894 Opcode = X86ISD::FMIN;
47895 break;
47896 case ISD::SETUGE:
47897 // Converting this to a min would handle both negative zeros and NaNs
47898 // incorrectly, but we can swap the operands to fix both.
47899 std::swap(LHS, RHS);
47900 [[fallthrough]];
47901 case ISD::SETOGT:
47902 case ISD::SETGT:
47903 case ISD::SETGE:
47904 Opcode = X86ISD::FMIN;
47905 break;
47906
47907 case ISD::SETULT:
47908 // Converting this to a max would handle NaNs incorrectly.
47909 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47910 break;
47911 Opcode = X86ISD::FMAX;
47912 break;
47913 case ISD::SETOLE:
47914 // Converting this to a max would handle comparisons between positive
47915 // and negative zero incorrectly, and swapping the operands would
47916 // cause it to handle NaNs incorrectly.
47918 !DAG.isKnownNeverZeroFloat(LHS) &&
47919 !DAG.isKnownNeverZeroFloat(RHS)) {
47920 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47921 break;
47922 std::swap(LHS, RHS);
47923 }
47924 Opcode = X86ISD::FMAX;
47925 break;
47926 case ISD::SETULE:
47927 // Converting this to a max would handle both negative zeros and NaNs
47928 // incorrectly, but we can swap the operands to fix both.
47929 std::swap(LHS, RHS);
47930 [[fallthrough]];
47931 case ISD::SETOLT:
47932 case ISD::SETLT:
47933 case ISD::SETLE:
47934 Opcode = X86ISD::FMAX;
47935 break;
47936 }
47937 }
47938
47939 if (Opcode) {
47940 if (IsStrict) {
47941 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47943 DL, {N->getValueType(0), MVT::Other},
47944 {Cond.getOperand(0), LHS, RHS});
47945 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47946 return Ret;
47947 }
47948 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47949 }
47950 }
47951
47952 // Some mask scalar intrinsics rely on checking if only one bit is set
47953 // and implement it in C code like this:
47954 // A[0] = (U & 1) ? A[0] : W[0];
47955 // This creates some redundant instructions that break pattern matching.
47956 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
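// Routing the AND result straight into the select lets the masked scalar
// patterns key directly off the low bit of U, rather than going through a
// redundant SETCC/compare first.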
47957 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47958 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47959 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47960 SDValue AndNode = Cond.getOperand(0);
47961 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47962 isNullConstant(Cond.getOperand(1)) &&
47963 isOneConstant(AndNode.getOperand(1))) {
47964 // LHS and RHS are swapped because the
47965 // setcc outputs 1 when the AND result is 0 and vice versa.
47966 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47967 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47968 }
47969 }
47970
47971 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47972 // lowering on KNL. In this case we convert it to
47973 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
47974 // The same applies to all vectors of i8 and i16 elements without BWI.
47975 // Make sure we extend these even before type legalization gets a chance to
47976 // split wide vectors.
47977 // On SKX and later these selects have a proper lowering.
47978 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47979 CondVT.getVectorElementType() == MVT::i1 &&
47980 (VT.getVectorElementType() == MVT::i8 ||
47981 VT.getVectorElementType() == MVT::i16)) {
47982 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47983 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47984 }
47985
47986 // AVX512 - Extend select to merge with target shuffle.
47987 // select(mask, extract_subvector(shuffle(x)), y) -->
47988 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47989 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47990 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47991 CondVT.getVectorElementType() == MVT::i1) {
47992 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47993 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47994 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47995 isNullConstant(Op.getOperand(1)) &&
47996 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47997 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47998 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47999 ISD::isBuildVectorAllZeros(Alt.getNode()));
48000 };
48001
48002 bool SelectableLHS = SelectableOp(LHS, RHS);
48003 bool SelectableRHS = SelectableOp(RHS, LHS);
48004 if (SelectableLHS || SelectableRHS) {
48005 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48006 : RHS.getOperand(0).getValueType();
48007 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48008 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48009 VT.getSizeInBits());
48010 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48011 VT.getSizeInBits());
48012 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48013 DAG.getUNDEF(SrcCondVT), Cond,
48014 DAG.getVectorIdxConstant(0, DL));
48015 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48016 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48017 }
48018 }
48019
48020 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48021 return V;
48022
48023 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48024 Cond.hasOneUse()) {
48025 EVT CondVT = Cond.getValueType();
48026 SDValue Cond0 = Cond.getOperand(0);
48027 SDValue Cond1 = Cond.getOperand(1);
48028 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48029
48030 // Canonicalize min/max:
48031 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48032 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48033 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48034 // the need for an extra compare against zero. e.g.
48035 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48036 // subl %esi, %edi
48037 // testl %edi, %edi
48038 // movl $0, %eax
48039 // cmovgl %edi, %eax
48040 // =>
48041 // xorl %eax, %eax
48042 // subl %esi, %edi
48043 // cmovsl %eax, %edi
48044 //
48045 // We can also canonicalize
48046 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48047 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48048 // This allows the use of a test instruction for the compare.
48049 if (LHS == Cond0 && RHS == Cond1) {
48050 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48051 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48053 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48054 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48055 }
48056 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48057 ISD::CondCode NewCC = ISD::SETUGE;
48058 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48059 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48060 }
48061 }
48062
48063 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48064 // fold eq + gt/lt nested selects into ge/le selects
48065 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48066 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48067 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48068 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48069 // .. etc ..
48070 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48071 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48072 SDValue InnerSetCC = RHS.getOperand(0);
48073 ISD::CondCode InnerCC =
48074 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48075 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48076 Cond0 == InnerSetCC.getOperand(0) &&
48077 Cond1 == InnerSetCC.getOperand(1)) {
48078 ISD::CondCode NewCC;
48079 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48080 // clang-format off
48081 case ISD::SETGT: NewCC = ISD::SETGE; break;
48082 case ISD::SETLT: NewCC = ISD::SETLE; break;
48083 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48084 case ISD::SETULT: NewCC = ISD::SETULE; break;
48085 default: NewCC = ISD::SETCC_INVALID; break;
48086 // clang-format on
48087 }
48088 if (NewCC != ISD::SETCC_INVALID) {
48089 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48090 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48091 }
48092 }
48093 }
48094 }
48095
48096 // Check if the first operand is all zeros and Cond type is vXi1.
48097 // If this is an avx512 target we can improve the use of zero masking by
48098 // swapping the operands and inverting the condition.
48099 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48100 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48101 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48102 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48103 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48104 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48105 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48106 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48107 }
48108
48109 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48110 // get split by legalization.
48111 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48112 CondVT.getVectorElementType() == MVT::i1 &&
48113 TLI.isTypeLegal(VT.getScalarType())) {
48114 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48116 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48117 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48118 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48119 }
48120 }
48121
48122 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48123 // with out-of-bounds clamping.
48124
48125 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV have
48126 // defined behaviour for shift amounts that exceed the element bitwidth:
48127 // any lane whose unsigned shift amount is greater than bitwidth-1 is
48128 // written with zero, for both the variable left shift and the variable
48129 // logical right shift.
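// For example, for v8i32 the pattern
//   (vselect (setult amt, 32), (shl x, amt), 0)
// is exactly what VPSLLVD computes, since VPSLLVD already yields zero for any
// lane with amt >= 32.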
48130 if (N->getOpcode() == ISD::VSELECT) {
48131 using namespace llvm::SDPatternMatch;
48132 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48133 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48134 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48135 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48137 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48140 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48141 : X86ISD::VSHLV,
48142 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48143 }
48144 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48145 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48146 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48147 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48149 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48152 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48153 : X86ISD::VSHLV,
48154 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48155 }
48156 }
48157
48158 // Early exit: the remaining combines require a legal type that is not a soft f16.
48159 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48160 return SDValue();
48161
48162 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48163 return V;
48164
48165 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48166 return V;
48167
48168 // select(~Cond, X, Y) -> select(Cond, Y, X)
48169 if (CondVT.getScalarType() != MVT::i1) {
48170 if (SDValue CondNot = IsNOT(Cond, DAG))
48171 return DAG.getNode(N->getOpcode(), DL, VT,
48172 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48173
48174 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48175 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48176 Cond.getOperand(0).getOpcode() == ISD::AND &&
48177 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48178 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48179 Cond.getScalarValueSizeInBits(),
48180 /*AllowUndefs=*/true) &&
48181 Cond.hasOneUse()) {
48182 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48183 Cond.getOperand(0).getOperand(1));
48184 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48185 }
48186
48187 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48188 // signbit.
48189 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48190 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48191 Cond.hasOneUse()) {
48192 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48193 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48194 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48195 }
48196 }
48197
48198 // Try to optimize vXi1 selects if both operands are either all constants or
48199 // bitcasts from scalar integer type. In that case we can convert the operands
48200 // to integer and use an integer select which will be converted to a CMOV.
48201 // We need to take a little bit of care to avoid creating an i64 type after
48202 // type legalization.
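// For example, a v8i1 select whose operands are bitcast from i8 values can be
// rewritten as an i8 select, which (after promotion) lowers to a CMOV instead
// of mask-register arithmetic.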
48203 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48204 VT.getVectorElementType() == MVT::i1 &&
48205 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48207 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48208 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48209 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48210
48211 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48212 LHS.getOperand(0).getValueType() == IntVT)) &&
48213 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48214 RHS.getOperand(0).getValueType() == IntVT))) {
48215 if (LHSIsConst)
48217 else
48218 LHS = LHS.getOperand(0);
48219
48220 if (RHSIsConst)
48222 else
48223 RHS = RHS.getOperand(0);
48224
48225 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48226 return DAG.getBitcast(VT, Select);
48227 }
48228 }
48229 }
48230
48231 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48232 // single bits, then invert the predicate and swap the select operands.
48233 // This can lower using a vector shift bit-hack rather than mask and compare.
48234 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48235 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48236 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48237 Cond.getOperand(0).getOpcode() == ISD::AND &&
48238 isNullOrNullSplat(Cond.getOperand(1)) &&
48239 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48240 Cond.getOperand(0).getValueType() == VT) {
48241 // The 'and' mask must be composed of power-of-2 constants.
48242 SDValue And = Cond.getOperand(0);
48243 auto *C = isConstOrConstSplat(And.getOperand(1));
48244 if (C && C->getAPIntValue().isPowerOf2()) {
48245 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48246 SDValue NotCond =
48247 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48248 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48249 }
48250
48251 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48252 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48253 // 16-bit lacks a proper blendv.
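// For example, for v4i32 with mask <1,2,4,8>, shifting the lanes left by
// <31,30,29,28> moves each tested bit into the sign bit, so the select can
// key off a signed less-than-zero compare and a sign-bit blend.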
48254 unsigned EltBitWidth = VT.getScalarSizeInBits();
48255 bool CanShiftBlend =
48256 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48257 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48258 (Subtarget.hasXOP()));
48259 if (CanShiftBlend &&
48260 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48261 return C->getAPIntValue().isPowerOf2();
48262 })) {
48263 // Create a left-shift constant to get the mask bits over to the sign-bit.
48264 SDValue Mask = And.getOperand(1);
48265 SmallVector<int, 32> ShlVals;
48266 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48267 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48268 ShlVals.push_back(EltBitWidth - 1 -
48269 MaskVal->getAPIntValue().exactLogBase2());
48270 }
48271 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48272 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48273 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48274 SDValue NewCond =
48275 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48276 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48277 }
48278 }
48279
48280 return SDValue();
48281}
48282
48283/// Combine:
48284/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48285/// to:
48286/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48287/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48288/// Note that this is only legal for some op/cc combinations.
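/// For example, for 'if (atomic_fetch_add(&x, 1) < 0)' the flags produced by
/// the 'lock add' already determine whether the original value was negative
/// (old < 0 iff old + 1 <= 0 at the flags level, overflow included), so the
/// separate compare against zero can be dropped.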
48290 SelectionDAG &DAG,
48291 const X86Subtarget &Subtarget) {
48292 // This combine only operates on CMP-like nodes.
48293 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48294 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48295 return SDValue();
48296
48297 // Can't replace the cmp if it has more uses than the one we're looking at.
48298 // FIXME: We would like to be able to handle this, but would need to make sure
48299 // all uses were updated.
48300 if (!Cmp.hasOneUse())
48301 return SDValue();
48302
48303 // This only applies to variations of the common case:
48304 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48305 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48306 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48307 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48308 // By using the proper condcodes (see below), overflow is accounted for.
48309
48310 // FIXME: We can generalize both constraints:
48311 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48312 // - LHS != 1
48313 // if the result is compared.
48314
48315 SDValue CmpLHS = Cmp.getOperand(0);
48316 SDValue CmpRHS = Cmp.getOperand(1);
48317 EVT CmpVT = CmpLHS.getValueType();
48318
48319 if (!CmpLHS.hasOneUse())
48320 return SDValue();
48321
48322 unsigned Opc = CmpLHS.getOpcode();
48323 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48324 return SDValue();
48325
48326 SDValue OpRHS = CmpLHS.getOperand(2);
48327 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48328 if (!OpRHSC)
48329 return SDValue();
48330
48331 APInt Addend = OpRHSC->getAPIntValue();
48332 if (Opc == ISD::ATOMIC_LOAD_SUB)
48333 Addend = -Addend;
48334
48335 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48336 if (!CmpRHSC)
48337 return SDValue();
48338
48339 APInt Comparison = CmpRHSC->getAPIntValue();
48340 APInt NegAddend = -Addend;
48341
48342 // See if we can adjust the CC to make the comparison match the negated
48343 // addend.
48344 if (Comparison != NegAddend) {
48345 APInt IncComparison = Comparison + 1;
48346 if (IncComparison == NegAddend) {
48347 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48348 Comparison = IncComparison;
48349 CC = X86::COND_AE;
48350 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48351 Comparison = IncComparison;
48352 CC = X86::COND_L;
48353 }
48354 }
48355 APInt DecComparison = Comparison - 1;
48356 if (DecComparison == NegAddend) {
48357 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48358 Comparison = DecComparison;
48359 CC = X86::COND_A;
48360 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48361 Comparison = DecComparison;
48362 CC = X86::COND_LE;
48363 }
48364 }
48365 }
48366
48367 // If the addend is the negation of the comparison value, then we can do
48368 // a full comparison by emitting the atomic arithmetic as a locked sub.
48369 if (Comparison == NegAddend) {
48370 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48371 // atomic sub.
48372 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48373 auto AtomicSub = DAG.getAtomic(
48374 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48375 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48376 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48377 AN->getMemOperand());
48378 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48379 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48380 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48381 return LockOp;
48382 }
48383
48384 // We can handle comparisons with zero in a number of cases by manipulating
48385 // the CC used.
48386 if (!Comparison.isZero())
48387 return SDValue();
48388
48389 if (CC == X86::COND_S && Addend == 1)
48390 CC = X86::COND_LE;
48391 else if (CC == X86::COND_NS && Addend == 1)
48392 CC = X86::COND_G;
48393 else if (CC == X86::COND_G && Addend == -1)
48394 CC = X86::COND_GE;
48395 else if (CC == X86::COND_LE && Addend == -1)
48396 CC = X86::COND_L;
48397 else
48398 return SDValue();
48399
48400 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48401 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48402 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48403 return LockOp;
48404}
48405
48406// Check whether we're just testing the signbit, and whether we can simplify
48407// this by tracking where the signbit came from.
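// For example, (cmp (sra X, 31), 0) with COND_S only depends on the sign bit
// of X, so it can be replaced by testing X against 0x80000000 with COND_NE.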
48409 SelectionDAG &DAG) {
48410 if (CC != X86::COND_S && CC != X86::COND_NS)
48411 return SDValue();
48412
48413 if (!Cmp.hasOneUse())
48414 return SDValue();
48415
48416 SDValue Src;
48417 if (Cmp.getOpcode() == X86ISD::CMP) {
48418 // CMP(X,0) -> signbit test
48419 if (!isNullConstant(Cmp.getOperand(1)))
48420 return SDValue();
48421 Src = Cmp.getOperand(0);
48422 // Peek through a SRA node as we just need the signbit.
48423 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48424 // TODO: Use SimplifyDemandedBits instead of just SRA?
48425 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48426 return SDValue();
48427 Src = Src.getOperand(0);
48428 } else if (Cmp.getOpcode() == X86ISD::OR) {
48429 // OR(X,Y) -> see if only one operand contributes to the signbit.
48430 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48431 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48432 Src = Cmp.getOperand(1);
48433 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48434 Src = Cmp.getOperand(0);
48435 else
48436 return SDValue();
48437 } else {
48438 return SDValue();
48439 }
48440
48441 // Replace with a TEST on the MSB.
48442 SDLoc DL(Cmp);
48443 MVT SrcVT = Src.getSimpleValueType();
48444 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48445
48446 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48447 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48448 if (Src.getOpcode() == ISD::SHL) {
48449 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48450 Src = Src.getOperand(0);
48451 BitMask.lshrInPlace(*ShiftAmt);
48452 }
48453 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48454 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48455 Src = Src.getOperand(0);
48456 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48457 }
48458
48459 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48460 DAG.getConstant(BitMask, DL, SrcVT));
48461 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48462 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48463 DAG.getConstant(0, DL, SrcVT));
48464}
48465
48466// Check whether a boolean test is testing a boolean value generated by
48467// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48468// code.
48469//
48470// Simplify the following patterns:
48471// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48472// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48473// to (Op EFLAGS Cond)
48474//
48475// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48476// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48477// to (Op EFLAGS !Cond)
48478//
48479// where Op could be BRCOND or CMOV.
48480//
48482 // This combine only operates on CMP-like nodes.
48483 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48484 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48485 return SDValue();
48486
48487 // Quit if not used as a boolean value.
48488 if (CC != X86::COND_E && CC != X86::COND_NE)
48489 return SDValue();
48490
48491 // Check CMP operands. One of them should be 0 or 1 and the other should be
48492 // a SetCC or a value extended from it.
48493 SDValue Op1 = Cmp.getOperand(0);
48494 SDValue Op2 = Cmp.getOperand(1);
48495
48496 SDValue SetCC;
48497 const ConstantSDNode* C = nullptr;
48498 bool needOppositeCond = (CC == X86::COND_E);
48499 bool checkAgainstTrue = false; // Is it a comparison against 1?
48500
48501 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48502 SetCC = Op2;
48503 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48504 SetCC = Op1;
48505 else // Quit if neither operand is a constant.
48506 return SDValue();
48507
48508 if (C->getZExtValue() == 1) {
48509 needOppositeCond = !needOppositeCond;
48510 checkAgainstTrue = true;
48511 } else if (C->getZExtValue() != 0)
48512 // Quit if the constant is neither 0 nor 1.
48513 return SDValue();
48514
48515 bool truncatedToBoolWithAnd = false;
48516 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48517 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48518 SetCC.getOpcode() == ISD::TRUNCATE ||
48519 SetCC.getOpcode() == ISD::AND) {
48520 if (SetCC.getOpcode() == ISD::AND) {
48521 int OpIdx = -1;
48522 if (isOneConstant(SetCC.getOperand(0)))
48523 OpIdx = 1;
48524 if (isOneConstant(SetCC.getOperand(1)))
48525 OpIdx = 0;
48526 if (OpIdx < 0)
48527 break;
48528 SetCC = SetCC.getOperand(OpIdx);
48529 truncatedToBoolWithAnd = true;
48530 } else
48531 SetCC = SetCC.getOperand(0);
48532 }
48533
48534 switch (SetCC.getOpcode()) {
48536 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48537 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48538 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48539 // truncated to i1 using 'and'.
48540 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48541 break;
48543 "Invalid use of SETCC_CARRY!");
48544 [[fallthrough]];
48545 case X86ISD::SETCC:
48546 // Set the condition code or opposite one if necessary.
48547 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48548 if (needOppositeCond)
48550 return SetCC.getOperand(1);
48551 case X86ISD::CMOV: {
48552 // Check whether the false/true values are canonical, i.e. 0 or 1.
48555 // Quit if true value is not a constant.
48556 if (!TVal)
48557 return SDValue();
48558 // Quit if false value is not a constant.
48559 if (!FVal) {
48560 SDValue Op = SetCC.getOperand(0);
48561 // Skip 'zext' or 'trunc' node.
48562 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48563 Op.getOpcode() == ISD::TRUNCATE)
48564 Op = Op.getOperand(0);
48565 // A special case for rdrand/rdseed, where 0 is set if false cond is
48566 // found.
48567 if ((Op.getOpcode() != X86ISD::RDRAND &&
48568 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48569 return SDValue();
48570 }
48571 // Quit if false value is not the constant 0 or 1.
48572 bool FValIsFalse = true;
48573 if (FVal && FVal->getZExtValue() != 0) {
48574 if (FVal->getZExtValue() != 1)
48575 return SDValue();
48576 // If FVal is 1, opposite cond is needed.
48577 needOppositeCond = !needOppositeCond;
48578 FValIsFalse = false;
48579 }
48580 // Quit if TVal is not the constant opposite of FVal.
48581 if (FValIsFalse && TVal->getZExtValue() != 1)
48582 return SDValue();
48583 if (!FValIsFalse && TVal->getZExtValue() != 0)
48584 return SDValue();
48585 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48586 if (needOppositeCond)
48588 return SetCC.getOperand(3);
48589 }
48590 }
48591
48592 return SDValue();
48593}
48594
48595/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48596/// Match:
48597/// (X86or (X86setcc) (X86setcc))
48598/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48600 X86::CondCode &CC1, SDValue &Flags,
48601 bool &isAnd) {
48602 if (Cond->getOpcode() == X86ISD::CMP) {
48603 if (!isNullConstant(Cond->getOperand(1)))
48604 return false;
48605
48606 Cond = Cond->getOperand(0);
48607 }
48608
48609 isAnd = false;
48610
48611 SDValue SetCC0, SetCC1;
48612 switch (Cond->getOpcode()) {
48613 default: return false;
48614 case ISD::AND:
48615 case X86ISD::AND:
48616 isAnd = true;
48617 [[fallthrough]];
48618 case ISD::OR:
48619 case X86ISD::OR:
48620 SetCC0 = Cond->getOperand(0);
48621 SetCC1 = Cond->getOperand(1);
48622 break;
48623 };
48624
48625 // Make sure we have SETCC nodes, using the same flags value.
48626 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48627 SetCC1.getOpcode() != X86ISD::SETCC ||
48628 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48629 return false;
48630
48631 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48632 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48633 Flags = SetCC0->getOperand(1);
48634 return true;
48635}
48636
48637// When legalizing carry, we create carries via (add X, -1).
48638// If that X comes from an actual carry, via a setcc, we use the
48639// carry directly.
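// For example, with X in {0,1}, (add X, -1) produces CF == X (1 + -1 carries,
// 0 + -1 does not), so the boolean from the setcc is exactly the carry flag.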
48641 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48642 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48643 bool FoundAndLSB = false;
48644 SDValue Carry = EFLAGS.getOperand(0);
48645 while (Carry.getOpcode() == ISD::TRUNCATE ||
48646 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48647 (Carry.getOpcode() == ISD::AND &&
48648 isOneConstant(Carry.getOperand(1)))) {
48649 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48650 Carry = Carry.getOperand(0);
48651 }
48652 if (Carry.getOpcode() == X86ISD::SETCC ||
48653 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48654 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48655 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48656 SDValue CarryOp1 = Carry.getOperand(1);
48657 if (CarryCC == X86::COND_B)
48658 return CarryOp1;
48659 if (CarryCC == X86::COND_A) {
48660 // Try to convert COND_A into COND_B in an attempt to facilitate
48661 // materializing "setb reg".
48662 //
48663 // Do not flip "e > c", where "c" is a constant, because Cmp
48664 // instruction cannot take an immediate as its first operand.
48665 //
48666 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48667 CarryOp1.getNode()->hasOneUse() &&
48668 CarryOp1.getValueType().isInteger() &&
48669 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48670 SDValue SubCommute =
48671 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48672 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48673 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48674 }
48675 }
48676 // If this is a check of the z flag of an add with 1, switch to the
48677 // C flag.
48678 if (CarryCC == X86::COND_E &&
48679 CarryOp1.getOpcode() == X86ISD::ADD &&
48680 isOneConstant(CarryOp1.getOperand(1)))
48681 return CarryOp1;
48682 } else if (FoundAndLSB) {
48683 SDLoc DL(Carry);
48684 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48685 if (Carry.getOpcode() == ISD::SRL) {
48686 BitNo = Carry.getOperand(1);
48687 Carry = Carry.getOperand(0);
48688 }
48689 return getBT(Carry, BitNo, DL, DAG);
48690 }
48691 }
48692 }
48693
48694 return SDValue();
48695}
48696
48697/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
48698/// to avoid the inversion.
48700 SelectionDAG &DAG,
48701 const X86Subtarget &Subtarget) {
48702 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48703 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48704 EFLAGS.getOpcode() != X86ISD::TESTP)
48705 return SDValue();
48706
48707 // PTEST/TESTP sets EFLAGS as:
48708 // TESTZ: ZF = (Op0 & Op1) == 0
48709 // TESTC: CF = (~Op0 & Op1) == 0
48710 // TESTNZC: ZF == 0 && CF == 0
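// For example, (ptest X, X) sets ZF iff X is all zeros, so COND_E on it is an
// "all bits zero" test and COND_NE an "any bit set" test.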
48711 MVT VT = EFLAGS.getSimpleValueType();
48712 SDValue Op0 = EFLAGS.getOperand(0);
48713 SDValue Op1 = EFLAGS.getOperand(1);
48714 MVT OpVT = Op0.getSimpleValueType();
48715 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48716
48717 // TEST*(~X,Y) == TEST*(X,Y) (with the condition code adjusted below).
48718 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48719 X86::CondCode InvCC;
48720 switch (CC) {
48721 case X86::COND_B:
48722 // testc -> testz.
48723 InvCC = X86::COND_E;
48724 break;
48725 case X86::COND_AE:
48726 // !testc -> !testz.
48727 InvCC = X86::COND_NE;
48728 break;
48729 case X86::COND_E:
48730 // testz -> testc.
48731 InvCC = X86::COND_B;
48732 break;
48733 case X86::COND_NE:
48734 // !testz -> !testc.
48735 InvCC = X86::COND_AE;
48736 break;
48737 case X86::COND_A:
48738 case X86::COND_BE:
48739 // testnzc -> testnzc (no change).
48740 InvCC = CC;
48741 break;
48742 default:
48743 InvCC = X86::COND_INVALID;
48744 break;
48745 }
48746
48747 if (InvCC != X86::COND_INVALID) {
48748 CC = InvCC;
48749 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48750 DAG.getBitcast(OpVT, NotOp0), Op1);
48751 }
48752 }
48753
48754 if (CC == X86::COND_B || CC == X86::COND_AE) {
48755 // TESTC(X,~X) == TESTC(X,-1)
48756 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48757 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48758 SDLoc DL(EFLAGS);
48759 return DAG.getNode(
48760 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48761 DAG.getBitcast(OpVT,
48762 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48763 }
48764 }
48765 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48766 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48768 SDValue BC0 = peekThroughBitcasts(Op0);
48769 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48771 SDLoc DL(EFLAGS);
48772 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48773 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48774 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48775 }
48776 }
48777 }
48778
48779 if (CC == X86::COND_E || CC == X86::COND_NE) {
48780 // TESTZ(X,~Y) == TESTC(Y,X)
48781 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48782 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48783 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48784 DAG.getBitcast(OpVT, NotOp1), Op0);
48785 }
48786
48787 if (Op0 == Op1) {
48788 SDValue BC = peekThroughBitcasts(Op0);
48789 EVT BCVT = BC.getValueType();
48790
48791 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48792 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48793 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48794 DAG.getBitcast(OpVT, BC.getOperand(0)),
48795 DAG.getBitcast(OpVT, BC.getOperand(1)));
48796 }
48797
48798 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48799 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48800 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48801 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48802 DAG.getBitcast(OpVT, BC.getOperand(0)),
48803 DAG.getBitcast(OpVT, BC.getOperand(1)));
48804 }
48805
48806 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48807 // to more efficiently extract the sign bits and compare that.
48808 // TODO: Handle TESTC with comparison inversion.
48809 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48810 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48811 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48812 unsigned EltBits = BCVT.getScalarSizeInBits();
48813 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48814 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48815 APInt SignMask = APInt::getSignMask(EltBits);
48816 if (SDValue Res =
48817 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48818 // For vXi16 cases we need to use pmovmskb and extract every other
48819 // sign bit.
48820 SDLoc DL(EFLAGS);
48821 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48822 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48823 MVT FloatVT =
48824 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48825 Res = DAG.getBitcast(FloatVT, Res);
48826 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48827 } else if (EltBits == 16) {
48828 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48829 Res = DAG.getBitcast(MovmskVT, Res);
48830 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48831 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48832 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48833 } else {
48834 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48835 }
48836 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48837 DAG.getConstant(0, DL, MVT::i32));
48838 }
48839 }
48840 }
48841 }
48842
48843 // TESTZ(-1,X) == TESTZ(X,X)
48845 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48846
48847 // TESTZ(X,-1) == TESTZ(X,X)
48849 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48850
48851 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48852 // TODO: Add COND_NE handling?
48853 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48854 SDValue Src0 = peekThroughBitcasts(Op0);
48855 SDValue Src1 = peekThroughBitcasts(Op1);
48856 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48858 peekThroughBitcasts(Src0.getOperand(1)), true);
48860 peekThroughBitcasts(Src1.getOperand(1)), true);
48861 if (Src0 && Src1) {
48862 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48863 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48864 DAG.getBitcast(OpVT2, Src0),
48865 DAG.getBitcast(OpVT2, Src1));
48866 }
48867 }
48868 }
48869 }
48870
48871 return SDValue();
48872}
48873
48874// Attempt to simplify the MOVMSK input based on the comparison type.
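// For example, (movmsk V) == 0 / != 0 are "none of / any of the sign bits set"
// tests, and (movmsk V) == (1 << NumElts) - 1 is an "all sign bits set" test;
// both shapes are matched below and can often drop the scalar compare.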
48876 SelectionDAG &DAG,
48877 const X86Subtarget &Subtarget) {
48878 // Handle eq/ne against zero (any_of).
48879 // Handle eq/ne against -1 (all_of).
48880 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48881 return SDValue();
48882 if (EFLAGS.getValueType() != MVT::i32)
48883 return SDValue();
48884 unsigned CmpOpcode = EFLAGS.getOpcode();
48885 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48886 return SDValue();
48887 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48888 if (!CmpConstant)
48889 return SDValue();
48890 const APInt &CmpVal = CmpConstant->getAPIntValue();
48891
48892 SDValue CmpOp = EFLAGS.getOperand(0);
48893 unsigned CmpBits = CmpOp.getValueSizeInBits();
48894 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48895
48896 // Peek through any truncate.
48897 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48898 CmpOp = CmpOp.getOperand(0);
48899
48900 // Bail if we don't find a MOVMSK.
48901 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48902 return SDValue();
48903
48904 SDValue Vec = CmpOp.getOperand(0);
48905 MVT VecVT = Vec.getSimpleValueType();
48906 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48907 "Unexpected MOVMSK operand");
48908 unsigned NumElts = VecVT.getVectorNumElements();
48909 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48910
48911 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48912 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48913 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48914 if (!IsAnyOf && !IsAllOf)
48915 return SDValue();
48916
48917 // TODO: Check for more combining opportunities here.
48918 // We check the number of uses of the cmp to decide whether to combine or not.
48919 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
48920 // below rely on this one-use constraint.
48921 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48922
48923 // See if we can peek through to a vector with a wider element type, if the
48924 // signbits extend down to all the sub-elements as well.
48925 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48926 // potential SimplifyDemandedBits/Elts cases.
48927 // If we looked through a truncate that discarded bits, we can't do this
48928 // transform.
48929 // FIXME: We could do this transform for truncates that discarded bits by
48930 // inserting an AND mask between the new MOVMSK and the CMP.
48931 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48932 SDValue BC = peekThroughBitcasts(Vec);
48933 MVT BCVT = BC.getSimpleValueType();
48934 unsigned BCNumElts = BCVT.getVectorNumElements();
48935 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48936 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48937 BCNumEltBits > NumEltBits &&
48938 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48939 SDLoc DL(EFLAGS);
48940 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48941 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48942 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48943 DAG.getConstant(CmpMask, DL, MVT::i32));
48944 }
48945 }
48946
48947 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48948 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48949 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48950 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48951 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48953 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48954 Ops.size() == 2) {
48955 SDLoc DL(EFLAGS);
48956 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48957 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48958 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48959 DAG.getBitcast(SubVT, Ops[0]),
48960 DAG.getBitcast(SubVT, Ops[1]));
48961 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48962 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48963 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48964 DAG.getConstant(CmpMask, DL, MVT::i32));
48965 }
48966 }
48967
48968 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48969 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48970 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48971 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48972 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48973 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48974 SDValue BC = peekThroughBitcasts(Vec);
48975 // Ensure MOVMSK was testing every signbit of BC.
48976 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48977 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48978 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48979 BC.getOperand(0), BC.getOperand(1));
48980 V = DAG.getBitcast(TestVT, V);
48981 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48982 }
48983 // Check for 256-bit split vector cases.
48984 if (BC.getOpcode() == ISD::AND &&
48985 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48986 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48987 SDValue LHS = BC.getOperand(0);
48988 SDValue RHS = BC.getOperand(1);
48989 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48990 LHS.getOperand(0), LHS.getOperand(1));
48991 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48992 RHS.getOperand(0), RHS.getOperand(1));
48993 LHS = DAG.getBitcast(TestVT, LHS);
48994 RHS = DAG.getBitcast(TestVT, RHS);
48995 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48996 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48997 }
48998 }
48999 }
49000
49001 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49002 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49003 // sign bits prior to the comparison with zero unless we know that
49004 // the vXi16 splats the sign bit down to the lower i8 half.
49005 // TODO: Handle all_of patterns.
49006 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49007 SDValue VecOp0 = Vec.getOperand(0);
49008 SDValue VecOp1 = Vec.getOperand(1);
49009 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49010 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49011 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49012 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49013 SDLoc DL(EFLAGS);
49014 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49015 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49016 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49017 if (!SignExt0) {
49018 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49019 DAG.getConstant(0xAAAA, DL, MVT::i16));
49020 }
49021 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49022 DAG.getConstant(0, DL, MVT::i16));
49023 }
49024 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49025 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49026 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49027 (IsAnyOf || (SignExt0 && SignExt1))) {
49028 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49029 SDLoc DL(EFLAGS);
49030 SDValue Result = peekThroughBitcasts(Src);
49031 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49032 Result.getValueType().getVectorNumElements() <= NumElts) {
49033 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49034 Result.getOperand(0), Result.getOperand(1));
49035 V = DAG.getBitcast(MVT::v4i64, V);
49036 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49037 }
49038 Result = DAG.getBitcast(MVT::v32i8, Result);
49039 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49040 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49041 if (!SignExt0 || !SignExt1) {
49042 assert(IsAnyOf &&
49043 "Only perform v16i16 signmasks for any_of patterns");
49044 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49045 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49046 }
49047 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49048 DAG.getConstant(CmpMask, DL, MVT::i32));
49049 }
49050 }
49051 }
49052
49053 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49054 // Since we peek through a bitcast, we need to be careful if the base vector
49055 // type has smaller elements than the MOVMSK type. In that case, even if
49056 // all the elements are demanded by the shuffle mask, only the "high"
49057 // elements which have highbits that align with highbits in the MOVMSK vec
49058 // elements are actually demanded. A simplification of spurious operations
49059 // on the "low" elements takes place during other simplifications.
49060 //
49061 // For example:
49062 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49063 // demanded, the result can change because we are swapping the halves around.
49064 //
49065 // To address this, we check that we can scale the shuffle mask to MOVMSK
49066 // element width (this will ensure "high" elements match). It's slightly overly
49067 // conservative, but fine for an edge case fold.
49068 SmallVector<int, 32> ShuffleMask;
49069 SmallVector<SDValue, 2> ShuffleInputs;
49070 if (NumElts <= CmpBits &&
49071 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49072 ShuffleMask, DAG) &&
49073 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49074 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49075 canScaleShuffleElements(ShuffleMask, NumElts)) {
49076 SDLoc DL(EFLAGS);
49077 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49078 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49079 Result =
49080 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49081 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49082 }
49083
49084 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49085 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49086 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49087 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49088 // iff every element is referenced.
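// For example, for v4f32, (movmskps V) == 0 becomes (vtestps V, V), where ZF
// set means "no sign bits set", avoiding the transfer of the mask to a GPR.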
49089 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49090 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49091 (NumEltBits == 32 || NumEltBits == 64)) {
49092 SDLoc DL(EFLAGS);
49093 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49094 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49095 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49096 SDValue LHS = Vec;
49097 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49098 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49099 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49100 DAG.getBitcast(FloatVT, LHS),
49101 DAG.getBitcast(FloatVT, RHS));
49102 }
49103
49104 return SDValue();
49105}
49106
49107/// Optimize an EFLAGS definition used according to the condition code \p CC
49108/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49109/// uses of chain values.
49111 SelectionDAG &DAG,
49112 const X86Subtarget &Subtarget) {
49113 if (CC == X86::COND_B)
49114 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49115 return Flags;
49116
49117 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49118 return R;
49119
49120 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49121 return R;
49122
49123 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49124 return R;
49125
49126 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49127 return R;
49128
49129 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49130}
49131
49132/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49135 const X86Subtarget &Subtarget) {
49136 SDLoc DL(N);
49137 EVT VT = N->getValueType(0);
49138 SDValue FalseOp = N->getOperand(0);
49139 SDValue TrueOp = N->getOperand(1);
49140 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49141 SDValue Cond = N->getOperand(3);
49142
49143 // cmov X, X, ?, ? --> X
49144 if (TrueOp == FalseOp)
49145 return TrueOp;
49146
49147 // Try to simplify the EFLAGS and condition code operands.
49148 // We can't always do this as FCMOV only supports a subset of X86 cond.
49149 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49150 if (!(FalseOp.getValueType() == MVT::f80 ||
49151 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49152 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49153 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49154 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49155 Flags};
49156 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49157 }
49158 }
49159
49160 // If this is a select between two integer constants, try to do some
49161 // optimizations. Note that the operands are ordered the opposite of SELECT
49162 // operands.
49163 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49164 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49165 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49166 // larger than FalseC (the false value).
49167 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49168 CC = X86::GetOppositeBranchCondition(CC);
49169 std::swap(TrueC, FalseC);
49170 std::swap(TrueOp, FalseOp);
49171 }
49172
49173 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49174 // This is efficient for any integer data type (including i8/i16) and
49175 // shift amount.
49176 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49177 Cond = getSETCC(CC, Cond, DL, DAG);
49178
49179 // Zero extend the condition if needed.
49180 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49181
49182 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49183 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49184 DAG.getConstant(ShAmt, DL, MVT::i8));
49185 return Cond;
49186 }
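// For instance, "cond ? 8 : 0" above typically lowers to something like
// "setcc %al ; movzbl %al, %eax ; shll $3, %eax" (exact code depends on the
// condition and register allocation).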
49187
49188 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49189 // for any integer data type, including i8/i16.
49190 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49191 Cond = getSETCC(CC, Cond, DL, DAG);
49192
49193 // Zero extend the condition if needed.
49194 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49195 FalseC->getValueType(0), Cond);
49196 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49197 SDValue(FalseC, 0));
49198 return Cond;
49199 }
49200
49201 // Optimize cases that will turn into an LEA instruction. This requires
49202 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49203 if (VT == MVT::i32 || VT == MVT::i64) {
49204 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49205 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49206 "Implicit constant truncation");
49207
49208 bool isFastMultiplier = false;
49209 if (Diff.ult(10)) {
49210 switch (Diff.getZExtValue()) {
49211 default: break;
49212 case 1: // result = add base, cond
49213 case 2: // result = lea base( , cond*2)
49214 case 3: // result = lea base(cond, cond*2)
49215 case 4: // result = lea base( , cond*4)
49216 case 5: // result = lea base(cond, cond*4)
49217 case 8: // result = lea base( , cond*8)
49218 case 9: // result = lea base(cond, cond*8)
49219 isFastMultiplier = true;
49220 break;
49221 }
49222 }
49223
49224 if (isFastMultiplier) {
49225 Cond = getSETCC(CC, Cond, DL, DAG);
49226 // Zero extend the condition if needed.
49227 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49228 Cond);
49229 // Scale the condition by the difference.
49230 if (Diff != 1)
49231 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49232 DAG.getConstant(Diff, DL, Cond.getValueType()));
49233
49234 // Add the base if non-zero.
49235 if (FalseC->getAPIntValue() != 0)
49236 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49237 SDValue(FalseC, 0));
49238 return Cond;
49239 }
49240 }
49241 }
49242 }
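// Worked example for the LEA path above: "cond ? 13 : 4" gives Diff == 9, so
// we emit zext(setcc) * 9 + 4, which typically fits a single
// "lea 4(%reg,%reg,8)".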
49243
49244 // Handle these cases:
49245 // (select (x != c), e, c) -> (select (x != c), e, x),
49246 // (select (x == c), c, e) -> (select (x == c), x, e)
49247 // where c is an integer constant, and the "select" is the combination
49248 // of CMOV and CMP.
49249 //
49250 // The rationale for this change is that a conditional-move from a constant
49251 // needs two instructions, whereas a conditional-move from a register needs
49252 // only one instruction.
49253 //
49254 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49255 // some instruction-combining opportunities. This opt needs to be
49256 // postponed as late as possible.
49257 //
49258 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49259 // the DCI.xxxx conditions are provided to postpone the optimization as
49260 // late as possible.
49261
49262 ConstantSDNode *CmpAgainst = nullptr;
49263 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49264 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49265 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49266
49267 if (CC == X86::COND_NE &&
49268 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49269 CC = X86::COND_E;
49270 std::swap(TrueOp, FalseOp);
49271 }
49272
49273 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49274 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49275 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49276 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49277 }
49278 }
49279 }
49280
49281 // Transform:
49282 //
49283 // (cmov 1 T (uge T 2))
49284 //
49285 // to:
49286 //
49287 // (adc T 0 (sub T 1))
49288 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49289 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49290 SDValue Cond0 = Cond.getOperand(0);
49291 if (Cond0.getOpcode() == ISD::TRUNCATE)
49292 Cond0 = Cond0.getOperand(0);
49293 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49294 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49295 EVT CondVT = Cond->getValueType(0);
49296 // Subtract 1 and generate a carry.
49297 SDValue NewSub =
49298 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49299 DAG.getConstant(1, DL, CondVT));
49300 SDValue EFLAGS(NewSub.getNode(), 1);
49301 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49302 DAG.getConstant(0, DL, VT), EFLAGS);
49303 }
49304 }
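// Quick check of the fold above: for T == 0 the SUB borrows and sets CF, so
// the ADC yields 0 + 0 + 1 == 1; for T == 1 there is no carry and the result
// is 1; for T >= 2 the result is simply T, matching (uge T 2) ? T : 1.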
49305
49306 // Fold and/or of setcc's to double CMOV:
49307 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49308 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49309 //
49310 // This combine lets us generate:
49311 // cmovcc1 (jcc1 if we don't have CMOV)
49312 // cmovcc2 (same)
49313 // instead of:
49314 // setcc1
49315 // setcc2
49316 // and/or
49317 // cmovne (jne if we don't have CMOV)
49318 // When we can't use the CMOV instruction, it might increase branch
49319 // mispredicts.
49320 // When we can use CMOV, or when there is no mispredict, this improves
49321 // throughput and reduces register pressure.
49322 //
49323 if (CC == X86::COND_NE) {
49324 SDValue Flags;
49325 X86::CondCode CC0, CC1;
49326 bool isAndSetCC;
49327 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49328 if (isAndSetCC) {
49329 std::swap(FalseOp, TrueOp);
49330 CC0 = X86::GetOppositeBranchCondition(CC0);
49331 CC1 = X86::GetOppositeBranchCondition(CC1);
49332 }
49333
49334 SDValue LOps[] = {FalseOp, TrueOp,
49335 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49336 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49337 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49338 Flags};
49339 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49340 return CMOV;
49341 }
49342 }
49343
49344 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49345 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49346 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49347 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49348 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49349 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49350 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49351 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49352 SDValue Add = TrueOp;
49353 SDValue Const = FalseOp;
49354 // Canonicalize the condition code for easier matching and output.
49355 if (CC == X86::COND_E)
49356 std::swap(Add, Const);
49357
49358 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49359 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49360 Add.getResNo() == 0 && Add.hasOneUse() &&
49361 Add.getOperand(1) == Cond.getOperand(0)) {
49362 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49363 Add.getOperand(1));
49364 }
49365
49366 // We might have replaced the constant in the cmov with the LHS of the
49367 // compare. If so change it to the RHS of the compare.
49368 if (Const == Cond.getOperand(0))
49369 Const = Cond.getOperand(1);
49370
49371 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49372 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49373 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49374 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49375 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49376 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49377 // This should constant fold.
49378 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49379 SDValue CMov =
49380 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49381 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49382 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49383 }
49384 }
49385
49386 return SDValue();
49387}
49388
49389/// Different mul shrinking modes.
49390 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49391
49392 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49393 EVT VT = N->getOperand(0).getValueType();
49394 if (VT.getScalarSizeInBits() != 32)
49395 return false;
49396
49397 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49398 unsigned SignBits[2] = {1, 1};
49399 bool IsPositive[2] = {false, false};
49400 for (unsigned i = 0; i < 2; i++) {
49401 SDValue Opd = N->getOperand(i);
49402
49403 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49404 IsPositive[i] = DAG.SignBitIsZero(Opd);
49405 }
49406
49407 bool AllPositive = IsPositive[0] && IsPositive[1];
49408 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
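// For example, a 32-bit element known to lie in the signed 8-bit range has at
// least 32 - 8 + 1 == 25 sign bits, which is where the thresholds below come
// from (24 for unsigned 8-bit, 17/16 for the 16-bit modes).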
49409 // When ranges are from -128 ~ 127, use MULS8 mode.
49410 if (MinSignBits >= 25)
49411 Mode = ShrinkMode::MULS8;
49412 // When ranges are from 0 ~ 255, use MULU8 mode.
49413 else if (AllPositive && MinSignBits >= 24)
49414 Mode = ShrinkMode::MULU8;
49415 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49416 else if (MinSignBits >= 17)
49417 Mode = ShrinkMode::MULS16;
49418 // When ranges are from 0 ~ 65535, use MULU16 mode.
49419 else if (AllPositive && MinSignBits >= 16)
49420 Mode = ShrinkMode::MULU16;
49421 else
49422 return false;
49423 return true;
49424}
49425
49426/// When the operands of vector mul are extended from smaller size values,
49427/// like i8 and i16, the type of mul may be shrunk to generate more
49428/// efficient code. Two typical patterns are handled:
49429/// Pattern1:
49430/// %2 = sext/zext <N x i8> %1 to <N x i32>
49431/// %4 = sext/zext <N x i8> %3 to <N x i32>
49432/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49433/// %5 = mul <N x i32> %2, %4
49434///
49435/// Pattern2:
49436/// %2 = zext/sext <N x i16> %1 to <N x i32>
49437/// %4 = zext/sext <N x i16> %3 to <N x i32>
49438/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49439/// %5 = mul <N x i32> %2, %4
49440///
49441/// There are four mul shrinking modes:
49442/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49443/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49444/// generate pmullw+sext32 for it (MULS8 mode).
49445/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49446/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49447/// generate pmullw+zext32 for it (MULU8 mode).
49448/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49449/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49450/// generate pmullw+pmulhw for it (MULS16 mode).
49451/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49452/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49453/// generate pmullw+pmulhuw for it (MULU16 mode).
49454 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49455 const X86Subtarget &Subtarget) {
49456 // Check for legality
49457 // pmullw/pmulhw are not available without SSE2.
49458 if (!Subtarget.hasSSE2())
49459 return SDValue();
49460
49461 // Check for profitability
49462 // pmulld is supported since SSE41. It is better to use pmulld
49463 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49464 // the expansion.
49465 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49466 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49467 return SDValue();
49468
49469 ShrinkMode Mode;
49470 if (!canReduceVMulWidth(N, DAG, Mode))
49471 return SDValue();
49472
49473 SDValue N0 = N->getOperand(0);
49474 SDValue N1 = N->getOperand(1);
49475 EVT VT = N->getOperand(0).getValueType();
49476 unsigned NumElts = VT.getVectorNumElements();
49477 if ((NumElts % 2) != 0)
49478 return SDValue();
49479
49480 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49481
49482 // Shrink the operands of mul.
49483 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49484 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49485
49486 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49487 // lower part is needed.
49488 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49489 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49490 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49491 : ISD::SIGN_EXTEND,
49492 DL, VT, MulLo);
49493
49494 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49495 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49496 // the higher part is also needed.
49497 SDValue MulHi =
49498 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49499 ReducedVT, NewN0, NewN1);
49500
49501 // Repack the lower part and higher part result of mul into a wider
49502 // result.
49503 // Generate shuffle functioning as punpcklwd.
49504 SmallVector<int, 16> ShuffleMask(NumElts);
49505 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49506 ShuffleMask[2 * i] = i;
49507 ShuffleMask[2 * i + 1] = i + NumElts;
49508 }
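// For example, with NumElts == 8 this mask is {0,8,1,9,2,10,3,11}, i.e. the
// low halves of MulLo and MulHi interleaved exactly like punpcklwd.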
49509 SDValue ResLo =
49510 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49511 ResLo = DAG.getBitcast(ResVT, ResLo);
49512 // Generate shuffle functioning as punpckhwd.
49513 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49514 ShuffleMask[2 * i] = i + NumElts / 2;
49515 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49516 }
49517 SDValue ResHi =
49518 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49519 ResHi = DAG.getBitcast(ResVT, ResHi);
49520 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49521}
49522
49523 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49524 EVT VT, const SDLoc &DL) {
49525
49526 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49527 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49528 DAG.getConstant(Mult, DL, VT));
49529 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49530 DAG.getConstant(Shift, DL, MVT::i8));
49531 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49532 N->getOperand(0));
49533 return Result;
49534 };
49535
49536 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49537 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49538 DAG.getConstant(Mul1, DL, VT));
49539 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49540 DAG.getConstant(Mul2, DL, VT));
49541 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49542 N->getOperand(0));
49543 return Result;
49544 };
49545
49546 switch (MulAmt) {
49547 default:
49548 break;
49549 case 11:
49550 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49551 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49552 case 21:
49553 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49554 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49555 case 41:
49556 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49557 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49558 case 22:
49559 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49560 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49561 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49562 case 19:
49563 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49564 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49565 case 37:
49566 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49567 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49568 case 73:
49569 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49570 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49571 case 13:
49572 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49573 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49574 case 23:
49575 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49576 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49577 case 26:
49578 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49579 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49580 case 28:
49581 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49582 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49583 case 29:
49584 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49585 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49586 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49587 }
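// For example, "mul x, 11" above typically becomes two LEAs, roughly:
//   lea t, [x + 4*x] ; lea r, [x + 2*t]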
49588
49589 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49590 // followed by a single LEA.
49591 // First check if this is a sum of two powers of 2 because that's easy. Then
49592 // count the trailing zeros to find the smaller power.
49593 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49594 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49595 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49596 if (ScaleShift >= 1 && ScaleShift < 4) {
49597 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49598 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49599 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49600 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49601 DAG.getConstant(ScaleShift, DL, MVT::i8));
49602 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49603 }
49604 }
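// For example, MulAmt == 40 (32 + 8): ScaleShift == 3 and ShiftAmt == 5, so
// the result is (x << 5) + (x << 3).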
49605
49606 return SDValue();
49607}
49608
49609 // If the upper 17 bits of either element are zero and the upper bits of the
49610 // other element are all zero/sign bits, then we can use PMADDWD, which is
49611 // always at least as quick as PMULLD, except on KNL.
49612 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49613 SelectionDAG &DAG,
49614 const X86Subtarget &Subtarget) {
49615 if (!Subtarget.hasSSE2())
49616 return SDValue();
49617
49618 if (Subtarget.isPMADDWDSlow())
49619 return SDValue();
49620
49621 EVT VT = N->getValueType(0);
49622
49623 // Only support vXi32 vectors.
49624 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49625 return SDValue();
49626
49627 // Make sure the type is legal or can split/widen to a legal type.
49628 // With AVX512 but without BWI, we would need to split v32i16.
49629 unsigned NumElts = VT.getVectorNumElements();
49630 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49631 return SDValue();
49632
49633 // With AVX512 but without BWI, we would need to split v32i16.
49634 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49635 return SDValue();
49636
49637 SDValue N0 = N->getOperand(0);
49638 SDValue N1 = N->getOperand(1);
49639
49640 // If we are zero/sign extending two steps without SSE4.1, it's better to
49641 // reduce the vmul width instead.
49642 if (!Subtarget.hasSSE41() &&
49643 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49644 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49645 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49646 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49647 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49648 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49649 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49650 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49651 return SDValue();
49652
49653 // If we are sign extending a wide vector without SSE4.1, it's better to
49654 // reduce the vmul width instead.
49655 if (!Subtarget.hasSSE41() &&
49656 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49657 N0.getOperand(0).getValueSizeInBits() > 128) &&
49658 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49659 N1.getOperand(0).getValueSizeInBits() > 128))
49660 return SDValue();
49661
49662 // Sign bits must extend down to the lowest i16.
49663 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49664 DAG.ComputeMaxSignificantBits(N0) > 16)
49665 return SDValue();
49666
49667 // At least one of the elements must be zero in the upper 17 bits, or can be
49668 // safely made zero without altering the final result.
49669 auto GetZeroableOp = [&](SDValue Op) {
49670 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49671 if (DAG.MaskedValueIsZero(Op, Mask17))
49672 return Op;
49673 // Mask off upper 16-bits of sign-extended constants.
49674 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49675 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49676 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49677 SDValue Src = Op.getOperand(0);
49678 // Convert sext(vXi16) to zext(vXi16).
49679 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49680 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49681 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49682 // which will expand the extension.
49683 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49684 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49685 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49686 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49687 }
49688 }
49689 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49690 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49691 N->isOnlyUserOf(Op.getNode())) {
49692 SDValue Src = Op.getOperand(0);
49693 if (Src.getScalarValueSizeInBits() == 16)
49694 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49695 }
49696 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49697 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49698 N->isOnlyUserOf(Op.getNode())) {
49699 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49700 Op.getOperand(1));
49701 }
49702 return SDValue();
49703 };
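// Informal justification: vpmaddwd computes lo16*lo16 + hi16*hi16 per i32
// lane. Once one operand of a lane has its upper 17 bits cleared (hi16 == 0,
// lo16 non-negative) and the other operand's upper bits are just the sign
// extension of its low 16 bits, the single surviving 16x16 product equals the
// ordinary 32-bit multiply.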
49704 SDValue ZeroN0 = GetZeroableOp(N0);
49705 SDValue ZeroN1 = GetZeroableOp(N1);
49706 if (!ZeroN0 && !ZeroN1)
49707 return SDValue();
49708 N0 = ZeroN0 ? ZeroN0 : N0;
49709 N1 = ZeroN1 ? ZeroN1 : N1;
49710
49711 // Use SplitOpsAndApply to handle AVX splitting.
49712 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49713 ArrayRef<SDValue> Ops) {
49714 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49715 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49716 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49717 DAG.getBitcast(OpVT, Ops[0]),
49718 DAG.getBitcast(OpVT, Ops[1]));
49719 };
49720 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49721}
49722
49723 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49724 const X86Subtarget &Subtarget) {
49725 if (!Subtarget.hasSSE2())
49726 return SDValue();
49727
49728 EVT VT = N->getValueType(0);
49729
49730 // Only support vXi64 vectors.
49731 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49732 VT.getVectorNumElements() < 2 ||
49733 !isPowerOf2_32(VT.getVectorNumElements()))
49734 return SDValue();
49735
49736 SDValue N0 = N->getOperand(0);
49737 SDValue N1 = N->getOperand(1);
49738
49739 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49740 // 32 bits. We can lower with this if the sign bits stretch that far.
49741 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49742 DAG.ComputeNumSignBits(N1) > 32) {
49743 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49744 ArrayRef<SDValue> Ops) {
49745 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49746 };
49747 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49748 /*CheckBWI*/ false);
49749 }
49750
49751 // If the upper bits are zero we can use a single pmuludq.
49752 APInt Mask = APInt::getHighBitsSet(64, 32);
49753 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49754 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49755 ArrayRef<SDValue> Ops) {
49756 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49757 };
49758 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49759 /*CheckBWI*/ false);
49760 }
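// For example, a v2i64 multiply of two operands zero-extended from i32 maps to
// a single pmuludq, which multiplies the low 32 bits of each lane and produces
// the full 64-bit product.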
49761
49762 return SDValue();
49763}
49764
49765 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49766 TargetLowering::DAGCombinerInfo &DCI,
49767 const X86Subtarget &Subtarget) {
49768 EVT VT = N->getValueType(0);
49769 SDLoc DL(N);
49770
49771 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49772 return V;
49773
49774 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49775 return V;
49776
49777 if (DCI.isBeforeLegalize() && VT.isVector())
49778 return reduceVMULWidth(N, DL, DAG, Subtarget);
49779
49780 if (VT != MVT::i64 && VT != MVT::i32 &&
49781 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49782 return SDValue();
49783
49784 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49785 if (!Known1.isConstant())
49786 return SDValue();
49787
49788 const APInt &C = Known1.getConstant();
49789 if (C.isZero())
49790 return DAG.getConstant(0, DL, VT);
49791
49792 if (C.isAllOnes())
49793 return DAG.getNegative(N->getOperand(0), DL, VT);
49794
49795 if (isPowerOf2_64(C.getZExtValue()))
49796 return SDValue();
49797
49798 // Optimize a single multiply with constant into two operations in order to
49799 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49800 if (!MulConstantOptimization)
49801 return SDValue();
49802
49803 // An imul is usually smaller than the alternative sequence.
49804 if (DAG.getMachineFunction().getFunction().hasMinSize())
49805 return SDValue();
49806
49807 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49808 return SDValue();
49809
49810 int64_t SignMulAmt = C.getSExtValue();
49811 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49812 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49813
49814 SDValue NewMul = SDValue();
49815 if (VT == MVT::i64 || VT == MVT::i32) {
49816 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49817 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49818 DAG.getConstant(AbsMulAmt, DL, VT));
49819 if (SignMulAmt < 0)
49820 NewMul = DAG.getNegative(NewMul, DL, VT);
49821
49822 return NewMul;
49823 }
49824
49825 uint64_t MulAmt1 = 0;
49826 uint64_t MulAmt2 = 0;
49827 if ((AbsMulAmt % 9) == 0) {
49828 MulAmt1 = 9;
49829 MulAmt2 = AbsMulAmt / 9;
49830 } else if ((AbsMulAmt % 5) == 0) {
49831 MulAmt1 = 5;
49832 MulAmt2 = AbsMulAmt / 5;
49833 } else if ((AbsMulAmt % 3) == 0) {
49834 MulAmt1 = 3;
49835 MulAmt2 = AbsMulAmt / 3;
49836 }
49837
49838 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49839 if (MulAmt2 &&
49840 (isPowerOf2_64(MulAmt2) ||
49841 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49842
49843 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49844 N->user_begin()->getOpcode() == ISD::ADD))
49845 // If the second multiplier is a power of 2, issue it first. We want the
49846 // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
49847 // lone use is an add. Only do this for positive multiply amounts since the
49848 // negate would prevent it from being used as an address mode anyway.
49849 std::swap(MulAmt1, MulAmt2);
49850
49851 if (isPowerOf2_64(MulAmt1))
49852 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49853 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49854 else
49855 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49856 DAG.getConstant(MulAmt1, DL, VT));
49857
49858 if (isPowerOf2_64(MulAmt2))
49859 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49860 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49861 else
49862 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49863 DAG.getConstant(MulAmt2, DL, VT));
49864
49865 // Negate the result.
49866 if (SignMulAmt < 0)
49867 NewMul = DAG.getNegative(NewMul, DL, VT);
49868 } else if (!Subtarget.slowLEA())
49869 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49870 }
49871 if (!NewMul) {
49872 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49873 if (isPowerOf2_64(AbsMulAmt - 1)) {
49874 // (mul x, 2^N + 1) => (add (shl x, N), x)
49875 NewMul = DAG.getNode(
49876 ISD::ADD, DL, VT, N->getOperand(0),
49877 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49878 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49879 if (SignMulAmt < 0)
49880 NewMul = DAG.getNegative(NewMul, DL, VT);
49881 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49882 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49883 NewMul =
49884 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49885 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49886 // To negate, reverse the operands of the subtract.
49887 if (SignMulAmt < 0)
49888 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49889 else
49890 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49891 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49892 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49893 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49894 NewMul =
49895 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49896 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49897 NewMul = DAG.getNode(
49898 ISD::ADD, DL, VT, NewMul,
49899 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49900 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49901 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49902 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49903 NewMul =
49904 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49905 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49906 NewMul = DAG.getNode(
49907 ISD::SUB, DL, VT, NewMul,
49908 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49909 } else if (SignMulAmt >= 0 && VT.isVector() &&
49910 Subtarget.fastImmVectorShift()) {
49911 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49912 uint64_t ShiftAmt1;
49913 std::optional<unsigned> Opc;
49914 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49915 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49916 Opc = ISD::ADD;
49917 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49918 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49919 Opc = ISD::SUB;
49920 }
49921
49922 if (Opc) {
49923 SDValue Shift1 =
49924 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49925 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49926 SDValue Shift2 =
49927 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49928 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49929 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49930 }
49931 }
49932 }
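// For example, a scalar "mul x, 30" usually reaches the block above:
// AbsMulAmt + 2 == 32 is a power of 2, so it becomes (x << 5) - (x + x).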
49933
49934 return NewMul;
49935}
49936
49937// Try to form a MULHU or MULHS node by looking for
49938// (srl (mul ext, ext), 16)
49939// TODO: This is X86 specific because we want to be able to handle wide types
49940// before type legalization. But we can only do it if the vector will be
49941// legalized via widening/splitting. Type legalization can't handle promotion
49942// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49943// combiner.
49944 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49945 const SDLoc &DL,
49946 const X86Subtarget &Subtarget) {
49947 using namespace SDPatternMatch;
49948 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49949 "SRL or SRA node is required here!");
49950
49951 if (!Subtarget.hasSSE2())
49952 return SDValue();
49953
49954 // Input type should be at least vXi32.
49955 EVT VT = N->getValueType(0);
49956 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49957 return SDValue();
49958
49959 // The operation must be a multiply shifted right by 16.
49960 SDValue LHS, RHS;
49961 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49962 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49963 return SDValue();
49964
49965 unsigned ExtOpc = LHS.getOpcode();
49966 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49967 RHS.getOpcode() != ExtOpc)
49968 return SDValue();
49969
49970 // Peek through the extends.
49971 LHS = LHS.getOperand(0);
49972 RHS = RHS.getOperand(0);
49973
49974 // Ensure the input types match.
49975 EVT MulVT = LHS.getValueType();
49976 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49977 return SDValue();
49978
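// At this point the pattern is e.g. (srl (mul (zext vXi16 A), (zext vXi16 B)), 16);
// the high half of that product is exactly MULHU(A, B) (MULHS for sign
// extends), i.e. a single pmulhuw/pmulhw feeding the outer extend.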
49979 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49980 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49981
49982 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49983 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49984}
49985
49986 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49987 const X86Subtarget &Subtarget) {
49988 using namespace llvm::SDPatternMatch;
49989 SDValue N0 = N->getOperand(0);
49990 SDValue N1 = N->getOperand(1);
49991 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49992 EVT VT = N0.getValueType();
49993 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49994 SDLoc DL(N);
49995
49996 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49997 // with out-of-bounds clamping.
49998 if (N0.getOpcode() == ISD::VSELECT &&
49999 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50000 SDValue Cond = N0.getOperand(0);
50001 SDValue N00 = N0.getOperand(1);
50002 SDValue N01 = N0.getOperand(2);
50003 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50004 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50005 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50006 m_SpecificCondCode(ISD::SETULT)))) {
50007 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50008 }
50009 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50010 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50011 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50012 m_SpecificCondCode(ISD::SETUGE)))) {
50013 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50014 }
50015 }
50016
50017 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50018 // since the result of setcc_c is all zero's or all ones.
50019 if (VT.isInteger() && !VT.isVector() &&
50020 N1C && N0.getOpcode() == ISD::AND &&
50021 N0.getOperand(1).getOpcode() == ISD::Constant) {
50022 SDValue N00 = N0.getOperand(0);
50023 APInt Mask = N0.getConstantOperandAPInt(1);
50024 Mask <<= N1C->getAPIntValue();
50025 bool MaskOK = false;
50026 // We can handle cases concerning bit-widening nodes containing setcc_c if
50027 // we carefully interrogate the mask to make sure we are semantics
50028 // preserving.
50029 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50030 // of the underlying setcc_c operation if the setcc_c was zero extended.
50031 // Consider the following example:
50032 // zext(setcc_c) -> i32 0x0000FFFF
50033 // c1 -> i32 0x0000FFFF
50034 // c2 -> i32 0x00000001
50035 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50036 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50037 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50038 MaskOK = true;
50039 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50040 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50041 MaskOK = true;
50042 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50043 N00.getOpcode() == ISD::ANY_EXTEND) &&
50044 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50045 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50046 }
50047 if (MaskOK && Mask != 0)
50048 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50049 }
50050
50051 return SDValue();
50052}
50053
50054 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50055 const X86Subtarget &Subtarget) {
50056 using namespace llvm::SDPatternMatch;
50057 SDValue N0 = N->getOperand(0);
50058 SDValue N1 = N->getOperand(1);
50059 EVT VT = N0.getValueType();
50060 unsigned Size = VT.getSizeInBits();
50061 SDLoc DL(N);
50062
50063 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50064 return V;
50065
50066 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50067 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50068 SDValue ShrAmtVal;
50069 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50070 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50071 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50072 }
50073
50074 // fold (SRA (SHL X, ShlConst), SraConst)
50075 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50076 // or (sext_in_reg X)
50077 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50078 // depending on relation between SraConst and ShlConst.
50079 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50080 // us to do the sext_in_reg from the corresponding bit.
50081
50082 // sexts on X86 are MOVSX instructions. They have the same code size as the
50083 // SHIFTs above (only a shift by 1 has a smaller encoding).
50084 // However, the MOVs have two advantages over a SHIFT:
50085 // 1. MOVs can write to a register that differs from the source.
50086 // 2. MOVs accept memory operands.
50087
50088 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50089 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50090 N0.getOperand(1).getOpcode() != ISD::Constant)
50091 return SDValue();
50092
50093 SDValue N00 = N0.getOperand(0);
50094 SDValue N01 = N0.getOperand(1);
50095 APInt ShlConst = N01->getAsAPIntVal();
50096 APInt SraConst = N1->getAsAPIntVal();
50097 EVT CVT = N1.getValueType();
50098
50099 if (CVT != N01.getValueType())
50100 return SDValue();
50101 if (SraConst.isNegative())
50102 return SDValue();
50103
50104 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50105 unsigned ShiftSize = SVT.getSizeInBits();
50106 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50107 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50108 continue;
50109 SDValue NN =
50110 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50111 if (SraConst.eq(ShlConst))
50112 return NN;
50113 if (SraConst.ult(ShlConst))
50114 return DAG.getNode(ISD::SHL, DL, VT, NN,
50115 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50116 return DAG.getNode(ISD::SRA, DL, VT, NN,
50117 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50118 }
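// Worked example: for i32, (sra (shl X, 24), 26) matches SVT == i8
// (Size - ShlConst == 8) and SraConst > ShlConst, so it becomes
// (sra (sext_in_reg X, i8), 2).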
50119 return SDValue();
50120}
50121
50122 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50123 TargetLowering::DAGCombinerInfo &DCI,
50124 const X86Subtarget &Subtarget) {
50125 using namespace llvm::SDPatternMatch;
50126 SDValue N0 = N->getOperand(0);
50127 SDValue N1 = N->getOperand(1);
50128 EVT VT = N0.getValueType();
50129 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50130 SDLoc DL(N);
50131
50132 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50133 return V;
50134
50135 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50136 // with out-of-bounds clamping.
50137 if (N0.getOpcode() == ISD::VSELECT &&
50138 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50139 SDValue Cond = N0.getOperand(0);
50140 SDValue N00 = N0.getOperand(1);
50141 SDValue N01 = N0.getOperand(2);
50142 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50143 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50144 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50145 m_SpecificCondCode(ISD::SETULT)))) {
50146 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50147 }
50148 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50149 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50150 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50151 m_SpecificCondCode(ISD::SETUGE)))) {
50152 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50153 }
50154 }
50155
50156 // Only do this on the last DAG combine as it can interfere with other
50157 // combines.
50158 if (!DCI.isAfterLegalizeDAG())
50159 return SDValue();
50160
50161 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50162 // TODO: This is a generic DAG combine that became an x86-only combine to
50163 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50164 // and-not ('andn').
50165 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50166 return SDValue();
50167
50168 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50169 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50170 if (!ShiftC || !AndC)
50171 return SDValue();
50172
50173 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50174 // transform should reduce code size. It may also enable secondary transforms
50175 // from improved known-bits analysis or instruction selection.
50176 APInt MaskVal = AndC->getAPIntValue();
50177
50178 // If this can be matched by a zero extend, don't optimize.
50179 if (MaskVal.isMask()) {
50180 unsigned TO = MaskVal.countr_one();
50181 if (TO >= 8 && isPowerOf2_32(TO))
50182 return SDValue();
50183 }
50184
50185 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50186 unsigned OldMaskSize = MaskVal.getSignificantBits();
50187 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50188 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50189 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50190 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50191 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50192 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50193 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50194 }
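// For example, (srl (and X, 0xFF000000), 24) becomes (and (srl X, 24), 0xFF),
// letting the mask use a shorter immediate encoding.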
50195 return SDValue();
50196}
50197
50198 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50199 const X86Subtarget &Subtarget) {
50200 unsigned Opcode = N->getOpcode();
50201 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50202
50203 SDLoc DL(N);
50204 EVT VT = N->getValueType(0);
50205 SDValue N0 = N->getOperand(0);
50206 SDValue N1 = N->getOperand(1);
50207 EVT SrcVT = N0.getValueType();
50208
50209 SDValue BC0 =
50210 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50211 SDValue BC1 =
50212 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50213
50214 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50215 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50216 // truncation trees that help us avoid lane crossing shuffles.
50217 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50218 // TODO: We don't handle vXf64 shuffles yet.
50219 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50220 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50221 SmallVector<SDValue> ShuffleOps;
50222 SmallVector<int> ShuffleMask, ScaledMask;
50223 SDValue Vec = peekThroughBitcasts(BCSrc);
50224 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50226 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50227 // shuffle to a v4X64 width - we can probably relax this in the future.
50228 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50229 ShuffleOps[0].getValueType().is256BitVector() &&
50230 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50231 SDValue Lo, Hi;
50232 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50233 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50234 Lo = DAG.getBitcast(SrcVT, Lo);
50235 Hi = DAG.getBitcast(SrcVT, Hi);
50236 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50237 Res = DAG.getBitcast(ShufVT, Res);
50238 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50239 return DAG.getBitcast(VT, Res);
50240 }
50241 }
50242 }
50243 }
50244
50245 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50246 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50247 // If either/both ops are a shuffle that can scale to v2x64,
50248 // then see if we can perform this as a v4x32 post shuffle.
50249 SmallVector<SDValue> Ops0, Ops1;
50250 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50251 bool IsShuf0 =
50252 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50253 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50254 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50255 bool IsShuf1 =
50256 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50257 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50258 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50259 if (IsShuf0 || IsShuf1) {
50260 if (!IsShuf0) {
50261 Ops0.assign({BC0});
50262 ScaledMask0.assign({0, 1});
50263 }
50264 if (!IsShuf1) {
50265 Ops1.assign({BC1});
50266 ScaledMask1.assign({0, 1});
50267 }
50268
50269 SDValue LHS, RHS;
50270 int PostShuffle[4] = {-1, -1, -1, -1};
50271 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50272 if (M < 0)
50273 return true;
50274 Idx = M % 2;
50275 SDValue Src = Ops[M / 2];
50276 if (!LHS || LHS == Src) {
50277 LHS = Src;
50278 return true;
50279 }
50280 if (!RHS || RHS == Src) {
50281 Idx += 2;
50282 RHS = Src;
50283 return true;
50284 }
50285 return false;
50286 };
50287 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50288 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50289 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50290 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50291 LHS = DAG.getBitcast(SrcVT, LHS);
50292 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50293 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50294 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50295 Res = DAG.getBitcast(ShufVT, Res);
50296 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50297 return DAG.getBitcast(VT, Res);
50298 }
50299 }
50300 }
50301
50302 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50303 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50304 SmallVector<int> Mask0, Mask1;
50305 SmallVector<SDValue> Ops0, Ops1;
50306 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50307 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50308 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50309 !Ops0.empty() && !Ops1.empty() &&
50310 all_of(Ops0,
50311 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50312 all_of(Ops1,
50313 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50314 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50315 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50316 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50317 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50318 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50319 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50320 if ((Op00 == Op11) && (Op01 == Op10)) {
50321 std::swap(Op10, Op11);
50322 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50323 }
50324 if ((Op00 == Op10) && (Op01 == Op11)) {
50325 const int Map[4] = {0, 2, 1, 3};
50326 SmallVector<int, 4> ShuffleMask(
50327 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50328 Map[ScaledMask1[1]]});
50329 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50330 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50331 DAG.getBitcast(SrcVT, Op01));
50332 Res = DAG.getBitcast(ShufVT, Res);
50333 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50334 return DAG.getBitcast(VT, Res);
50335 }
50336 }
50337 }
50338
50339 return SDValue();
50340}
50341
50342 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50343 TargetLowering::DAGCombinerInfo &DCI,
50344 const X86Subtarget &Subtarget) {
50345 unsigned Opcode = N->getOpcode();
50346 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50347 "Unexpected pack opcode");
50348
50349 EVT VT = N->getValueType(0);
50350 SDValue N0 = N->getOperand(0);
50351 SDValue N1 = N->getOperand(1);
50352 unsigned NumDstElts = VT.getVectorNumElements();
50353 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50354 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50355 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50356 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50357 "Unexpected PACKSS/PACKUS input type");
50358
50359 bool IsSigned = (X86ISD::PACKSS == Opcode);
50360
50361 // Constant Folding.
50362 APInt UndefElts0, UndefElts1;
50363 SmallVector<APInt, 32> EltBits0, EltBits1;
50364 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50365 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50366 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50367 /*AllowWholeUndefs*/ true,
50368 /*AllowPartialUndefs*/ true) &&
50369 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50370 /*AllowWholeUndefs*/ true,
50371 /*AllowPartialUndefs*/ true)) {
50372 unsigned NumLanes = VT.getSizeInBits() / 128;
50373 unsigned NumSrcElts = NumDstElts / 2;
50374 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50375 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50376
50377 APInt Undefs(NumDstElts, 0);
50378 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50379 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50380 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50381 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50382 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50383 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50384
50385 if (UndefElts[SrcIdx]) {
50386 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50387 continue;
50388 }
50389
50390 APInt &Val = EltBits[SrcIdx];
50391 if (IsSigned) {
50392 // PACKSS: Truncate signed value with signed saturation.
50393 // Source values less than dst minint are saturated to minint.
50394 // Source values greater than dst maxint are saturated to maxint.
50395 Val = Val.truncSSat(DstBitsPerElt);
50396 } else {
50397 // PACKUS: Truncate signed value with unsigned saturation.
50398 // Source values less than zero are saturated to zero.
50399 // Source values greater than dst maxuint are saturated to maxuint.
50400 // NOTE: This is different from APInt::truncUSat.
50401 if (Val.isIntN(DstBitsPerElt))
50402 Val = Val.trunc(DstBitsPerElt);
50403 else if (Val.isNegative())
50404 Val = APInt::getZero(DstBitsPerElt);
50405 else
50406 Val = APInt::getAllOnes(DstBitsPerElt);
50407 }
50408 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50409 }
50410 }
50411
50412 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50413 }
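// For example, packing the 16-bit values 300 and -300 down to 8 bits yields
// 127 and -128 with PACKSS, but 255 and 0 with PACKUS.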
50414
50415 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50416 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50417 return V;
50418
50419 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50420 // Currently limit this to allsignbits cases only.
50421 if (IsSigned &&
50422 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50423 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50424 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50425 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50426 if (Not0 && Not1) {
50427 SDLoc DL(N);
50428 MVT SrcVT = N0.getSimpleValueType();
50429 SDValue Pack =
50430 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50431 DAG.getBitcast(SrcVT, Not1));
50432 return DAG.getNOT(DL, Pack, VT);
50433 }
50434 }
50435
50436 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50437 // truncate to create a larger truncate.
50438 if (Subtarget.hasAVX512() &&
50439 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50440 N0.getOperand(0).getValueType() == MVT::v8i32) {
50441 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50442 (!IsSigned &&
50443 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50444 if (Subtarget.hasVLX())
50445 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50446
50447 // Widen input to v16i32 so we can truncate that.
50448 SDLoc dl(N);
50449 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50450 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50451 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50452 }
50453 }
50454
50455 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50456 if (VT.is128BitVector()) {
50457 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50458 SDValue Src0, Src1;
50459 if (N0.getOpcode() == ExtOpc &&
50460 N0.getOperand(0).getValueType().is64BitVector() &&
50461 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50462 Src0 = N0.getOperand(0);
50463 }
50464 if (N1.getOpcode() == ExtOpc &&
50465 N1.getOperand(0).getValueType().is64BitVector() &&
50466 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50467 Src1 = N1.getOperand(0);
50468 }
50469 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50470 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50471 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50472 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50473 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50474 }
50475
50476 // Try again with pack(*_extend_vector_inreg, undef).
50477 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50478 : ISD::ZERO_EXTEND_VECTOR_INREG;
50479 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50480 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50481 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50482 DAG);
50483 }
50484
50485 // Attempt to combine as shuffle.
50486 SDValue Op(N, 0);
50487 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50488 return Res;
50489
50490 return SDValue();
50491}
50492
50493 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50494 TargetLowering::DAGCombinerInfo &DCI,
50495 const X86Subtarget &Subtarget) {
50496 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50497 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50498 "Unexpected horizontal add/sub opcode");
50499
50500 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50501 MVT VT = N->getSimpleValueType(0);
50502 SDValue LHS = N->getOperand(0);
50503 SDValue RHS = N->getOperand(1);
50504
50505 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50506 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50507 LHS.getOpcode() == RHS.getOpcode() &&
50508 LHS.getValueType() == RHS.getValueType() &&
50509 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50510 SDValue LHS0 = LHS.getOperand(0);
50511 SDValue LHS1 = LHS.getOperand(1);
50512 SDValue RHS0 = RHS.getOperand(0);
50513 SDValue RHS1 = RHS.getOperand(1);
50514 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50515 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50516 SDLoc DL(N);
50517 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50518 LHS0.isUndef() ? LHS1 : LHS0,
50519 RHS0.isUndef() ? RHS1 : RHS0);
50520 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50521 Res = DAG.getBitcast(ShufVT, Res);
50522 SDValue NewLHS =
50523 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50524 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50525 SDValue NewRHS =
50526 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50527 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50528 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50529 DAG.getBitcast(VT, NewRHS));
50530 }
50531 }
50532 }
50533
50534 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50535 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50536 return V;
50537
50538 return SDValue();
50539}
50540
50541 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50542 TargetLowering::DAGCombinerInfo &DCI,
50543 const X86Subtarget &Subtarget) {
50544 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50545 X86ISD::VSRL == N->getOpcode()) &&
50546 "Unexpected shift opcode");
50547 EVT VT = N->getValueType(0);
50548 SDValue N0 = N->getOperand(0);
50549 SDValue N1 = N->getOperand(1);
50550
50551 // Shift zero -> zero.
50552 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50553 return DAG.getConstant(0, SDLoc(N), VT);
50554
50555 // Detect constant shift amounts.
50556 APInt UndefElts;
50557 SmallVector<APInt, 32> EltBits;
50558 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50559 /*AllowWholeUndefs*/ true,
50560 /*AllowPartialUndefs*/ false)) {
50561 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50562 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50563 EltBits[0].getZExtValue(), DAG);
50564 }
50565
50566 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50567 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50568 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50569 return SDValue(N, 0);
50570
50571 return SDValue();
50572}
50573
50574 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50575 TargetLowering::DAGCombinerInfo &DCI,
50576 const X86Subtarget &Subtarget) {
50577 unsigned Opcode = N->getOpcode();
50578 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50579 X86ISD::VSRLI == Opcode) &&
50580 "Unexpected shift opcode");
50581 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50582 EVT VT = N->getValueType(0);
50583 SDValue N0 = N->getOperand(0);
50584 SDValue N1 = N->getOperand(1);
50585 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50586 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50587 "Unexpected value type");
50588 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50589
50590 // (shift undef, X) -> 0
50591 if (N0.isUndef())
50592 return DAG.getConstant(0, SDLoc(N), VT);
50593
50594 // Out of range logical bit shifts are guaranteed to be zero.
50595 // Out of range arithmetic bit shifts splat the sign bit.
50596 unsigned ShiftVal = N->getConstantOperandVal(1);
50597 if (ShiftVal >= NumBitsPerElt) {
50598 if (LogicalShift)
50599 return DAG.getConstant(0, SDLoc(N), VT);
50600 ShiftVal = NumBitsPerElt - 1;
50601 }
50602
50603 // (shift X, 0) -> X
50604 if (!ShiftVal)
50605 return N0;
50606
50607 // (shift 0, C) -> 0
50608 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50609 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50610 // result are all zeros, not undef.
50611 return DAG.getConstant(0, SDLoc(N), VT);
50612
50613 // (VSRAI -1, C) -> -1
50614 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50615 // N0 is all ones or undef. We guarantee that the bits shifted into the
50616 // result are all ones, not undef.
50617 return DAG.getAllOnesConstant(SDLoc(N), VT);
50618
50619 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50620 unsigned NewShiftVal = Amt0 + Amt1;
50621 if (NewShiftVal >= NumBitsPerElt) {
50622 // Out of range logical bit shifts are guaranteed to be zero.
50623 // Out of range arithmetic bit shifts splat the sign bit.
50624 if (LogicalShift)
50625 return DAG.getConstant(0, SDLoc(N), VT);
50626 NewShiftVal = NumBitsPerElt - 1;
50627 }
50628 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50629 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50630 };
50631
50632 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50633 if (Opcode == N0.getOpcode())
50634 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50635
50636 // (shl (add X, X), C) -> (shl X, (C + 1))
50637 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50638 N0.getOperand(0) == N0.getOperand(1))
50639 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
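  // Worked example of the merges above (illustrative), for v8i16:
  //   (VSRLI (VSRLI X, 3), 5) -> (VSRLI X, 8)
  //   (VSRLI (VSRLI X, 9), 8) -> zero vector    (9 + 8 >= 16, logical shift)
  //   (VSRAI (VSRAI X, 9), 8) -> (VSRAI X, 15)  (arithmetic shift clamps)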
50640
50641 // We can decode 'whole byte' logical bit shifts as shuffles.
50642 if (LogicalShift && (ShiftVal % 8) == 0) {
50643 SDValue Op(N, 0);
50644 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50645 return Res;
50646 }
50647
50648 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50649 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50650 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50651 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50652 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50653 N0.getOpcode() == X86ISD::PSHUFD &&
50654 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50655 N0->hasOneUse()) {
50656     SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50657 if (BC.getOpcode() == X86ISD::VSHLI &&
50658 BC.getScalarValueSizeInBits() == 64 &&
50659 BC.getConstantOperandVal(1) == 63) {
50660 SDLoc DL(N);
50661 SDValue Src = BC.getOperand(0);
50662 Src = DAG.getBitcast(VT, Src);
50663 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50664 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50665 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50666 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50667 return Src;
50668 }
50669 }
50670
50671 auto TryConstantFold = [&](SDValue V) {
50672 APInt UndefElts;
50673 SmallVector<APInt, 32> EltBits;
50674 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50675 /*AllowWholeUndefs*/ true,
50676 /*AllowPartialUndefs*/ true))
50677 return SDValue();
50678 assert(EltBits.size() == VT.getVectorNumElements() &&
50679 "Unexpected shift value type");
50680 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50681 // created an undef input due to no input bits being demanded, but user
50682 // still expects 0 in other bits.
50683 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50684 APInt &Elt = EltBits[i];
50685 if (UndefElts[i])
50686 Elt = 0;
50687 else if (X86ISD::VSHLI == Opcode)
50688 Elt <<= ShiftVal;
50689 else if (X86ISD::VSRAI == Opcode)
50690 Elt.ashrInPlace(ShiftVal);
50691 else
50692 Elt.lshrInPlace(ShiftVal);
50693 }
50694 // Reset undef elements since they were zeroed above.
50695 UndefElts = 0;
50696 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50697 };
50698
50699 // Constant Folding.
50700 if (N->isOnlyUserOf(N0.getNode())) {
50701 if (SDValue C = TryConstantFold(N0))
50702 return C;
50703
50704 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50705 // Don't break NOT patterns.
50706     SDValue BC = peekThroughOneUseBitcasts(N0);
50707 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50708 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50709         !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50710 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50711 SDLoc DL(N);
50712 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50713 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50714 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50715 }
50716 }
50717 }
50718
50719 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50720 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50721 DCI))
50722 return SDValue(N, 0);
50723
50724 return SDValue();
50725}
50726
50727 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50728                                    TargetLowering::DAGCombinerInfo &DCI,
50729 const X86Subtarget &Subtarget) {
50730 EVT VT = N->getValueType(0);
50731 unsigned Opcode = N->getOpcode();
50732 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50733 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50734 Opcode == ISD::INSERT_VECTOR_ELT) &&
50735 "Unexpected vector insertion");
50736
50737 SDValue Vec = N->getOperand(0);
50738 SDValue Scl = N->getOperand(1);
50739 SDValue Idx = N->getOperand(2);
50740
50741 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50742 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50743 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50744
50745 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50746 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50747 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50748 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50749 APInt::getAllOnes(NumBitsPerElt), DCI))
50750 return SDValue(N, 0);
50751 }
50752
50753 // Attempt to combine insertion patterns to a shuffle.
50754 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50755 SDValue Op(N, 0);
50756 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50757 return Res;
50758 }
50759
50760 return SDValue();
50761}
50762
50763/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50764/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50765/// OR -> CMPNEQSS.
50766 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50767                                    TargetLowering::DAGCombinerInfo &DCI,
50768 const X86Subtarget &Subtarget) {
50769 unsigned opcode;
50770
50771 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50772 // we're requiring SSE2 for both.
50773 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50774 SDValue N0 = N->getOperand(0);
50775 SDValue N1 = N->getOperand(1);
50776 SDValue CMP0 = N0.getOperand(1);
50777 SDValue CMP1 = N1.getOperand(1);
50778 SDLoc DL(N);
50779
50780 // The SETCCs should both refer to the same CMP.
50781 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50782 return SDValue();
50783
50784 SDValue CMP00 = CMP0->getOperand(0);
50785 SDValue CMP01 = CMP0->getOperand(1);
50786 EVT VT = CMP00.getValueType();
50787
50788 if (VT == MVT::f32 || VT == MVT::f64 ||
50789 (VT == MVT::f16 && Subtarget.hasFP16())) {
50790 bool ExpectingFlags = false;
50791 // Check for any users that want flags:
50792 for (const SDNode *U : N->users()) {
50793 if (ExpectingFlags)
50794 break;
50795
50796 switch (U->getOpcode()) {
50797 default:
50798 case ISD::BR_CC:
50799 case ISD::BRCOND:
50800 case ISD::SELECT:
50801 ExpectingFlags = true;
50802 break;
50803 case ISD::CopyToReg:
50804 case ISD::SIGN_EXTEND:
50805 case ISD::ZERO_EXTEND:
50806 case ISD::ANY_EXTEND:
50807 break;
50808 }
50809 }
50810
50811 if (!ExpectingFlags) {
50812 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50813 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50814
50815 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50816 X86::CondCode tmp = cc0;
50817 cc0 = cc1;
50818 cc1 = tmp;
50819 }
50820
50821 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50822 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50823 // FIXME: need symbolic constants for these magic numbers.
50824 // See X86ATTInstPrinter.cpp:printSSECC().
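          // (In the SSE CMPSS/CMPSD predicate encoding, 0 selects CMPEQ and
          // 4 selects CMPNEQ, matching COND_E and COND_NE here.)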
50825 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50826 if (Subtarget.hasAVX512()) {
50827 SDValue FSetCC =
50828 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50829 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50830 // Need to fill with zeros to ensure the bitcast will produce zeroes
50831 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50832 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50833 DAG.getConstant(0, DL, MVT::v16i1),
50834 FSetCC, DAG.getVectorIdxConstant(0, DL));
50835 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50836 N->getSimpleValueType(0));
50837 }
50838 SDValue OnesOrZeroesF =
50839 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50840 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50841
50842 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50843 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50844
50845 if (is64BitFP && !Subtarget.is64Bit()) {
50846 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50847 // 64-bit integer, since that's not a legal type. Since
50848 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50849 // bits, but can do this little dance to extract the lowest 32 bits
50850 // and work with those going forward.
50851 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50852 MVT::v2f64, OnesOrZeroesF);
50853 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50854 OnesOrZeroesF =
50855 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50856 DAG.getVectorIdxConstant(0, DL));
50857 IntVT = MVT::i32;
50858 }
50859
50860 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50861 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50862 DAG.getConstant(1, DL, IntVT));
50863 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50864 ANDed);
50865 return OneBitOfTruth;
50866 }
50867 }
50868 }
50869 }
50870 return SDValue();
50871}
50872
50873/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50874 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50875 SelectionDAG &DAG) {
50876 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50877
50878 MVT VT = N->getSimpleValueType(0);
50879 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50880 return SDValue();
50881
50882 SDValue X, Y;
50883 SDValue N0 = N->getOperand(0);
50884 SDValue N1 = N->getOperand(1);
50885
50886 if (SDValue Not = IsNOT(N0, DAG)) {
50887 X = Not;
50888 Y = N1;
50889 } else if (SDValue Not = IsNOT(N1, DAG)) {
50890 X = Not;
50891 Y = N0;
50892 } else
50893 return SDValue();
50894
50895 X = DAG.getBitcast(VT, X);
50896 Y = DAG.getBitcast(VT, Y);
50897 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50898}
50899
50900/// Try to fold:
50901/// and (vector_shuffle<Z,...,Z>
50902/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50903/// ->
50904/// andnp (vector_shuffle<Z,...,Z>
50905/// (insert_vector_elt undef, X, Z), undef), Y
50906 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50907 const X86Subtarget &Subtarget) {
50908 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50909
50910 EVT VT = N->getValueType(0);
50911 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50912 // value and require extra moves.
50913 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50914 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50915 return SDValue();
50916
50917 auto GetNot = [&DAG](SDValue V) {
50918     auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50919 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50920 // end-users are ISD::AND including cases
50921 // (and(extract_vector_element(SVN), Y)).
50922 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50923 !SVN->getOperand(1).isUndef()) {
50924 return SDValue();
50925 }
50926 SDValue IVEN = SVN->getOperand(0);
50927 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50928 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50929 return SDValue();
50930 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50931 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50932 return SDValue();
50933 SDValue Src = IVEN.getOperand(1);
50934 if (SDValue Not = IsNOT(Src, DAG)) {
50935 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50936 SDValue NotIVEN =
50937           DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50938 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50939 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50940 SVN->getOperand(1), SVN->getMask());
50941 }
50942 return SDValue();
50943 };
50944
50945 SDValue X, Y;
50946 SDValue N0 = N->getOperand(0);
50947 SDValue N1 = N->getOperand(1);
50948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50949
50950 if (SDValue Not = GetNot(N0)) {
50951 X = Not;
50952 Y = N1;
50953 } else if (SDValue Not = GetNot(N1)) {
50954 X = Not;
50955 Y = N0;
50956 } else
50957 return SDValue();
50958
50959 X = DAG.getBitcast(VT, X);
50960 Y = DAG.getBitcast(VT, Y);
50961 SDLoc DL(N);
50962
50963 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50964 // AVX2.
50965 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50966       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50967 SDValue LoX, HiX;
50968 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50969 SDValue LoY, HiY;
50970 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50971 EVT SplitVT = LoX.getValueType();
50972 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50973 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50974 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50975 }
50976
50977 if (TLI.isTypeLegal(VT))
50978 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50979
50980 return SDValue();
50981}
50982
50983// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50984// logical operations, like in the example below.
50985// or (and (truncate x, truncate y)),
50986// (xor (truncate z, build_vector (constants)))
50987// Given a target type \p VT, we generate
50988// or (and x, y), (xor z, zext(build_vector (constants)))
50989 // given x, y and z of type \p VT. We can do so if each operand is either a
50990 // truncate from VT, a vector of constants (second operand only), a value that
50991 // can be recursively promoted, or an existing extension we can extend further.
50992 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50993 SelectionDAG &DAG,
50994 const X86Subtarget &Subtarget,
50995 unsigned Depth) {
50996 // Limit recursion to avoid excessive compile times.
50997   if (Depth >= SelectionDAG::MaxRecursionDepth)
50998 return SDValue();
50999
51000 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51001 return SDValue();
51002
51003 SDValue N0 = N.getOperand(0);
51004 SDValue N1 = N.getOperand(1);
51005
51006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51007 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51008 return SDValue();
51009
51010 if (SDValue NN0 =
51011 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51012 N0 = NN0;
51013 else {
51014 // The left side has to be a 'trunc'.
51015 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51016 N0.getOperand(0).getValueType() == VT;
51017 if (LHSTrunc)
51018 N0 = N0.getOperand(0);
51019 else
51020 return SDValue();
51021 }
51022
51023 if (SDValue NN1 =
51024 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51025 N1 = NN1;
51026 else {
51027 // The right side has to be a 'trunc', a (foldable) constant or an
51028 // existing extension we can extend further.
51029 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51030 N1.getOperand(0).getValueType() == VT;
51031 if (RHSTrunc)
51032 N1 = N1.getOperand(0);
51033 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51034 Subtarget.hasInt256() && N1.hasOneUse())
51035 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51036 else if (SDValue Cst =
51037                  DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51038 N1 = Cst;
51039 else
51040 return SDValue();
51041 }
51042
51043 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51044}
51045
51046// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51047// register. In most cases we actually compare or select YMM-sized registers
51048// and mixing the two types creates horrible code. This method optimizes
51049// some of the transition sequences.
51050// Even with AVX-512 this is still useful for removing casts around logical
51051// operations on vXi1 mask types.
51052 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51053 SelectionDAG &DAG,
51054 const X86Subtarget &Subtarget) {
51055 EVT VT = N.getValueType();
51056 assert(VT.isVector() && "Expected vector type");
51057 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51058 N.getOpcode() == ISD::ZERO_EXTEND ||
51059 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51060
51061 SDValue Narrow = N.getOperand(0);
51062 EVT NarrowVT = Narrow.getValueType();
51063
51064 // Generate the wide operation.
51065 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51066 if (!Op)
51067 return SDValue();
51068 switch (N.getOpcode()) {
51069 default: llvm_unreachable("Unexpected opcode");
51070 case ISD::ANY_EXTEND:
51071 return Op;
51072 case ISD::ZERO_EXTEND:
51073 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51074 case ISD::SIGN_EXTEND:
51075 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51076 Op, DAG.getValueType(NarrowVT));
51077 }
51078}
51079
51080static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51081 unsigned FPOpcode;
51082 switch (Opcode) {
51083 // clang-format off
51084 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51085 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51086 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51087 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51088 // clang-format on
51089 }
51090 return FPOpcode;
51091}
51092
51093/// If both input operands of a logic op are being cast from floating-point
51094/// types or FP compares, try to convert this into a floating-point logic node
51095/// to avoid unnecessary moves from SSE to integer registers.
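/// For example, with f32 inputs A and B: (xor (bitcast A), (bitcast B))
/// becomes (bitcast (FXOR A, B)), keeping both values in XMM registers.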
51096static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51097 SDValue N0, SDValue N1,
51098 SelectionDAG &DAG,
51099                                          TargetLowering::DAGCombinerInfo &DCI,
51100 const X86Subtarget &Subtarget) {
51101 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51102 "Unexpected bit opcode");
51103
51104 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51105 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51106 return SDValue();
51107
51108 SDValue N00 = N0.getOperand(0);
51109 SDValue N10 = N1.getOperand(0);
51110 EVT N00Type = N00.getValueType();
51111 EVT N10Type = N10.getValueType();
51112
51113 // Ensure that both types are the same and are legal scalar fp types.
51114 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51115 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51116 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51117 return SDValue();
51118
51119 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51120 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51121 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51122 return DAG.getBitcast(VT, FPLogic);
51123 }
51124
51125 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51126 !N1.hasOneUse())
51127 return SDValue();
51128
51129 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51130 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51131
51132 // The vector ISA for FP predicates is incomplete before AVX, so converting
51133 // COMIS* to CMPS* may not be a win before AVX.
51134 if (!Subtarget.hasAVX() &&
51135 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51136 return SDValue();
51137
51138 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51139 // and vector logic:
51140 // logic (setcc N00, N01), (setcc N10, N11) -->
51141 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51142 unsigned NumElts = 128 / N00Type.getSizeInBits();
51143 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51144 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51145 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51146 SDValue N01 = N0.getOperand(1);
51147 SDValue N11 = N1.getOperand(1);
51148 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51149 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51150 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51151 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51152 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51153 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51154 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51155 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51156}
51157
51158// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51159// to reduce XMM->GPR traffic.
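// e.g. (and (movmsk (pcmpeqb A, B)), (movmsk (pcmpeqb C, D)))
//        --> (movmsk (and (pcmpeqb A, B), (pcmpeqb C, D)))
// so only a single XMM->GPR transfer remains.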
51160static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51161 SDValue N1, SelectionDAG &DAG) {
51162 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51163 "Unexpected bit opcode");
51164
51165 // Both operands must be single use MOVMSK.
51166 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51167 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51168 return SDValue();
51169
51170 SDValue Vec0 = N0.getOperand(0);
51171 SDValue Vec1 = N1.getOperand(0);
51172 EVT VecVT0 = Vec0.getValueType();
51173 EVT VecVT1 = Vec1.getValueType();
51174
51175 // Both MOVMSK operands must be from vectors of the same size and same element
51176 // size, but it's OK for an fp/int difference.
51177 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51178 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51179 return SDValue();
51180
51181 unsigned VecOpc =
51182       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51183 SDValue Result =
51184 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51185 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51186}
51187
51188// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51189// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51190// handles in InstCombine.
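// e.g. (or (vsrli X, 4), (vsrli Y, 4)) --> (vsrli (or X, Y), 4), saving one
// shift when both operands use the same immediate shift amount.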
51191static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51192 SDValue N0, SDValue N1,
51193 SelectionDAG &DAG) {
51194 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51195 "Unexpected bit opcode");
51196
51197 // Both operands must be single use.
51198 if (!N0.hasOneUse() || !N1.hasOneUse())
51199 return SDValue();
51200
51201 // Search for matching shifts.
51202   SDValue BC0 = peekThroughOneUseBitcasts(N0);
51203   SDValue BC1 = peekThroughOneUseBitcasts(N1);
51204
51205 unsigned BCOpc = BC0.getOpcode();
51206 EVT BCVT = BC0.getValueType();
51207 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51208 return SDValue();
51209
51210 switch (BCOpc) {
51211 case X86ISD::VSHLI:
51212 case X86ISD::VSRLI:
51213 case X86ISD::VSRAI: {
51214 if (BC0.getOperand(1) != BC1.getOperand(1))
51215 return SDValue();
51216 SDValue BitOp =
51217 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51218 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51219 return DAG.getBitcast(VT, Shift);
51220 }
51221 }
51222
51223 return SDValue();
51224}
51225
51226// Attempt to fold:
51227// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51228 // TODO: Add PACKUS handling.
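// e.g. (or (packss A, B), (packss C, D)) --> (packss (or A, C), (or B, D)),
// which is safe here because all inputs are known to be all-sign-bits, so
// the PACKSS saturation cannot change any result element.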
51229static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51230 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51231 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51232 "Unexpected bit opcode");
51233
51234 // Both operands must be single use.
51235 if (!N0.hasOneUse() || !N1.hasOneUse())
51236 return SDValue();
51237
51238 // Search for matching packs.
51239   N0 = peekThroughOneUseBitcasts(N0);
51240   N1 = peekThroughOneUseBitcasts(N1);
51241
51242 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51243 return SDValue();
51244
51245 MVT DstVT = N0.getSimpleValueType();
51246 if (DstVT != N1.getSimpleValueType())
51247 return SDValue();
51248
51249 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51250 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51251
51252 // Limit to allsignbits packing.
51253 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51254 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51255 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51256 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51257 return SDValue();
51258
51259 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51260 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51261 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51262}
51263
51264/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51265 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51266/// with a shift-right to eliminate loading the vector constant mask value.
51267 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51268 SelectionDAG &DAG,
51269 const X86Subtarget &Subtarget) {
51270 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51271 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51272 EVT VT = Op0.getValueType();
51273 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51274 return SDValue();
51275
51276 // Try to convert an "is positive" signbit masking operation into arithmetic
51277 // shift and "andn". This saves a materialization of a -1 vector constant.
51278 // The "is negative" variant should be handled more generally because it only
51279 // requires "and" rather than "andn":
51280 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51281 //
51282 // This is limited to the original type to avoid producing even more bitcasts.
51283 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51284 // will be profitable.
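  // e.g. for v4i32:
  //   and (pcmpgt X, -1), Y --> andnp (psrad X, 31), Y
  // since (X > -1) is all-ones exactly when the sign bit of X is clear and
  // (psrad X, 31) is all-ones exactly when it is set.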
51285 if (N->getValueType(0) == VT &&
51286 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51287 SDValue X, Y;
51288 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51289 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51290 X = Op1.getOperand(0);
51291 Y = Op0;
51292 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51293 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51294 X = Op0.getOperand(0);
51295 Y = Op1;
51296 }
51297 if (X && Y) {
51298 SDValue Sra =
51299           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51300 VT.getScalarSizeInBits() - 1, DAG);
51301 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51302 }
51303 }
51304
51305 APInt SplatVal;
51306 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51307 return SDValue();
51308
51309 // Don't prevent creation of ANDN.
51310 if (isBitwiseNot(Op0))
51311 return SDValue();
51312
51313 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51314 return SDValue();
51315
51316 unsigned EltBitWidth = VT.getScalarSizeInBits();
51317 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51318 return SDValue();
51319
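  // Op0 is known to be all-ones or all-zeros per element, so masking it with
  // ((1 << ShiftVal) - 1) equals a logical right shift by
  // (EltBitWidth - ShiftVal), e.g. for v4i32 and SplatVal == 0x1:
  //   and X, 0x1 --> psrld X, 31.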
51320 unsigned ShiftVal = SplatVal.countr_one();
51321 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51322 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51323 return DAG.getBitcast(N->getValueType(0), Shift);
51324}
51325
51326// Get the index node from the lowered DAG of a GEP IR instruction with one
51327// indexing dimension.
51328 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51329 if (Ld->isIndexed())
51330 return SDValue();
51331
51332 SDValue Base = Ld->getBasePtr();
51333 if (Base.getOpcode() != ISD::ADD)
51334 return SDValue();
51335
51336 SDValue ShiftedIndex = Base.getOperand(0);
51337 if (ShiftedIndex.getOpcode() != ISD::SHL)
51338 return SDValue();
51339
51340 return ShiftedIndex.getOperand(0);
51341}
51342
51343static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51344 return Subtarget.hasBMI2() &&
51345 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51346}
51347
51348/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51349/// This undoes the inverse fold performed in InstCombine
51350 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51351 SelectionDAG &DAG) {
51352 using namespace llvm::SDPatternMatch;
51353 MVT VT = N->getSimpleValueType(0);
51354 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51355 return SDValue();
51356
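  // By De Morgan, (X & (Y | ~Z)) == (X & ~(~Y & Z)); the rewritten form
  // exposes two and-not operations for targets that have them.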
51357 SDValue X, Y, Z;
51358 if (sd_match(N, m_And(m_Value(X),
51359 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51360 // Don't fold if Y or Z are constants to prevent infinite loops.
51361     if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51362         !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51363 return DAG.getNode(
51364 ISD::AND, DL, VT, X,
51365 DAG.getNOT(
51366 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51367 }
51368
51369 return SDValue();
51370}
51371
51372 // This function recognizes cases where the X86 bzhi instruction can replace an
51373 // 'and-load' sequence.
51374 // When an integer value is loaded from an array of constants defined as
51375 // follows:
51376 //
51377 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51378 //
51379 // and a bitwise and is then applied to the loaded value and another input,
51380 // the combination is equivalent to performing bzhi (zero high bits) on that
51381 // input, using the same index as the load.
51382 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51383 const X86Subtarget &Subtarget) {
51384 MVT VT = Node->getSimpleValueType(0);
51385 SDLoc dl(Node);
51386
51387 // Check if subtarget has BZHI instruction for the node's type
51388 if (!hasBZHI(Subtarget, VT))
51389 return SDValue();
51390
51391 // Try matching the pattern for both operands.
51392 for (unsigned i = 0; i < 2; i++) {
51393 // continue if the operand is not a load instruction
51394 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51395 if (!Ld)
51396 continue;
51397 const Value *MemOp = Ld->getMemOperand()->getValue();
51398 if (!MemOp)
51399 continue;
51400 // Get the Node which indexes into the array.
51401     SDValue Index = getIndexFromUnindexedLoad(Ld);
51402 if (!Index)
51403 continue;
51404
51405 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51406 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51407 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51408 Constant *Init = GV->getInitializer();
51409 Type *Ty = Init->getType();
51410           if (!isa<ConstantDataArray>(Init) ||
51411 !Ty->getArrayElementType()->isIntegerTy() ||
51412 Ty->getArrayElementType()->getScalarSizeInBits() !=
51413 VT.getSizeInBits() ||
51414 Ty->getArrayNumElements() >
51415 Ty->getArrayElementType()->getScalarSizeInBits())
51416 continue;
51417
51418 // Check if the array's constant elements are suitable to our case.
51419 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51420 bool ConstantsMatch = true;
51421 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51422 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51423 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51424 ConstantsMatch = false;
51425 break;
51426 }
51427 }
51428 if (!ConstantsMatch)
51429 continue;
51430
51431 // Do the transformation (For 32-bit type):
51432 // -> (and (load arr[idx]), inp)
51433 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51434 // that will be replaced with one bzhi instruction.
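          // e.g. for idx == 3: arr[3] == 0x7 == ((1 << 3) - 1), so
          //   (and (load arr[3]), inp) == bzhi(inp, 3)
          //                            == (and (srl 0xFFFFFFFF, (32 - 3)), inp).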
51435 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51436 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51437
51438 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51439 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51440 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51441
51442 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51443 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51444 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51445 }
51446 }
51447 }
51448 }
51449 return SDValue();
51450}
51451
51452 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
51453 // where C is a mask containing the same number of bits as the setcc and
51454 // where the setcc will freely zero the upper bits of the k-register. We can
51455 // replace the undef in the concat with 0s and remove the AND. This mainly
51456 // helps with v2i1/v4i1 setcc being cast to scalar.
51457 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51458 const X86Subtarget &Subtarget) {
51459 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51460
51461 EVT VT = N->getValueType(0);
51462
51463 // Make sure this is an AND with constant. We will check the value of the
51464 // constant later.
51465 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51466 if (!C1)
51467 return SDValue();
51468
51469 // This is implied by the ConstantSDNode.
51470 assert(!VT.isVector() && "Expected scalar VT!");
51471
51472 SDValue Src = N->getOperand(0);
51473 if (!Src.hasOneUse())
51474 return SDValue();
51475
51476 // (Optionally) peek through any_extend().
51477 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51478 if (!Src.getOperand(0).hasOneUse())
51479 return SDValue();
51480 Src = Src.getOperand(0);
51481 }
51482
51483 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51484 return SDValue();
51485
51486 Src = Src.getOperand(0);
51487 EVT SrcVT = Src.getValueType();
51488
51489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51490 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51491 !TLI.isTypeLegal(SrcVT))
51492 return SDValue();
51493
51494 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51495 return SDValue();
51496
51497 // We only care about the first subvector of the concat, we expect the
51498 // other subvectors to be ignored due to the AND if we make the change.
51499 SDValue SubVec = Src.getOperand(0);
51500 EVT SubVecVT = SubVec.getValueType();
51501
51502 // The RHS of the AND should be a mask with as many bits as SubVec.
51503 if (!TLI.isTypeLegal(SubVecVT) ||
51504 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51505 return SDValue();
51506
51507 // First subvector should be a setcc with a legal result type or a
51508 // AND containing at least one setcc with a legal result type.
51509 auto IsLegalSetCC = [&](SDValue V) {
51510 if (V.getOpcode() != ISD::SETCC)
51511 return false;
51512 EVT SetccVT = V.getOperand(0).getValueType();
51513 if (!TLI.isTypeLegal(SetccVT) ||
51514 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51515 return false;
51516 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51517 return false;
51518 return true;
51519 };
51520 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51521 (IsLegalSetCC(SubVec.getOperand(0)) ||
51522 IsLegalSetCC(SubVec.getOperand(1))))))
51523 return SDValue();
51524
51525 // We passed all the checks. Rebuild the concat_vectors with zeroes
51526 // and cast it back to VT.
51527 SDLoc dl(N);
51528 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51529 DAG.getConstant(0, dl, SubVecVT));
51530 Ops[0] = SubVec;
51531 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51532 Ops);
51533 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51534 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51535}
51536
51537 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51538 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51539 // We don't want to go crazy with the recursion here. This isn't a super
51540 // important optimization.
51541 static constexpr unsigned kMaxDepth = 2;
51542
51543 // Only do this re-ordering if op has one use.
51544 if (!Op.hasOneUse())
51545 return SDValue();
51546
51547 SDLoc DL(Op);
51548   // If we hit another associative op, recurse further.
51549 if (Op.getOpcode() == Opc) {
51550 // Done recursing.
51551 if (Depth++ >= kMaxDepth)
51552 return SDValue();
51553
51554 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51555 if (SDValue R =
51556 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51557 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51558 Op.getOperand(1 - OpIdx));
51559
51560 } else if (Op.getOpcode() == ISD::SUB) {
51561 if (Opc == ISD::AND) {
51562 // BLSI: (and x, (sub 0, x))
51563 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51564 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51565 }
51566 // Opc must be ISD::AND or ISD::XOR
51567 // BLSR: (and x, (sub x, 1))
51568 // BLSMSK: (xor x, (sub x, 1))
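    // e.g. with x == 0b10110:
    //   BLSI:   x & -x      == 0b00010 (isolate lowest set bit)
    //   BLSR:   x & (x - 1) == 0b10100 (clear lowest set bit)
    //   BLSMSK: x ^ (x - 1) == 0b00011 (mask up to lowest set bit)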
51569 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51570 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51571
51572 } else if (Op.getOpcode() == ISD::ADD) {
51573 // Opc must be ISD::AND or ISD::XOR
51574 // BLSR: (and x, (add x, -1))
51575 // BLSMSK: (xor x, (add x, -1))
51576 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51577 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51578 }
51579 return SDValue();
51580}
51581
51582 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51583 const X86Subtarget &Subtarget) {
51584 EVT VT = N->getValueType(0);
51585 // Make sure this node is a candidate for BMI instructions.
51586 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51587 (VT != MVT::i32 && VT != MVT::i64))
51588 return SDValue();
51589
51590 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51591
51592 // Try and match LHS and RHS.
51593 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51594 if (SDValue OpMatch =
51595 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51596 N->getOperand(1 - OpIdx), 0))
51597 return OpMatch;
51598 return SDValue();
51599}
51600
51601/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51602 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51603 SelectionDAG &DAG,
51604 const X86Subtarget &Subtarget) {
51605 using namespace llvm::SDPatternMatch;
51606
51607 EVT VT = And->getValueType(0);
51608 // Make sure this node is a candidate for BMI instructions.
51609 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51610 return SDValue();
51611
51612 SDValue X;
51613 SDValue Y;
51614   if (!sd_match(And,
51615                 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51616 m_Value(Y))))
51617 return SDValue();
51618
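  // XOR(X, NEG(X)) is the bitwise complement of BLSMSK(X), since
  // x ^ -x == x ^ ~(x - 1) == ~(x ^ (x - 1)); e.g. x == 0b10110 gives
  // x ^ (x - 1) == 0b00011 and x ^ -x == ~0b00011. Hence the AND below is
  // rewritten as ANDN(Y, BLSMSK(X)).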
51619 SDValue BLSMSK =
51620 DAG.getNode(ISD::XOR, DL, VT, X,
51621 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51622 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51623 return AndN;
51624}
51625
51626 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51627 SelectionDAG &DAG,
51628                                         TargetLowering::DAGCombinerInfo &DCI,
51629 const X86Subtarget &ST) {
51630 // cmp(setcc(cc, X), 0)
51631 // brcond ne
51632 // ->
51633 // X
51634 // brcond cc
51635
51636 // sub(setcc(cc, X), 1)
51637 // brcond ne
51638 // ->
51639 // X
51640 // brcond ~cc
51641 //
51642 // if only flag has users
51643
51644 SDValue SetCC = N->getOperand(0);
51645
51646 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51647 return SDValue();
51648
51649 // Check the only user of flag is `brcond ne`.
51650 SDNode *BrCond = *Flag->user_begin();
51651 if (BrCond->getOpcode() != X86ISD::BRCOND)
51652 return SDValue();
51653 unsigned CondNo = 2;
51654 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51655       X86::COND_NE)
51656 return SDValue();
51657
51658 SDValue X = SetCC.getOperand(1);
51659   // sub has two results while X only has one. DAG combine assumes the value
51660 // type matches.
51661 if (N->getOpcode() == X86ISD::SUB)
51662 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51663
51664 SDValue CCN = SetCC.getOperand(0);
51665 X86::CondCode CC =
51666 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51667   X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51668 // Update CC for the consumer of the flag.
51669 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51670 // checking if the second condition evaluates to true. When comparing the
51671   // result with 1, we are checking if the second condition evaluates to false.
51672   SmallVector<SDValue> Ops(BrCond->op_values());
51673 if (isNullConstant(N->getOperand(1)))
51674 Ops[CondNo] = CCN;
51675 else if (isOneConstant(N->getOperand(1)))
51676 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51677 else
51678 llvm_unreachable("expect constant 0 or 1");
51679
51680 SDValue NewBrCond =
51681 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51682 // Avoid self-assign error b/c CC1 can be `e/ne`.
51683 if (BrCond != NewBrCond.getNode())
51684 DCI.CombineTo(BrCond, NewBrCond);
51685 return X;
51686}
51687
51688 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51689                                         TargetLowering::DAGCombinerInfo &DCI,
51690 const X86Subtarget &ST) {
51691 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51692 // ->
51693 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51694
51695 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51696 // ->
51697 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51698 //
51699 // where cflags is determined by cc1.
51700
51701 if (!ST.hasCCMP())
51702 return SDValue();
51703
51704 SDValue SetCC0 = N->getOperand(0);
51705 SDValue SetCC1 = N->getOperand(1);
51706 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51707 SetCC1.getOpcode() != X86ISD::SETCC)
51708 return SDValue();
51709
51710 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51711 SDValue Op = V.getOperand(1);
51712 unsigned Opc = Op.getOpcode();
51713 if (Opc == X86ISD::SUB)
51714 return X86ISD::CCMP;
51715 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51716 return X86ISD::CTEST;
51717 return 0U;
51718 };
51719
51720 unsigned NewOpc = 0;
51721
51722 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51723 // appear on the right.
51724 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51725 std::swap(SetCC0, SetCC1);
51726 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51727 return SDValue();
51728 }
51729
51730 X86::CondCode CC0 =
51731 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51732 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51733 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51734 return SDValue();
51735
51736 bool IsOR = N->getOpcode() == ISD::OR;
51737
51738 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51739   // evaluates to true. So we need to invert CC0 as SrcCC when the logic
51740 // operator is OR. Similar for CC1.
51741 SDValue SrcCC =
51742       IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51743 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51744 : SetCC0.getOperand(0);
51745 SDValue CC1N = SetCC1.getOperand(0);
51746 X86::CondCode CC1 =
51747 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51748   X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51749 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51750 SDLoc DL(N);
51751 SDValue CFlags = DAG.getTargetConstant(
51752 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51753 SDValue Sub = SetCC1.getOperand(1);
51754
51755 // Replace any uses of the old flag produced by SUB/CMP with the new one
51756 // produced by CCMP/CTEST.
51757 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51758 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51759 {Sub.getOperand(0), Sub.getOperand(1),
51760 CFlags, SrcCC, SetCC0.getOperand(1)})
51761 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51762 {Sub.getOperand(0), Sub.getOperand(0),
51763 CFlags, SrcCC, SetCC0.getOperand(1)});
51764
51765 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51766}
51767
51768 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51769                           TargetLowering::DAGCombinerInfo &DCI,
51770 const X86Subtarget &Subtarget) {
51771 using namespace SDPatternMatch;
51772
51773 SDValue N0 = N->getOperand(0);
51774 SDValue N1 = N->getOperand(1);
51775 EVT VT = N->getValueType(0);
51776 SDLoc dl(N);
51777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51778
51779 // If this is SSE1 only convert to FAND to avoid scalarization.
51780 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51781 return DAG.getBitcast(MVT::v4i32,
51782 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51783 DAG.getBitcast(MVT::v4f32, N0),
51784 DAG.getBitcast(MVT::v4f32, N1)));
51785 }
51786
51787 // Use a 32-bit and+zext if upper bits known zero.
51788 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51789 APInt HiMask = APInt::getHighBitsSet(64, 32);
51790 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51791 DAG.MaskedValueIsZero(N0, HiMask)) {
51792 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51793 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51794 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51795 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51796 }
51797 }
51798
51799 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51800 // TODO: Support multiple SrcOps.
51801 if (VT == MVT::i1) {
51802     SmallVector<SDValue, 2> SrcOps;
51803 SmallVector<APInt, 2> SrcPartials;
51804 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51805 SrcOps.size() == 1) {
51806 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51807 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51808 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51809 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51810 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51811 if (Mask) {
51812 assert(SrcPartials[0].getBitWidth() == NumElts &&
51813 "Unexpected partial reduction mask");
51814 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51815 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51816 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51817 }
51818 }
51819 }
51820
51821 // InstCombine converts:
51822 // `(-x << C0) & C1`
51823 // to
51824 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51825 // This saves an IR instruction but on x86 the neg/shift version is preferable
51826 // so undo the transform.
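  // e.g. for i8 with C0 == 2 and C1 == 0xFC: InstCombine produces
  //   (x * (Pow2_Ceil(0xFC) - (1 << 2))) & 0xFC == (x * 0xFC) & 0xFC,
  // and this combine turns it back into ((-x) << 2) & 0xFC.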
51827
51828 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51829 // TODO: We don't actually need a splat for this, we just need the checks to
51830 // hold for each element.
51831 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51832 /*AllowTruncation*/ false);
51833 ConstantSDNode *N01C =
51834 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51835 /*AllowTruncation*/ false);
51836 if (N1C && N01C) {
51837 const APInt &MulC = N01C->getAPIntValue();
51838 const APInt &AndC = N1C->getAPIntValue();
51839 APInt MulCLowBit = MulC & (-MulC);
51840 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51841 (MulCLowBit + MulC).isPowerOf2()) {
51842 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51843 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51844 assert(MulCLowBitLog != -1 &&
51845 "Isolated lowbit is somehow not a power of 2!");
51846 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51847 DAG.getConstant(MulCLowBitLog, dl, VT));
51848 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51849 }
51850 }
51851 }
51852
51853 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51854 return SetCC;
51855
51856 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51857 return V;
51858
51859 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51860 return R;
51861
51862 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51863 return R;
51864
51865 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51866 return R;
51867
51868 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51869 DAG, DCI, Subtarget))
51870 return FPLogic;
51871
51872 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51873 return R;
51874
51875 if (DCI.isBeforeLegalizeOps())
51876 return SDValue();
51877
51878 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51879 return R;
51880
51881   if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51882 return R;
51883
51884 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51885 return ShiftRight;
51886
51887 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51888 return R;
51889
51890 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51891 return R;
51892
51893 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51894 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51895 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51896 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51897 unsigned Opc0 = N0.getOpcode();
51898 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51899         getTargetConstantFromNode(N0.getOperand(1)) &&
51900 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51901 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51902 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51903 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51904 }
51905 }
51906
51907 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51908 // to make use of predicated selects.
51909 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51910 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51911 SDValue X, Y;
51912 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51913 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51914 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51915 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51916         sd_match(N, m_And(m_Value(X),
51917                           m_OneUse(m_SExt(m_AllOf(
51918 m_Value(Y), m_SpecificVT(CondVT),
51919 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51920 return DAG.getSelect(dl, VT, Y, X,
51921 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51922 }
51923 }
51924
51925 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51926 // avoids slow variable shift (moving shift amount to ECX etc.)
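  // e.g. (and (srl X, Y), 1) tests bit Y of X, so it can lower to
  // BT + SETB (or SETAE when a NOT is peeked through below) instead of
  // moving the variable shift amount into CL.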
51927 if (isOneConstant(N1) && N0->hasOneUse()) {
51928 SDValue Src = N0;
51929 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51930 Src.getOpcode() == ISD::TRUNCATE) &&
51931 Src.getOperand(0)->hasOneUse())
51932 Src = Src.getOperand(0);
51933 bool ContainsNOT = false;
51934 X86::CondCode X86CC = X86::COND_B;
51935 // Peek through AND(NOT(SRL(X,Y)),1).
51936 if (isBitwiseNot(Src)) {
51937 Src = Src.getOperand(0);
51938 X86CC = X86::COND_AE;
51939 ContainsNOT = true;
51940 }
51941 if (Src.getOpcode() == ISD::SRL &&
51942 !isa<ConstantSDNode>(Src.getOperand(1))) {
51943 SDValue BitNo = Src.getOperand(1);
51944 Src = Src.getOperand(0);
51945 // Peek through AND(SRL(NOT(X),Y),1).
51946 if (isBitwiseNot(Src)) {
51947 Src = Src.getOperand(0);
51948 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51949 ContainsNOT = true;
51950 }
51951 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51952 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51953 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51954 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51955 }
51956 }
51957
51958 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51959 // Attempt to recursively combine a bitmask AND with shuffles.
51960 SDValue Op(N, 0);
51961 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51962 return Res;
51963
51964 // If either operand is a constant mask, then only the elements that aren't
51965 // zero are actually demanded by the other operand.
51966 auto GetDemandedMasks = [&](SDValue Op) {
51967 APInt UndefElts;
51968 SmallVector<APInt> EltBits;
51969 int NumElts = VT.getVectorNumElements();
51970 int EltSizeInBits = VT.getScalarSizeInBits();
51971 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51972 APInt DemandedElts = APInt::getAllOnes(NumElts);
51973 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51974 EltBits)) {
51975 DemandedBits.clearAllBits();
51976 DemandedElts.clearAllBits();
51977 for (int I = 0; I != NumElts; ++I) {
51978 if (UndefElts[I]) {
51979 // We can't assume an undef src element gives an undef dst - the
51980 // other src might be zero.
51981 DemandedBits.setAllBits();
51982 DemandedElts.setBit(I);
51983 } else if (!EltBits[I].isZero()) {
51984 DemandedBits |= EltBits[I];
51985 DemandedElts.setBit(I);
51986 }
51987 }
51988 }
51989 return std::make_pair(DemandedBits, DemandedElts);
51990 };
51991 APInt Bits0, Elts0;
51992 APInt Bits1, Elts1;
51993 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51994 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51995
51996 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51997 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51998 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51999 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52000 if (N->getOpcode() != ISD::DELETED_NODE)
52001 DCI.AddToWorklist(N);
52002 return SDValue(N, 0);
52003 }
52004
52005 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52006 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52007 if (NewN0 || NewN1)
52008 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52009 NewN1 ? NewN1 : N1);
52010 }
52011
52012 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52013 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52014       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52015 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52016 SDValue BitMask = N1;
52017 SDValue SrcVec = N0.getOperand(0);
52018 EVT SrcVecVT = SrcVec.getValueType();
52019
52020 // Check that the constant bitmask masks whole bytes.
52021 APInt UndefElts;
52022 SmallVector<APInt, 64> EltBits;
52023 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52024 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52025 llvm::all_of(EltBits, [](const APInt &M) {
52026 return M.isZero() || M.isAllOnes();
52027 })) {
52028 unsigned NumElts = SrcVecVT.getVectorNumElements();
52029 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52030 unsigned Idx = N0.getConstantOperandVal(1);
52031
52032 // Create a root shuffle mask from the byte mask and the extracted index.
52033 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52034 for (unsigned i = 0; i != Scale; ++i) {
52035 if (UndefElts[i])
52036 continue;
52037 int VecIdx = Scale * Idx + i;
52038 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52039 }
52040
52041     if (SDValue Shuffle = combineX86ShufflesRecursively(
52042 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52043 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52044 /*AllowVariableCrossLaneMask=*/true,
52045 /*AllowVariablePerLaneMask=*/true,
52046 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52047 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52048 N0.getOperand(1));
52049 }
52050 }
52051
52052 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52053 return R;
52054
52055 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52056 return R;
52057
52058 return SDValue();
52059}
52060
52061// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52062 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52063 SelectionDAG &DAG,
52064 const X86Subtarget &Subtarget) {
52065 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52066
52067 MVT VT = N->getSimpleValueType(0);
52068 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52069 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52070 return SDValue();
52071
52072 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52073 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52074 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52075 return SDValue();
52076
52077 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52078 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52079 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52080 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52081 return SDValue();
52082
52083 // Attempt to extract constant byte masks.
52084 APInt UndefElts0, UndefElts1;
52085 SmallVector<APInt, 32> EltBits0, EltBits1;
52086 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52087 /*AllowWholeUndefs*/ false,
52088 /*AllowPartialUndefs*/ false))
52089 return SDValue();
52090 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52091 /*AllowWholeUndefs*/ false,
52092 /*AllowPartialUndefs*/ false))
52093 return SDValue();
52094
52095 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52096 // TODO - add UNDEF elts support.
52097 if (UndefElts0[i] || UndefElts1[i])
52098 return SDValue();
52099 if (EltBits0[i] != ~EltBits1[i])
52100 return SDValue();
52101 }
52102
52103 if (useVPTERNLOG(Subtarget, VT)) {
52104 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52105 // VPTERNLOG is only available as vXi32/64-bit types.
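    // (Each immediate bit gives the result for one (A,B,C) input combination,
    //  indexed as (A<<2)|(B<<1)|C; evaluating A?B:C over all eight
    //  combinations yields 0b11001010 == 0xCA.)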
52106 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52107 MVT OpVT =
52108 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52109 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52110 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52111 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52112 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52113 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52114 DAG, Subtarget);
52115 return DAG.getBitcast(VT, Res);
52116 }
52117
52118 SDValue X = N->getOperand(0);
52119 SDValue Y =
52120 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52121 DAG.getBitcast(VT, N1.getOperand(0)));
52122 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52123}
52124
52125// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52126// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52127// Waiting for ANDNP combine allows other combines to happen that prevent
52128// matching.
52129static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52130 using namespace SDPatternMatch;
52131 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52132 m_And(m_Deferred(Mask), m_Value(Y))));
52133}
52134
52135// Try to fold:
52136// (or (and (m, y), (pandn m, x)))
52137// into:
52138// (vselect m, x, y)
52139// As a special case, try to fold:
52140// (or (and (m, (sub 0, x)), (pandn m, x)))
52141// into:
52142// (sub (xor X, M), M)
52143 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52144 SelectionDAG &DAG,
52145 const X86Subtarget &Subtarget) {
52146 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52147
52148 EVT VT = N->getValueType(0);
52149 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52150 (VT.is256BitVector() && Subtarget.hasInt256())))
52151 return SDValue();
52152
52153 SDValue X, Y, Mask;
52154 if (!matchLogicBlend(N, X, Y, Mask))
52155 return SDValue();
52156
52157 // Validate that X, Y, and Mask are bitcasts, and see through them.
52158 Mask = peekThroughBitcasts(Mask);
52159   X = peekThroughBitcasts(X);
52160   Y = peekThroughBitcasts(Y);
52161
52162 EVT MaskVT = Mask.getValueType();
52163 unsigned EltBits = MaskVT.getScalarSizeInBits();
52164
52165 // TODO: Attempt to handle floating point cases as well?
52166 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52167 return SDValue();
52168
52169 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52170 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52171 DAG, Subtarget))
52172 return Res;
52173
52174 // PBLENDVB is only available on SSE 4.1.
52175 if (!Subtarget.hasSSE41())
52176 return SDValue();
52177
52178 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52179 if (Subtarget.hasVLX())
52180 return SDValue();
52181
52182 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52183
52184 X = DAG.getBitcast(BlendVT, X);
52185 Y = DAG.getBitcast(BlendVT, Y);
52186 Mask = DAG.getBitcast(BlendVT, Mask);
52187 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52188 return DAG.getBitcast(VT, Mask);
52189}
52190
52191// Helper function for combineOrCmpEqZeroToCtlzSrl
52192// Transforms:
52193// seteq(cmp x, 0)
52194// into:
52195// srl(ctlz x), log2(bitsize(x))
52196// Input pattern is checked by caller.
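// For example, for a 32-bit x: ctlz(x) == 32 exactly when x == 0, so
// (ctlz(x) >> 5) is 1 iff x == 0 and 0 otherwise, matching seteq(cmp x, 0).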
52197 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52198 SDValue Cmp = Op.getOperand(1);
52199 EVT VT = Cmp.getOperand(0).getValueType();
52200 unsigned Log2b = Log2_32(VT.getSizeInBits());
52201 SDLoc dl(Op);
52202 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52203 // The result of the shift is true or false, and on X86, the 32-bit
52204 // encoding of shr and lzcnt is more desirable.
52205 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52206 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52207 DAG.getConstant(Log2b, dl, MVT::i8));
52208 return Scc;
52209}
52210
52211// Try to transform:
52212// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52213// into:
52214// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52215// Will also attempt to match more generic cases, eg:
52216// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52217// Only applies if the target supports the FastLZCNT feature.
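// For example, with i32 operands, (x == 0) | (y == 0) becomes
// (ctlz(x) | ctlz(y)) >> 5, since only a ctlz result of 32 (a zero input)
// has bit 5 set.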
52218 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52219 TargetLowering::DAGCombinerInfo &DCI,
52220 const X86Subtarget &Subtarget) {
52221 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52222 return SDValue();
52223
52224 auto isORCandidate = [](SDValue N) {
52225 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52226 };
52227
52228 // Check the zero extend is extending to 32-bit or more. The code generated by
52229 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52230 // instructions to clear the upper bits.
52231 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52232 !isORCandidate(N->getOperand(0)))
52233 return SDValue();
52234
52235 // Check the node matches: setcc(eq, cmp 0)
52236 auto isSetCCCandidate = [](SDValue N) {
52237 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52238 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52239 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52240 isNullConstant(N->getOperand(1).getOperand(1)) &&
52241 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52242 };
52243
52244 SDNode *OR = N->getOperand(0).getNode();
52245 SDValue LHS = OR->getOperand(0);
52246 SDValue RHS = OR->getOperand(1);
52247
52248 // Save nodes matching or(or, setcc(eq, cmp 0)).
52249 SmallVector<SDNode *, 2> ORNodes;
52250 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52251 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52252 ORNodes.push_back(OR);
52253 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52254 LHS = OR->getOperand(0);
52255 RHS = OR->getOperand(1);
52256 }
52257
52258 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52259 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52260 !isORCandidate(SDValue(OR, 0)))
52261 return SDValue();
52262
52263 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
52264 // to
52265 // or(srl(ctlz),srl(ctlz)).
52266 // The dag combiner can then fold it into:
52267 // srl(or(ctlz, ctlz)).
52268 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52269 SDValue Ret, NewRHS;
52270 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52271 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52272
52273 if (!Ret)
52274 return SDValue();
52275
52276 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52277 while (!ORNodes.empty()) {
52278 OR = ORNodes.pop_back_val();
52279 LHS = OR->getOperand(0);
52280 RHS = OR->getOperand(1);
52281 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52282 if (RHS->getOpcode() == ISD::OR)
52283 std::swap(LHS, RHS);
52284 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52285 if (!NewRHS)
52286 return SDValue();
52287 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52288 }
52289
52290 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52291}
52292
52293/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52294/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52295/// with CMP+{ADC, SBB}.
52296/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
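/// For example, with unsigned a and b, 'x + (a < b)' can lower to
/// 'cmp a, b' + 'adc x, 0' instead of materializing the setcc in a register.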
52297static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52298 SDValue X, SDValue Y,
52299 SelectionDAG &DAG,
52300 bool ZeroSecondOpOnly = false) {
52301 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52302 return SDValue();
52303
52304 // Look through a one-use zext.
52305 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52306 Y = Y.getOperand(0);
52307
52308 X86::CondCode CC;
52309 SDValue EFLAGS;
52310 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52311 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52312 EFLAGS = Y.getOperand(1);
52313 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52314 Y.hasOneUse()) {
52315 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52316 }
52317
52318 if (!EFLAGS)
52319 return SDValue();
52320
52321 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52322 // the general case below.
52323 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52324 if (ConstantX && !ZeroSecondOpOnly) {
52325 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52326 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52327 // This is a complicated way to get -1 or 0 from the carry flag:
52328 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52329 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52330 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52331 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52332 EFLAGS);
52333 }
52334
52335 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52336 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52337 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52338 EFLAGS.getValueType().isInteger() &&
52339 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52340 // Swap the operands of a SUB, and we have the same pattern as above.
52341 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52342 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52343 SDValue NewSub = DAG.getNode(
52344 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52345 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52346 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52347 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52348 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52349 NewEFLAGS);
52350 }
52351 }
52352 }
52353
52354 if (CC == X86::COND_B) {
52355 // X + SETB Z --> adc X, 0
52356 // X - SETB Z --> sbb X, 0
52357 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52358 DAG.getVTList(VT, MVT::i32), X,
52359 DAG.getConstant(0, DL, VT), EFLAGS);
52360 }
52361
52362 if (ZeroSecondOpOnly)
52363 return SDValue();
52364
52365 if (CC == X86::COND_A) {
52366 // Try to convert COND_A into COND_B in an attempt to facilitate
52367 // materializing "setb reg".
52368 //
52369 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52370 // cannot take an immediate as its first operand.
52371 //
52372 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52373 EFLAGS.getValueType().isInteger() &&
52374 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52375 SDValue NewSub =
52376 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52377 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52378 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52379 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52380 DAG.getVTList(VT, MVT::i32), X,
52381 DAG.getConstant(0, DL, VT), NewEFLAGS);
52382 }
52383 }
52384
52385 if (CC == X86::COND_AE) {
52386 // X + SETAE --> sbb X, -1
52387 // X - SETAE --> adc X, -1
52388 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52389 DAG.getVTList(VT, MVT::i32), X,
52390 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52391 }
52392
52393 if (CC == X86::COND_BE) {
52394 // X + SETBE --> sbb X, -1
52395 // X - SETBE --> adc X, -1
52396 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52397 // materializing "setae reg".
52398 //
52399 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52400 // cannot take an immediate as its first operand.
52401 //
52402 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52403 EFLAGS.getValueType().isInteger() &&
52404 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52405 SDValue NewSub =
52406 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52407 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52408 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52409 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52410 DAG.getVTList(VT, MVT::i32), X,
52411 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52412 }
52413 }
52414
52415 if (CC != X86::COND_E && CC != X86::COND_NE)
52416 return SDValue();
52417
52418 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52419 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52420 !EFLAGS.getOperand(0).getValueType().isInteger())
52421 return SDValue();
52422
52423 SDValue Z = EFLAGS.getOperand(0);
52424 EVT ZVT = Z.getValueType();
52425
52426 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52427 // the general case below.
52428 if (ConstantX) {
52429 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52430 // fake operands:
52431 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52432 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52433 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52434 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52435 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52436 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52437 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52438 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52439 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52440 SDValue(Neg.getNode(), 1));
52441 }
52442
52443 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52444 // with fake operands:
52445 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52446 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52447 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52448 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52449 SDValue One = DAG.getConstant(1, DL, ZVT);
52450 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52451 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52452 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52453 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52454 Cmp1.getValue(1));
52455 }
52456 }
52457
52458 // (cmp Z, 1) sets the carry flag if Z is 0.
52459 SDValue One = DAG.getConstant(1, DL, ZVT);
52460 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52461 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52462
52463 // Add the flags type for ADC/SBB nodes.
52464 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52465
52466 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52467 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52468 if (CC == X86::COND_NE)
52469 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52470 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52471
52472 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52473 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52474 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52475 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52476}
52477
52478/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52479/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52480/// with CMP+{ADC, SBB}.
52481 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52482 SelectionDAG &DAG) {
52483 bool IsSub = N->getOpcode() == ISD::SUB;
52484 SDValue X = N->getOperand(0);
52485 SDValue Y = N->getOperand(1);
52486 EVT VT = N->getValueType(0);
52487
52488 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52489 return ADCOrSBB;
52490
52491 // Commute and try again (negate the result for subtracts).
52492 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52493 if (IsSub)
52494 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52495 return ADCOrSBB;
52496 }
52497
52498 return SDValue();
52499}
52500
52501static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52502 SDValue N0, SDValue N1,
52503 SelectionDAG &DAG) {
52504 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52505
52506 // Delegate to combineAddOrSubToADCOrSBB if we have:
52507 //
52508 // (xor/or (zero_extend (setcc)) imm)
52509 //
52510 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52511 // equivalent to a SUB/ADD, respectively.
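// For example, with setcc in {0,1}: xor(setcc, 5) == 5 - setcc and
// or(setcc, 4) == 4 + setcc, because an odd immediate only toggles bit 0
// while an even immediate leaves bit 0 free for the setcc.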
52512 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52513 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52514 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52515 bool IsSub = Opc == ISD::XOR;
52516 bool N1COdd = N1C->getZExtValue() & 1;
52517 if (IsSub ? N1COdd : !N1COdd)
52518 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52519 return R;
52520 }
52521 }
52522
52523 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52524 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52525 N0.getOperand(0).getOpcode() == ISD::AND &&
52529 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52530 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52531 N0.getOperand(0).getOperand(1));
52532 }
52533
52534 return SDValue();
52535}
52536
52537 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52538 TargetLowering::DAGCombinerInfo &DCI,
52539 const X86Subtarget &Subtarget) {
52540 SDValue N0 = N->getOperand(0);
52541 SDValue N1 = N->getOperand(1);
52542 EVT VT = N->getValueType(0);
52543 SDLoc dl(N);
52544 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52545
52546 // If this is SSE1 only convert to FOR to avoid scalarization.
52547 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52548 return DAG.getBitcast(MVT::v4i32,
52549 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52550 DAG.getBitcast(MVT::v4f32, N0),
52551 DAG.getBitcast(MVT::v4f32, N1)));
52552 }
52553
52554 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52555 // TODO: Support multiple SrcOps.
52556 if (VT == MVT::i1) {
52557 SmallVector<SDValue, 2> SrcOps;
52558 SmallVector<APInt, 2> SrcPartials;
52559 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52560 SrcOps.size() == 1) {
52561 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52562 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52563 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52564 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52565 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52566 if (Mask) {
52567 assert(SrcPartials[0].getBitWidth() == NumElts &&
52568 "Unexpected partial reduction mask");
52569 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52570 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52571 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52572 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52573 }
52574 }
52575 }
52576
52577 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52578 return SetCC;
52579
52580 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52581 return R;
52582
52583 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52584 return R;
52585
52586 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52587 return R;
52588
52589 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52590 DAG, DCI, Subtarget))
52591 return FPLogic;
52592
52593 if (DCI.isBeforeLegalizeOps())
52594 return SDValue();
52595
52596 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52597 return R;
52598
52599 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52600 return R;
52601
52602 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52603 return R;
52604
52605 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52606 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
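// For example, with C == 3: (0 - SetCC) | 3 is -1 when SetCC == 1 and 3 when
// SetCC == 0; (zext(!SetCC)) * 4 - 1 yields the same two values and the
// multiply-by-4-minus-1 can be folded into an LEA.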
52607 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52608 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52609 uint64_t Val = CN->getZExtValue();
52610 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52611 Val == 8) {
52612 SDValue NotCond;
52613 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52614 N0.getOperand(1).hasOneUse()) {
52615 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52616 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52617 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52618 } else if (N0.getOpcode() == ISD::SUB &&
52619 isNullConstant(N0.getOperand(0))) {
52620 SDValue Cond = N0.getOperand(1);
52621 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52622 Cond = Cond.getOperand(0);
52623 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52624 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52625 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52626 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52627 }
52628 }
52629
52630 if (NotCond) {
52631 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52632 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52633 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52634 return R;
52635 }
52636 }
52637 }
52638 }
52639
52640 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52641 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52642 // iff the upper elements of the non-shifted arg are zero.
52643 // KUNPCK requires 16+ bool vector elements.
52644 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52645 unsigned NumElts = VT.getVectorNumElements();
52646 unsigned HalfElts = NumElts / 2;
52647 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52648 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52649 N1.getConstantOperandAPInt(1) == HalfElts &&
52650 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52651 return DAG.getNode(
52652 ISD::CONCAT_VECTORS, dl, VT,
52653 extractSubVector(N0, 0, DAG, dl, HalfElts),
52654 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52655 }
52656 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52657 N0.getConstantOperandAPInt(1) == HalfElts &&
52658 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52659 return DAG.getNode(
52660 ISD::CONCAT_VECTORS, dl, VT,
52661 extractSubVector(N1, 0, DAG, dl, HalfElts),
52662 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52663 }
52664 }
52665
52666 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52667 // Attempt to recursively combine an OR of shuffles.
52668 SDValue Op(N, 0);
52669 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52670 return Res;
52671
52672 // If either operand is a constant mask, then only the elements that aren't
52673 // allones are actually demanded by the other operand.
52674 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52675 APInt UndefElts;
52676 SmallVector<APInt> EltBits;
52677 int NumElts = VT.getVectorNumElements();
52678 int EltSizeInBits = VT.getScalarSizeInBits();
52679 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52680 return false;
52681
52682 APInt DemandedElts = APInt::getZero(NumElts);
52683 for (int I = 0; I != NumElts; ++I)
52684 if (!EltBits[I].isAllOnes())
52685 DemandedElts.setBit(I);
52686
52687 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52688 };
52689 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52690 if (N->getOpcode() != ISD::DELETED_NODE)
52691 DCI.AddToWorklist(N);
52692 return SDValue(N, 0);
52693 }
52694 }
52695
52696 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52697 return R;
52698
52699 return SDValue();
52700}
52701
52702/// Try to turn tests against the signbit in the form of:
52703/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52704/// into:
52705/// SETGT(X, -1)
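/// For example, for i32: srl(X, 31) is the sign bit, so xor-ing it with 1
/// yields 1 exactly when X is non-negative, i.e. when setgt(X, -1) is true.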
52706 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52707 SelectionDAG &DAG) {
52708 // This is only worth doing if the output type is i8 or i1.
52709 EVT ResultType = N->getValueType(0);
52710 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52711 return SDValue();
52712
52713 SDValue N0 = N->getOperand(0);
52714 SDValue N1 = N->getOperand(1);
52715
52716 // We should be performing an xor against a truncated shift.
52717 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52718 return SDValue();
52719
52720 // Make sure we are performing an xor against one.
52721 if (!isOneConstant(N1))
52722 return SDValue();
52723
52724 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52725 SDValue Shift = N0.getOperand(0);
52726 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52727 return SDValue();
52728
52729 // Make sure we are truncating from one of i16, i32 or i64.
52730 EVT ShiftTy = Shift.getValueType();
52731 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52732 return SDValue();
52733
52734 // Make sure the shift amount extracts the sign bit.
52735 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52736 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52737 return SDValue();
52738
52739 // Create a greater-than comparison against -1.
52740 // N.B. Using SETGE against 0 works but we want a canonical looking
52741 // comparison; using SETGT matches up with what TranslateX86CC does.
52742 SDValue ShiftOp = Shift.getOperand(0);
52743 EVT ShiftOpTy = ShiftOp.getValueType();
52744 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52745 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52746 *DAG.getContext(), ResultType);
52747 SDValue Cond =
52748 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52749 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52750 if (SetCCResultType != ResultType)
52751 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52752 return Cond;
52753}
52754
52755/// Turn vector tests of the signbit in the form of:
52756/// xor (sra X, elt_size(X)-1), -1
52757/// into:
52758/// pcmpgt X, -1
52759///
52760/// This should be called before type legalization because the pattern may not
52761/// persist after that.
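/// For example, for v4i32: sra(X, 31) splats each element's sign bit to 0 or
/// -1; xor-ing that with -1 gives all-ones exactly in the non-negative lanes,
/// which is what pcmpgt(X, -1) computes.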
52762 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52763 const X86Subtarget &Subtarget) {
52764 EVT VT = N->getValueType(0);
52765 if (!VT.isSimple())
52766 return SDValue();
52767
52768 switch (VT.getSimpleVT().SimpleTy) {
52769 // clang-format off
52770 default: return SDValue();
52771 case MVT::v16i8:
52772 case MVT::v8i16:
52773 case MVT::v4i32:
52774 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52775 case MVT::v32i8:
52776 case MVT::v16i16:
52777 case MVT::v8i32:
52778 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52779 // clang-format on
52780 }
52781
52782 // There must be a shift right algebraic before the xor, and the xor must be a
52783 // 'not' operation.
52784 SDValue Shift = N->getOperand(0);
52785 SDValue Ones = N->getOperand(1);
52786 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52787 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52788 return SDValue();
52789
52790 // The shift should be smearing the sign bit across each vector element.
52791 auto *ShiftAmt =
52792 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52793 if (!ShiftAmt ||
52794 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52795 return SDValue();
52796
52797 // Create a greater-than comparison against -1. We don't use the more obvious
52798 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52799 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52800}
52801
52802/// Detect patterns of truncation with unsigned saturation:
52803///
52804/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52805/// Return the source value x to be truncated or SDValue() if the pattern was
52806/// not matched.
52807///
52808/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52809/// where C1 >= 0 and C2 is unsigned max of destination type.
52810///
52811/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52812/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52813///
52814/// These two patterns are equivalent to:
52815/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52816/// So return the smax(x, C1) value to be truncated or SDValue() if the
52817/// pattern was not matched.
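/// For example, an i32 -> i8 unsigned-saturating truncate matches
/// umin(x, 255) feeding the truncate, or smin(smax(x, C1), 255) with C1 >= 0.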
52818 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52819 const SDLoc &DL) {
52820 using namespace llvm::SDPatternMatch;
52821 EVT InVT = In.getValueType();
52822
52823 // Saturation with truncation. We truncate from InVT to VT.
52825 "Unexpected types for truncate operation");
52826
52827 APInt C1, C2;
52828 SDValue UMin, SMin, SMax;
52829
52830 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52831 // the element size of the destination type.
52832 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52833 C2.isMask(VT.getScalarSizeInBits()))
52834 return UMin;
52835
52836 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52837 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52838 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52839 return SMin;
52840
52841 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52842 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52843 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52844 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52845
52846 return SDValue();
52847}
52848
52849/// Detect patterns of truncation with signed saturation:
52850/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52851/// signed_max_of_dest_type)) to dest_type)
52852/// or:
52853/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52854/// signed_min_of_dest_type)) to dest_type).
52855/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52856/// Return the source value to be truncated or SDValue() if the pattern was not
52857/// matched.
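/// For example, an i16 -> i8 signed-saturating truncate matches
/// smin(smax(x, -128), 127); with MatchPackUS the clamp range is [0, 255].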
52858static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52859 using namespace llvm::SDPatternMatch;
52860 unsigned NumDstBits = VT.getScalarSizeInBits();
52861 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52862 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52863
52864 APInt SignedMax, SignedMin;
52865 if (MatchPackUS) {
52866 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52867 SignedMin = APInt::getZero(NumSrcBits);
52868 } else {
52869 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52870 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52871 }
52872
52873 SDValue SMin, SMax;
52874 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52875 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52876 return SMax;
52877
52878 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52879 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52880 return SMin;
52881
52882 return SDValue();
52883}
52884
52885 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52886 SelectionDAG &DAG,
52887 const X86Subtarget &Subtarget) {
52888 if (!Subtarget.hasSSE2() || !VT.isVector())
52889 return SDValue();
52890
52891 EVT SVT = VT.getVectorElementType();
52892 EVT InVT = In.getValueType();
52893 EVT InSVT = InVT.getVectorElementType();
52894
52895 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52896 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52897 // and concatenate at the same time. Then we can use a final vpmovuswb to
52898 // clip to 0-255.
52899 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52900 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52901 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52902 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52903 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52904 DL, DAG, Subtarget);
52905 assert(Mid && "Failed to pack!");
52906 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52907 }
52908 }
52909
52910 // vXi32 truncate instructions are available with AVX512F.
52911 // vXi16 truncate instructions are only available with AVX512BW.
52912 // For 256-bit or smaller vectors, we require VLX.
52913 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52914 // If the result type is 256 bits or larger and we have disabled 512-bit
52915 // registers, we should go ahead and use the pack instructions if possible.
52916 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52917 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52918 (InVT.getSizeInBits() > 128) &&
52919 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52920 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52921
52922 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52924 (SVT == MVT::i8 || SVT == MVT::i16) &&
52925 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52926 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52927 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52928 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52929 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52930 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52931 DAG, Subtarget);
52932 assert(Mid && "Failed to pack!");
52933 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52934 Subtarget);
52935 assert(V && "Failed to pack!");
52936 return V;
52937 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52938 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52939 Subtarget);
52940 }
52941 if (SDValue SSatVal = detectSSatPattern(In, VT))
52942 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52943 Subtarget);
52944 }
52945
52946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52947 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52948 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52949 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52950 unsigned TruncOpc = 0;
52951 SDValue SatVal;
52952 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52953 SatVal = SSatVal;
52954 TruncOpc = X86ISD::VTRUNCS;
52955 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52956 SatVal = USatVal;
52957 TruncOpc = X86ISD::VTRUNCUS;
52958 }
52959 if (SatVal) {
52960 unsigned ResElts = VT.getVectorNumElements();
52961 // If the input type is less than 512 bits and we don't have VLX, we need
52962 // to widen to 512 bits.
52963 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52964 unsigned NumConcats = 512 / InVT.getSizeInBits();
52965 ResElts *= NumConcats;
52966 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52967 ConcatOps[0] = SatVal;
52968 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52969 NumConcats * InVT.getVectorNumElements());
52970 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52971 }
52972 // Widen the result if it's narrower than 128 bits.
52973 if (ResElts * SVT.getSizeInBits() < 128)
52974 ResElts = 128 / SVT.getSizeInBits();
52975 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52976 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52977 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52978 DAG.getVectorIdxConstant(0, DL));
52979 }
52980 }
52981
52982 return SDValue();
52983}
52984
52985 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52986 SelectionDAG &DAG,
52987 TargetLowering::DAGCombinerInfo &DCI,
52988 const X86Subtarget &Subtarget) {
52989 auto *Ld = cast<LoadSDNode>(N);
52990 EVT RegVT = Ld->getValueType(0);
52991 SDValue Ptr = Ld->getBasePtr();
52992 SDValue Chain = Ld->getChain();
52993 ISD::LoadExtType Ext = Ld->getExtensionType();
52994
52995 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52996 return SDValue();
52997
52998 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52999 return SDValue();
53000
53001 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53002 if (!LdC)
53003 return SDValue();
53004
53005 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53006 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53007 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53008 if (Undefs[I])
53009 continue;
53010 if (UserUndefs[I] || Bits[I] != UserBits[I])
53011 return false;
53012 }
53013 return true;
53014 };
53015
53016 // Look through all other loads/broadcasts in the chain for another constant
53017 // pool entry.
53018 for (SDNode *User : Chain->users()) {
53019 auto *UserLd = dyn_cast<MemSDNode>(User);
53020 if (User != N && UserLd &&
53021 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53022 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53023 ISD::isNormalLoad(User)) &&
53024 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53025 User->getValueSizeInBits(0).getFixedValue() >
53026 RegVT.getFixedSizeInBits()) {
53027 EVT UserVT = User->getValueType(0);
53028 SDValue UserPtr = UserLd->getBasePtr();
53029 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53030
53031 // See if we are loading a constant that matches in the lower
53032 // bits of a longer constant (but from a different constant pool ptr).
53033 if (UserC && UserPtr != Ptr) {
53034 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53035 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53036 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53037 APInt Undefs, UserUndefs;
53038 SmallVector<APInt> Bits, UserBits;
53039 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53040 UserVT.getScalarSizeInBits());
53041 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53042 Bits) &&
53043 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53044 UserUndefs, UserBits)) {
53045 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53047 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53048 RegVT.getSizeInBits());
53049 Extract = DAG.getBitcast(RegVT, Extract);
53050 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53051 }
53052 }
53053 }
53054 }
53055 }
53056 }
53057
53058 return SDValue();
53059}
53060
53061 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53062 TargetLowering::DAGCombinerInfo &DCI,
53063 const X86Subtarget &Subtarget) {
53064 auto *Ld = cast<LoadSDNode>(N);
53065 EVT RegVT = Ld->getValueType(0);
53066 EVT MemVT = Ld->getMemoryVT();
53067 SDLoc dl(Ld);
53068 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53069
53070 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53071 // into two 16-byte operations. Also split non-temporal aligned loads on
53072 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53073 ISD::LoadExtType Ext = Ld->getExtensionType();
53074 unsigned Fast;
53075 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53076 Ext == ISD::NON_EXTLOAD &&
53077 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53078 Ld->getAlign() >= Align(16)) ||
53079 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53080 *Ld->getMemOperand(), &Fast) &&
53081 !Fast))) {
53082 unsigned NumElems = RegVT.getVectorNumElements();
53083 if (NumElems < 2)
53084 return SDValue();
53085
53086 unsigned HalfOffset = 16;
53087 SDValue Ptr1 = Ld->getBasePtr();
53088 SDValue Ptr2 =
53089 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53090 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53091 NumElems / 2);
53092 SDValue Load1 =
53093 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53094 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53095 SDValue Load2 =
53096 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53097 Ld->getPointerInfo().getWithOffset(HalfOffset),
53098 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53099 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53100 Load1.getValue(1), Load2.getValue(1));
53101
53102 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53103 return DCI.CombineTo(N, NewVec, TF, true);
53104 }
53105
53106 // Bool vector load - attempt to cast to an integer, as we have good
53107 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53108 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53109 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53110 unsigned NumElts = RegVT.getVectorNumElements();
53111 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53112 if (TLI.isTypeLegal(IntVT)) {
53113 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53114 Ld->getPointerInfo(), Ld->getBaseAlign(),
53115 Ld->getMemOperand()->getFlags());
53116 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53117 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53118 }
53119 }
53120
53121 // If we also broadcast this vector to a wider type, then just extract the
53122 // lowest subvector.
53123 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53124 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53125 SDValue Ptr = Ld->getBasePtr();
53126 SDValue Chain = Ld->getChain();
53127 for (SDNode *User : Chain->users()) {
53128 auto *UserLd = dyn_cast<MemSDNode>(User);
53129 if (User != N && UserLd &&
53130 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53131 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53132 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53133 User->hasAnyUseOfValue(0) &&
53134 User->getValueSizeInBits(0).getFixedValue() >
53135 RegVT.getFixedSizeInBits()) {
53137 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53138 RegVT.getSizeInBits());
53139 Extract = DAG.getBitcast(RegVT, Extract);
53140 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53141 }
53142 }
53143 }
53144
53145 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53146 return V;
53147
53148 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53149 unsigned AddrSpace = Ld->getAddressSpace();
53150 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53151 AddrSpace == X86AS::PTR32_UPTR) {
53152 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53153 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53154 SDValue Cast =
53155 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53156 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53157 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53158 Ld->getMemOperand()->getFlags());
53159 }
53160 }
53161
53162 return SDValue();
53163}
53164
53165/// If V is a build vector of boolean constants and exactly one of those
53166/// constants is true, return the operand index of that true element.
53167/// Otherwise, return -1.
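/// For example, a v4i1 mask of <0,0,1,0> returns 2, while <0,1,1,0> and
/// <0,0,0,0> both return -1.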
53168static int getOneTrueElt(SDValue V) {
53169 // This needs to be a build vector of booleans.
53170 // TODO: Checking for the i1 type matches the IR definition for the mask,
53171 // but the mask check could be loosened to i8 or other types. That might
53172 // also require checking more than 'allOnesValue'; eg, the x86 HW
53173 // instructions only require that the MSB is set for each mask element.
53174 // The ISD::MSTORE comments/definition do not specify how the mask operand
53175 // is formatted.
53176 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53177 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53178 return -1;
53179
53180 int TrueIndex = -1;
53181 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53182 for (unsigned i = 0; i < NumElts; ++i) {
53183 const SDValue &Op = BV->getOperand(i);
53184 if (Op.isUndef())
53185 continue;
53186 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53187 if (!ConstNode)
53188 return -1;
53189 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53190 // If we already found a one, this is too many.
53191 if (TrueIndex >= 0)
53192 return -1;
53193 TrueIndex = i;
53194 }
53195 }
53196 return TrueIndex;
53197}
53198
53199/// Given a masked memory load/store operation, return true if it has one mask
53200/// bit set. If it has one mask bit set, then also return the memory address of
53201/// the scalar element to load/store, the vector index to insert/extract that
53202/// scalar element, and the alignment for the scalar memory access.
53203 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53204 SelectionDAG &DAG, SDValue &Addr,
53205 SDValue &Index, Align &Alignment,
53206 unsigned &Offset) {
53207 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53208 if (TrueMaskElt < 0)
53209 return false;
53210
53211 // Get the address of the one scalar element that is specified by the mask
53212 // using the appropriate offset from the base pointer.
53213 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53214 Offset = 0;
53215 Addr = MaskedOp->getBasePtr();
53216 if (TrueMaskElt != 0) {
53217 Offset = TrueMaskElt * EltVT.getStoreSize();
53218 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53219 SDLoc(MaskedOp));
53220 }
53221
53222 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53223 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53224 return true;
53225}
53226
53227/// If exactly one element of the mask is set for a non-extending masked load,
53228/// it is a scalar load and vector insert.
53229/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53230/// mask have already been optimized in IR, so we don't bother with those here.
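/// For example, a masked v4f32 load with mask <0,0,1,0> becomes a scalar load
/// of element 2 inserted into the pass-through vector at index 2.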
53231static SDValue
53232 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53233 TargetLowering::DAGCombinerInfo &DCI,
53234 const X86Subtarget &Subtarget) {
53235 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53236 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53237 // However, some target hooks may need to be added to know when the transform
53238 // is profitable. Endianness would also have to be considered.
53239
53240 SDValue Addr, VecIndex;
53241 Align Alignment;
53242 unsigned Offset;
53243 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53244 return SDValue();
53245
53246 // Load the one scalar element that is specified by the mask using the
53247 // appropriate offset from the base pointer.
53248 SDLoc DL(ML);
53249 EVT VT = ML->getValueType(0);
53250 EVT EltVT = VT.getVectorElementType();
53251
53252 EVT CastVT = VT;
53253 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53254 EltVT = MVT::f64;
53255 CastVT = VT.changeVectorElementType(EltVT);
53256 }
53257
53258 SDValue Load =
53259 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53260 ML->getPointerInfo().getWithOffset(Offset),
53261 Alignment, ML->getMemOperand()->getFlags());
53262
53263 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53264
53265 // Insert the loaded element into the appropriate place in the vector.
53266 SDValue Insert =
53267 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53268 Insert = DAG.getBitcast(VT, Insert);
53269 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53270}
53271
53272static SDValue
53273 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53274 TargetLowering::DAGCombinerInfo &DCI) {
53275 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53276 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53277 return SDValue();
53278
53279 SDLoc DL(ML);
53280 EVT VT = ML->getValueType(0);
53281
53282 // If we are loading the first and last elements of a vector, it is safe and
53283 // always faster to load the whole vector. Replace the masked load with a
53284 // vector load and select.
53285 unsigned NumElts = VT.getVectorNumElements();
53286 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53287 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53288 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53289 if (LoadFirstElt && LoadLastElt) {
53290 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53291 ML->getMemOperand());
53292 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53293 ML->getPassThru());
53294 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53295 }
53296
53297 // Convert a masked load with a constant mask into a masked load and a select.
53298 // This allows the select operation to use a faster kind of select instruction
53299 // (for example, vblendvps -> vblendps).
53300
53301 // Don't try this if the pass-through operand is already undefined. That would
53302 // cause an infinite loop because that's what we're about to create.
53303 if (ML->getPassThru().isUndef())
53304 return SDValue();
53305
53306 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53307 return SDValue();
53308
53309 // The new masked load has an undef pass-through operand. The select uses the
53310 // original pass-through operand.
53311 SDValue NewML = DAG.getMaskedLoad(
53312 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53313 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53314 ML->getAddressingMode(), ML->getExtensionType());
53315 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53316 ML->getPassThru());
53317
53318 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53319}
53320
53321 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53322 TargetLowering::DAGCombinerInfo &DCI,
53323 const X86Subtarget &Subtarget) {
53324 auto *Mld = cast<MaskedLoadSDNode>(N);
53325
53326 // TODO: Expanding load with constant mask may be optimized as well.
53327 if (Mld->isExpandingLoad())
53328 return SDValue();
53329
53330 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53331 if (SDValue ScalarLoad =
53332 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53333 return ScalarLoad;
53334
53335 // TODO: Do some AVX512 subsets benefit from this transform?
53336 if (!Subtarget.hasAVX512())
53337 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53338 return Blend;
53339 }
53340
53341 // If the mask value has been legalized to a non-boolean vector, try to
53342 // simplify ops leading up to it. We only demand the MSB of each lane.
53343 SDValue Mask = Mld->getMask();
53344 if (Mask.getScalarValueSizeInBits() != 1) {
53345 EVT VT = Mld->getValueType(0);
53346 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53347 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53348 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53349 if (N->getOpcode() != ISD::DELETED_NODE)
53350 DCI.AddToWorklist(N);
53351 return SDValue(N, 0);
53352 }
53353 if (SDValue NewMask =
53354 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53355 return DAG.getMaskedLoad(
53356 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53357 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53358 Mld->getAddressingMode(), Mld->getExtensionType());
53359 }
53360
53361 return SDValue();
53362}
53363
53364/// If exactly one element of the mask is set for a non-truncating masked store,
53365/// it is a vector extract and scalar store.
53366/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53367/// mask have already been optimized in IR, so we don't bother with those here.
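/// For example, a masked v4i32 store with mask <0,1,0,0> becomes an
/// extract_vector_elt of element 1 followed by a plain scalar store.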
53368 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53369 SelectionDAG &DAG,
53370 const X86Subtarget &Subtarget) {
53371 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53372 // However, some target hooks may need to be added to know when the transform
53373 // is profitable. Endianness would also have to be considered.
53374
53375 SDValue Addr, VecIndex;
53376 Align Alignment;
53377 unsigned Offset;
53378 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53379 return SDValue();
53380
53381 // Extract the one scalar element that is actually being stored.
53382 SDLoc DL(MS);
53383 SDValue Value = MS->getValue();
53384 EVT VT = Value.getValueType();
53385 EVT EltVT = VT.getVectorElementType();
53386 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53387 EltVT = MVT::f64;
53388 EVT CastVT = VT.changeVectorElementType(EltVT);
53389 Value = DAG.getBitcast(CastVT, Value);
53390 }
53391 SDValue Extract =
53392 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53393
53394 // Store that element at the appropriate offset from the base pointer.
53395 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53397 Alignment, MS->getMemOperand()->getFlags());
53398}
53399
53400 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53401 TargetLowering::DAGCombinerInfo &DCI,
53402 const X86Subtarget &Subtarget) {
53403 auto *Mst = cast<MaskedStoreSDNode>(N);
53404 if (Mst->isCompressingStore())
53405 return SDValue();
53406
53407 EVT VT = Mst->getValue().getValueType();
53408 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53409
53410 if (Mst->isTruncatingStore())
53411 return SDValue();
53412
53413 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53414 return ScalarStore;
53415
53416 // If the mask value has been legalized to a non-boolean vector, try to
53417 // simplify ops leading up to it. We only demand the MSB of each lane.
53418 SDValue Mask = Mst->getMask();
53419 if (Mask.getScalarValueSizeInBits() != 1) {
53420 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53421 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53422 if (N->getOpcode() != ISD::DELETED_NODE)
53423 DCI.AddToWorklist(N);
53424 return SDValue(N, 0);
53425 }
53426 if (SDValue NewMask =
53427 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53428 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53429 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53430 Mst->getMemoryVT(), Mst->getMemOperand(),
53431 Mst->getAddressingMode());
53432 }
53433
53434 SDValue Value = Mst->getValue();
53435 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53436 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53437 Mst->getMemoryVT())) {
53438 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53439 Mst->getBasePtr(), Mst->getOffset(), Mask,
53440 Mst->getMemoryVT(), Mst->getMemOperand(),
53441 Mst->getAddressingMode(), true);
53442 }
53443
53444 return SDValue();
53445}
53446
53447 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53448 TargetLowering::DAGCombinerInfo &DCI,
53449 const X86Subtarget &Subtarget) {
53450 auto *St = cast<StoreSDNode>(N);
53451 EVT StVT = St->getMemoryVT();
53452 SDLoc dl(St);
53453 SDValue StoredVal = St->getValue();
53454 EVT VT = StoredVal.getValueType();
53455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53456
53457 // Convert a store of vXi1 into a store of iX and a bitcast.
53458 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53459 VT.getVectorElementType() == MVT::i1) {
53460
53461 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53462 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53463
53464 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53465 St->getPointerInfo(), St->getBaseAlign(),
53466 St->getMemOperand()->getFlags());
53467 }
53468
53469 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53470 // This will avoid a copy to k-register.
53471 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53472 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53473 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53474 SDValue Val = StoredVal.getOperand(0);
53475 // We must store zeros to the unused bits.
53476 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53477 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53478 St->getPointerInfo(), St->getBaseAlign(),
53479 St->getMemOperand()->getFlags());
53480 }
53481
53482 // Widen v2i1/v4i1 stores to v8i1.
53483 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53484 Subtarget.hasAVX512()) {
53485 unsigned NumConcats = 8 / VT.getVectorNumElements();
53486 // We must store zeros to the unused bits.
53487 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53488 Ops[0] = StoredVal;
53489 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53490 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53491 St->getPointerInfo(), St->getBaseAlign(),
53492 St->getMemOperand()->getFlags());
53493 }
53494
53495 // Turn vXi1 stores of constants into a scalar store.
53496 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53497 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53498 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53499 // If it's a v64i1 store without 64-bit support, we need two stores.
53500 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53501 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53502 StoredVal->ops().slice(0, 32));
53503 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53504 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53505 StoredVal->ops().slice(32, 32));
53506 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53507
53508 SDValue Ptr0 = St->getBasePtr();
53509 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53510
53511 SDValue Ch0 =
53512 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53513 St->getBaseAlign(), St->getMemOperand()->getFlags());
53514 SDValue Ch1 = DAG.getStore(
53515 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53516 St->getBaseAlign(), St->getMemOperand()->getFlags());
53517 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53518 }
53519
53520 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53521 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53522 St->getPointerInfo(), St->getBaseAlign(),
53523 St->getMemOperand()->getFlags());
53524 }
53525
53526 // Convert scalar fabs/fneg load-store to integer equivalents.
53527 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53528 (StoredVal.getOpcode() == ISD::FABS ||
53529 StoredVal.getOpcode() == ISD::FNEG) &&
53530 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53531 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53532 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53533 if (TLI.isTypeLegal(IntVT)) {
53534 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53535 unsigned SignOp = ISD::XOR;
53536 if (StoredVal.getOpcode() == ISD::FABS) {
53537 SignMask = ~SignMask;
53538 SignOp = ISD::AND;
53539 }
53540 SDValue LogicOp = DAG.getNode(
53541 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53542 DAG.getConstant(SignMask, dl, IntVT));
53543 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53544 St->getPointerInfo(), St->getBaseAlign(),
53545 St->getMemOperand()->getFlags());
53546 }
53547 }
53548
53549 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53550 // Sandy Bridge, perform two 16-byte stores.
53551 unsigned Fast;
53552 if (VT.is256BitVector() && StVT == VT &&
53553 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53554 *St->getMemOperand(), &Fast) &&
53555 !Fast) {
53556 unsigned NumElems = VT.getVectorNumElements();
53557 if (NumElems < 2)
53558 return SDValue();
53559
53560 return splitVectorStore(St, DAG);
53561 }
53562
53563 // Split under-aligned vector non-temporal stores.
53564 if (St->isNonTemporal() && StVT == VT &&
53565 St->getAlign().value() < VT.getStoreSize()) {
53566 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53567 // vectors or the legalizer can scalarize it to use MOVNTI.
53568 if (VT.is256BitVector() || VT.is512BitVector()) {
53569 unsigned NumElems = VT.getVectorNumElements();
53570 if (NumElems < 2)
53571 return SDValue();
53572 return splitVectorStore(St, DAG);
53573 }
53574
53575 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53576 // to use MOVNTI.
53577 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53578 MVT NTVT = Subtarget.hasSSE4A()
53579 ? MVT::v2f64
53580 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53581 return scalarizeVectorStore(St, NTVT, DAG);
53582 }
53583 }
53584
53585 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53586 // supported, but avx512f is by extending to v16i32 and truncating.
53587 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53588 St->getValue().getOpcode() == ISD::TRUNCATE &&
53589 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53590 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53591 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53592 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53593 St->getValue().getOperand(0));
53594 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53595 MVT::v16i8, St->getMemOperand());
53596 }
53597
53598 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53599 if (!St->isTruncatingStore() &&
53600 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53601 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53602 StoredVal.hasOneUse() &&
53603 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53604 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53605 return EmitTruncSStore(IsSigned, St->getChain(),
53606 dl, StoredVal.getOperand(0), St->getBasePtr(),
53607 VT, St->getMemOperand(), DAG);
53608 }
53609
53610 // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
53611 if (!St->isTruncatingStore()) {
53612 auto IsExtractedElement = [](SDValue V) {
53613 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53614 V = V.getOperand(0);
53615 unsigned Opc = V.getOpcode();
53616 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53617 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53618 V.getOperand(0).hasOneUse())
53619 return V.getOperand(0);
53620 return SDValue();
53621 };
53622 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53623 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53624 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53625 SDValue Src = Trunc.getOperand(0);
53626 MVT DstVT = Trunc.getSimpleValueType();
53627 MVT SrcVT = Src.getSimpleValueType();
53628 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53629 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53630 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53631 if (NumTruncBits == VT.getSizeInBits() &&
53632 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53633 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53634 TruncVT, St->getMemOperand());
53635 }
53636 }
53637 }
53638 }
53639
53640 // Optimize trunc store (of multiple scalars) to shuffle and store.
53641 // First, pack all of the elements in one place. Next, store to memory
53642 // in fewer chunks.
53643 if (St->isTruncatingStore() && VT.isVector()) {
53644 if (TLI.isTruncStoreLegal(VT, StVT)) {
53645 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53646 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53647 dl, Val, St->getBasePtr(),
53648 St->getMemoryVT(), St->getMemOperand(), DAG);
53649 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53650 DAG, dl))
53651 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53652 dl, Val, St->getBasePtr(),
53653 St->getMemoryVT(), St->getMemOperand(), DAG);
53654 }
53655
53656 return SDValue();
53657 }
53658
53659 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53660 unsigned AddrSpace = St->getAddressSpace();
53661 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53662 AddrSpace == X86AS::PTR32_UPTR) {
53663 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53664 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53665 SDValue Cast =
53666 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53667 return DAG.getTruncStore(
53668 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53669 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53670 }
53671 }
53672
53673 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53674 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
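  // This relies on the CF (conditional faulting) feature; the load and store
  // must be simple accesses to the same address so the cstore can replace them.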
53675 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53676 Subtarget.hasCF() && St->isSimple()) {
53677 SDValue Cmov;
53678 if (StoredVal.getOpcode() == X86ISD::CMOV)
53679 Cmov = StoredVal;
53680 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53681 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53682 Cmov = StoredVal.getOperand(0);
53683 else
53684 return SDValue();
53685
53686 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53687 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53688 return SDValue();
53689
53690 bool InvertCC = false;
53691 SDValue V = SDValue(Ld, 0);
53692 if (V == Cmov.getOperand(1))
53693 InvertCC = true;
53694 else if (V != Cmov.getOperand(0))
53695 return SDValue();
53696
53697 SDVTList Tys = DAG.getVTList(MVT::Other);
53698 SDValue CC = Cmov.getOperand(2);
53699 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53700 if (InvertCC)
53701 CC = DAG.getTargetConstant(
53704 dl, MVT::i8);
53705 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53706 Cmov.getOperand(3)};
53707 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53708 St->getMemOperand());
53709 }
53710
53711 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53712 // the FP state in cases where an emms may be missing.
53713 // A preferable solution to the general problem is to figure out the right
53714 // places to insert EMMS. This qualifies as a quick hack.
53715
53716 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53717 if (VT.getSizeInBits() != 64)
53718 return SDValue();
53719
53720 const Function &F = DAG.getMachineFunction().getFunction();
53721 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53722 bool F64IsLegal =
53723 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53724
53725 if (!F64IsLegal || Subtarget.is64Bit())
53726 return SDValue();
53727
53728 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53729 cast<LoadSDNode>(St->getValue())->isSimple() &&
53730 St->getChain().hasOneUse() && St->isSimple()) {
53731 auto *Ld = cast<LoadSDNode>(St->getValue());
53732
53733 if (!ISD::isNormalLoad(Ld))
53734 return SDValue();
53735
53736 // Avoid the transformation if there are multiple uses of the loaded value.
53737 if (!Ld->hasNUsesOfValue(1, 0))
53738 return SDValue();
53739
53740 SDLoc LdDL(Ld);
53741 SDLoc StDL(N);
53742
53743 // Remove any range metadata as we're converting to f64 load/store.
53744 Ld->getMemOperand()->clearRanges();
53745
53746 // Lower to a single movq load/store pair.
53747 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53748 Ld->getBasePtr(), Ld->getMemOperand());
53749
53750 // Make sure new load is placed in same chain order.
53751 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53752 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53753 St->getMemOperand());
53754 }
53755
53756 // This is similar to the above case, but here we handle a scalar 64-bit
53757 // integer store that is extracted from a vector on a 32-bit target.
53758 // If we have SSE2, then we can treat it like a floating-point double
53759 // to get past legalization. The execution dependencies fixup pass will
53760 // choose the optimal machine instruction for the store if this really is
53761 // an integer or v2f32 rather than an f64.
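  // e.g. (store i64 (extractelt X:v2i64, 0), p)
  //        -> (store f64 (extractelt (bitcast X to v2f64), 0), p)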
53762   if (VT == MVT::i64 &&
53763       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53764     SDValue OldExtract = St->getOperand(1);
53765 SDValue ExtOp0 = OldExtract.getOperand(0);
53766 unsigned VecSize = ExtOp0.getValueSizeInBits();
53767 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53768 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53769 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53770 BitCast, OldExtract.getOperand(1));
53771 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53772 St->getPointerInfo(), St->getBaseAlign(),
53773 St->getMemOperand()->getFlags());
53774 }
53775
53776 return SDValue();
53777}
53778
53779 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53780                                      TargetLowering::DAGCombinerInfo &DCI,
53781                                      const X86Subtarget &Subtarget) {
53782 auto *St = cast<MemIntrinsicSDNode>(N);
53783
53784 SDValue StoredVal = N->getOperand(1);
53785 MVT VT = StoredVal.getSimpleValueType();
53786 EVT MemVT = St->getMemoryVT();
53787
53788 // Figure out which elements we demand.
53789 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53790 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53791
53792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53793 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53794 if (N->getOpcode() != ISD::DELETED_NODE)
53795 DCI.AddToWorklist(N);
53796 return SDValue(N, 0);
53797 }
53798
53799 return SDValue();
53800}
53801
53802/// Return 'true' if this vector operation is "horizontal"
53803/// and return the operands for the horizontal operation in LHS and RHS. A
53804/// horizontal operation performs the binary operation on successive elements
53805/// of its first operand, then on successive elements of its second operand,
53806/// returning the resulting values in a vector. For example, if
53807/// A = < float a0, float a1, float a2, float a3 >
53808/// and
53809/// B = < float b0, float b1, float b2, float b3 >
53810/// then the result of doing a horizontal operation on A and B is
53811/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53812/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53813/// A horizontal-op B, for some already available A and B, and if so then LHS is
53814/// set to A, RHS to B, and the routine returns 'true'.
53815static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53816 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53817 bool IsCommutative,
53818 SmallVectorImpl<int> &PostShuffleMask,
53819 bool ForceHorizOp) {
53820 // If either operand is undef, bail out. The binop should be simplified.
53821 if (LHS.isUndef() || RHS.isUndef())
53822 return false;
53823
53824 // Look for the following pattern:
53825 // A = < float a0, float a1, float a2, float a3 >
53826 // B = < float b0, float b1, float b2, float b3 >
53827 // and
53828 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53829 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53830 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53831 // which is A horizontal-op B.
53832
53833 MVT VT = LHS.getSimpleValueType();
53834 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53835 "Unsupported vector type for horizontal add/sub");
53836 unsigned NumElts = VT.getVectorNumElements();
53837
53838 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53839 SmallVectorImpl<int> &ShuffleMask) {
53840 bool UseSubVector = false;
53841 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53842 Op.getOperand(0).getValueType().is256BitVector() &&
53843 llvm::isNullConstant(Op.getOperand(1))) {
53844 Op = Op.getOperand(0);
53845 UseSubVector = true;
53846 }
53847     SmallVector<SDValue, 2> SrcOps;
53848     SmallVector<int, 16> SrcMask, ScaledMask;
53849     SDValue BC = peekThroughBitcasts(Op);
53850     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53851 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53852 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53853 })) {
53854 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53855 if (!UseSubVector && SrcOps.size() <= 2 &&
53856 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53857 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53858 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53859 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53860 }
53861 if (UseSubVector && SrcOps.size() == 1 &&
53862 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53863 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53864 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53865 ShuffleMask.assign(Mask.begin(), Mask.end());
53866 }
53867 }
53868 };
53869
53870 // View LHS in the form
53871 // LHS = VECTOR_SHUFFLE A, B, LMask
53872 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53873 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53874 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53875 SDValue A, B;
53876   SmallVector<int, 16> LMask;
53877   GetShuffle(LHS, A, B, LMask);
53878
53879 // Likewise, view RHS in the form
53880 // RHS = VECTOR_SHUFFLE C, D, RMask
53881 SDValue C, D;
53882   SmallVector<int, 16> RMask;
53883   GetShuffle(RHS, C, D, RMask);
53884
53885 // At least one of the operands should be a vector shuffle.
53886 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53887 if (NumShuffles == 0)
53888 return false;
53889
53890 if (LMask.empty()) {
53891 A = LHS;
53892 for (unsigned i = 0; i != NumElts; ++i)
53893 LMask.push_back(i);
53894 }
53895
53896 if (RMask.empty()) {
53897 C = RHS;
53898 for (unsigned i = 0; i != NumElts; ++i)
53899 RMask.push_back(i);
53900 }
53901
53902   // If we have a unary mask, ensure the other op is set to null.
53903 if (isUndefOrInRange(LMask, 0, NumElts))
53904 B = SDValue();
53905 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53906 A = SDValue();
53907
53908 if (isUndefOrInRange(RMask, 0, NumElts))
53909 D = SDValue();
53910 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53911 C = SDValue();
53912
53913 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53914 // RHS operands and shuffle mask.
53915 if (A != C) {
53916     std::swap(C, D);
53917     ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
53918   }
53919 // Check that the shuffles are both shuffling the same vectors.
53920 if (!(A == C && B == D))
53921 return false;
53922
53923 PostShuffleMask.clear();
53924 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53925
53926 // LHS and RHS are now:
53927 // LHS = shuffle A, B, LMask
53928 // RHS = shuffle A, B, RMask
53929 // Check that the masks correspond to performing a horizontal operation.
53930 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53931 // so we just repeat the inner loop if this is a 256-bit op.
53932 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53933 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53934 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53935 assert((NumEltsPer128BitChunk % 2 == 0) &&
53936 "Vector type should have an even number of elements in each lane");
53937 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53938 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53939 // Ignore undefined components.
53940 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53941 if (LIdx < 0 || RIdx < 0 ||
53942 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53943 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53944 continue;
53945
53946 // Check that successive odd/even elements are being operated on. If not,
53947 // this is not a horizontal operation.
53948 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53949 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53950 return false;
53951
53952 // Compute the post-shuffle mask index based on where the element
53953 // is stored in the HOP result, and where it needs to be moved to.
53954 int Base = LIdx & ~1u;
53955 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53956 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53957
53958 // The low half of the 128-bit result must choose from A.
53959 // The high half of the 128-bit result must choose from B,
53960 // unless B is undef. In that case, we are always choosing from A.
53961 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53962 Index += NumEltsPer64BitChunk;
53963 PostShuffleMask[i + j] = Index;
53964 }
53965 }
53966
53967 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53968 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53969
53970 bool IsIdentityPostShuffle =
53971 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53972 if (IsIdentityPostShuffle)
53973 PostShuffleMask.clear();
53974
53975 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53976 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53977 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53978 return false;
53979
53980 // If the source nodes are already used in HorizOps then always accept this.
53981 // Shuffle folding should merge these back together.
53982 auto FoundHorizUser = [&](SDNode *User) {
53983 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53984 };
53985 ForceHorizOp =
53986 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53987 llvm::any_of(NewRHS->users(), FoundHorizUser));
53988
53989 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53990 // shuffle the result.
53991 if (!ForceHorizOp &&
53992 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53993 (NumShuffles < 2 || !IsIdentityPostShuffle),
53994 DAG, Subtarget))
53995 return false;
53996
53997 LHS = DAG.getBitcast(VT, NewLHS);
53998 RHS = DAG.getBitcast(VT, NewRHS);
53999 return true;
54000}
54001
54002// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54003 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54004                                          const X86Subtarget &Subtarget) {
54005 EVT VT = N->getValueType(0);
54006 unsigned Opcode = N->getOpcode();
54007 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54008 SmallVector<int, 8> PostShuffleMask;
54009
54010 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54011 return N->hasOneUse() &&
54012 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54013 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54014 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54015 };
54016
54017 switch (Opcode) {
54018 case ISD::FADD:
54019 case ISD::FSUB:
54020 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54021 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54022 SDValue LHS = N->getOperand(0);
54023 SDValue RHS = N->getOperand(1);
54024 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54025 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54026 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54027 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54028 if (!PostShuffleMask.empty())
54029 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54030 DAG.getUNDEF(VT), PostShuffleMask);
54031 return HorizBinOp;
54032 }
54033 }
54034 break;
54035 case ISD::ADD:
54036 case ISD::SUB:
54037 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54038 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54039 SDValue LHS = N->getOperand(0);
54040 SDValue RHS = N->getOperand(1);
54041 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54042 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54043 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54044 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54045                                         ArrayRef<SDValue> Ops) {
54046           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54047 };
54048 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54049 {LHS, RHS}, HOpBuilder);
54050 if (!PostShuffleMask.empty())
54051 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54052 DAG.getUNDEF(VT), PostShuffleMask);
54053 return HorizBinOp;
54054 }
54055 }
54056 break;
54057 }
54058
54059 return SDValue();
54060}
54061
54062// Try to combine the following nodes
54063// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54064// <i32 -2147483648[float -0.000000e+00]> 0
54065// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54066// <(load 4 from constant-pool)> t0, t29
54067// [t30: v16i32 = bitcast t27]
54068// t6: v16i32 = xor t7, t27[t30]
54069// t11: v16f32 = bitcast t6
54070// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54071// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54072// t22: v16f32 = bitcast t7
54073// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54074// t24: v32f16 = bitcast t23
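// i.e. the XOR with 0x80000000 in each 32-bit lane conjugates the packed
// complex fp16 value, so it can be absorbed by swapping VFMULC <-> VFCMULC.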
54076 const X86Subtarget &Subtarget) {
54077 EVT VT = N->getValueType(0);
54078 SDValue LHS = N->getOperand(0);
54079 SDValue RHS = N->getOperand(1);
54080 int CombineOpcode =
54081 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54082 auto combineConjugation = [&](SDValue &r) {
54083 if (LHS->getOpcode() == ISD::BITCAST) {
54084 SDValue XOR = LHS.getOperand(0);
54085 if (XOR->getOpcode() == ISD::XOR) {
54086 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54087 if (XORRHS.isConstant()) {
54088 APInt ConjugationInt32 = APInt(32, 0x80000000);
54089 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54090 if ((XORRHS.getBitWidth() == 32 &&
54091 XORRHS.getConstant() == ConjugationInt32) ||
54092 (XORRHS.getBitWidth() == 64 &&
54093 XORRHS.getConstant() == ConjugationInt64)) {
54094 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54095 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54096 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54097 r = DAG.getBitcast(VT, FCMulC);
54098 return true;
54099 }
54100 }
54101 }
54102 }
54103 return false;
54104 };
54105 SDValue Res;
54106 if (combineConjugation(Res))
54107 return Res;
54108 std::swap(LHS, RHS);
54109 if (combineConjugation(Res))
54110 return Res;
54111 return Res;
54112}
54113
54114// Try to combine the following nodes:
54115// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54116 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54117                                 const X86Subtarget &Subtarget) {
54118 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54119     return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54120            Flags.hasAllowContract();
54121 };
54122
54123 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54124 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54125 Flags.hasNoSignedZeros();
54126 };
54127 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54128 APInt AI = APInt(32, 0x80008000);
54129 KnownBits Bits = DAG.computeKnownBits(Op);
54130 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54131 Bits.getConstant() == AI;
54132 };
54133
54134 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54135 !AllowContract(N->getFlags()))
54136 return SDValue();
54137
54138 EVT VT = N->getValueType(0);
54139 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54140 return SDValue();
54141
54142 SDValue LHS = N->getOperand(0);
54143 SDValue RHS = N->getOperand(1);
54144 bool IsConj;
54145 SDValue FAddOp1, MulOp0, MulOp1;
54146 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54147 &IsVectorAllNegativeZero,
54148 &HasNoSignedZero](SDValue N) -> bool {
54149 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54150 return false;
54151 SDValue Op0 = N.getOperand(0);
54152 unsigned Opcode = Op0.getOpcode();
54153 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54154 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54155 MulOp0 = Op0.getOperand(0);
54156 MulOp1 = Op0.getOperand(1);
54157 IsConj = Opcode == X86ISD::VFCMULC;
54158 return true;
54159 }
54160 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54161         ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54162            HasNoSignedZero(Op0->getFlags())) ||
54163 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54164 MulOp0 = Op0.getOperand(0);
54165 MulOp1 = Op0.getOperand(1);
54166 IsConj = Opcode == X86ISD::VFCMADDC;
54167 return true;
54168 }
54169 }
54170 return false;
54171 };
54172
54173 if (GetCFmulFrom(LHS))
54174 FAddOp1 = RHS;
54175 else if (GetCFmulFrom(RHS))
54176 FAddOp1 = LHS;
54177 else
54178 return SDValue();
54179
54180 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54181 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54182 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54183 // FIXME: How do we handle when fast math flags of FADD are different from
54184 // CFMUL's?
54185 SDValue CFmul =
54186 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54187 return DAG.getBitcast(VT, CFmul);
54188}
54189
54190/// Do target-specific dag combines on floating-point adds/subs.
54191 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54192                                const X86Subtarget &Subtarget) {
54193 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54194 return HOp;
54195
54196 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54197 return COp;
54198
54199 return SDValue();
54200}
54201
54203 const X86Subtarget &Subtarget) {
54204 EVT VT = N->getValueType(0);
54205 SDValue Src = N->getOperand(0);
54206 EVT SrcVT = Src.getValueType();
54207 SDLoc DL(N);
54208
54209 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54210
54211 // Let legalize expand this if it isn't a legal type yet.
54212 if (!TLI.isTypeLegal(VT))
54213 return SDValue();
54214
54215 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54216 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54217 return SDValue();
54218
54219 if (SrcVT == MVT::v2f16) {
54220 SrcVT = MVT::v4f16;
54221 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54222 DAG.getUNDEF(MVT::v2f16));
54223 }
54224
54225 if (SrcVT == MVT::v4f16) {
54226 SrcVT = MVT::v8f16;
54227 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54228 DAG.getUNDEF(MVT::v4f16));
54229 } else if (SrcVT == MVT::v2f32) {
54230 SrcVT = MVT::v4f32;
54231 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54232 DAG.getUNDEF(MVT::v2f32));
54233 } else {
54234 return SDValue();
54235 }
54236
54237 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54238}
54239
54240// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54241// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54242// are able to avoid generating code with MOVABS and large constants in certain
54243// cases.
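// For example (OR case, C2 = 48 > 32):
//   (i32 (trunc (srl (or X:i64, 0x1234567800000000), 48)))
//     -> (i32 (or (trunc (srl X, 48)), 0x1234))
// so the 64-bit immediate no longer needs a MOVABS.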
54244 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54245                                           const SDLoc &DL) {
54246 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54247 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54248 if (!ValidSrlConst)
54249 return SDValue();
54250 unsigned SrlConstVal = *ValidSrlConst;
54251
54252 SDValue Op = N.getOperand(0);
54253 unsigned Opcode = Op.getOpcode();
54254 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54255 "Illegal truncation types");
54256
54257 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54258 !isa<ConstantSDNode>(Op.getOperand(1)))
54259 return SDValue();
54260 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54261
54262 if (SrlConstVal <= 32 ||
54263 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54264 return SDValue();
54265
54266 SDValue OpLhsSrl =
54267 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54268 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54269
54270 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54271 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54272 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54273
54274 if (Opcode == ISD::ADD) {
54275 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54276 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54277 }
54278 return NewOpNode;
54279}
54280
54281/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54282/// the codegen.
54283/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54284/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54285/// anything that is guaranteed to be transformed by DAGCombiner.
54286 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54287                                           const X86Subtarget &Subtarget,
54288 const SDLoc &DL) {
54289 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54290 SDValue Src = N->getOperand(0);
54291 unsigned SrcOpcode = Src.getOpcode();
54292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54293
54294 EVT VT = N->getValueType(0);
54295 EVT SrcVT = Src.getValueType();
54296
54297 auto IsFreeTruncation = [VT](SDValue Op) {
54298 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54299
54300 // See if this has been extended from a smaller/equal size to
54301 // the truncation size, allowing a truncation to combine with the extend.
54302 unsigned Opcode = Op.getOpcode();
54303 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54304 Opcode == ISD::ZERO_EXTEND) &&
54305 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54306 return true;
54307
54308 // See if this is a single use constant which can be constant folded.
54309     // NOTE: We don't peek through bitcasts here because there is currently
54310     // no support for constant folding truncate+bitcast+vector_of_constants. So
54311     // we'll just end up with a truncate on both operands which will
54312 // get turned back into (truncate (binop)) causing an infinite loop.
54313 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54314 };
54315
54316 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54317 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54318 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54319 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54320 };
54321
54322 // Don't combine if the operation has other uses.
54323 if (!Src.hasOneUse())
54324 return SDValue();
54325
54326 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54327 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54328
54329 if (!VT.isVector())
54330 return SDValue();
54331
54332   // In most cases it's only worth pre-truncating if we're only facing the cost
54333 // of one truncation.
54334 // i.e. if one of the inputs will constant fold or the input is repeated.
54335 switch (SrcOpcode) {
54336 case ISD::MUL:
54337     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54338 // better to truncate if we have the chance.
54339 if (SrcVT.getScalarType() == MVT::i64 &&
54340 TLI.isOperationLegal(SrcOpcode, VT) &&
54341 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54342 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54343 [[fallthrough]];
54344 case ISD::AND:
54345 case ISD::XOR:
54346 case ISD::OR:
54347 case ISD::ADD:
54348 case ISD::SUB: {
54349 SDValue Op0 = Src.getOperand(0);
54350 SDValue Op1 = Src.getOperand(1);
54351 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54352 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54353 return TruncateArithmetic(Op0, Op1);
54354 break;
54355 }
54356 }
54357
54358 return SDValue();
54359}
54360
54361// Try to form a MULHU or MULHS node by looking for
54362// (trunc (srl (mul ext, ext), >= 16))
54363// TODO: This is X86 specific because we want to be able to handle wide types
54364// before type legalization. But we can only do it if the vector will be
54365// legalized via widening/splitting. Type legalization can't handle promotion
54366// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54367// combiner.
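// e.g. with A,B : v8i16:
//   (v8i16 (trunc (srl (mul (zext A to v8i32), (zext B to v8i32)), 16)))
//     -> (v8i16 (mulhu A, B))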
54368static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54369 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54370 using namespace llvm::SDPatternMatch;
54371
54372 if (!Subtarget.hasSSE2())
54373 return SDValue();
54374
54375 // Only handle vXi16 types that are at least 128-bits unless they will be
54376 // widened.
54377 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54378 return SDValue();
54379
54380 // Input type should be at least vXi32.
54381 EVT InVT = Src.getValueType();
54382 if (InVT.getVectorElementType().getSizeInBits() < 32)
54383 return SDValue();
54384
54385 // First instruction should be a right shift by 16 of a multiply.
54386 SDValue LHS, RHS;
54387 APInt ShiftAmt;
54388 if (!sd_match(Src,
54389 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54390 return SDValue();
54391
54392 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54393 return SDValue();
54394
54395 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54396
54397 // Count leading sign/zero bits on both inputs - if there are enough then
54398 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54399 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54400 // truncations may actually be free by peeking through to the ext source.
54401 auto IsSext = [&DAG](SDValue V) {
54402 return DAG.ComputeMaxSignificantBits(V) <= 16;
54403 };
54404 auto IsZext = [&DAG](SDValue V) {
54405 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54406 };
54407
54408 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54409 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54410 if (!IsSigned && !IsUnsigned)
54411 return SDValue();
54412
54413 // Check if both inputs are extensions, which will be removed by truncation.
54414 auto isOpTruncateFree = [](SDValue Op) {
54415 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54416 Op.getOpcode() == ISD::ZERO_EXTEND)
54417 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54418 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54419 };
54420 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54421
54422 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54423 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54424 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54425 // will have to split anyway.
54426 unsigned InSizeInBits = InVT.getSizeInBits();
54427 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54428 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54429 (InSizeInBits % 16) == 0) {
54430 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54431 InVT.getSizeInBits() / 16);
54432 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54433 DAG.getBitcast(BCVT, RHS));
54434 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54435 return DAG.getNode(ISD::SRL, DL, VT, Res,
54436 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54437 }
54438
54439 // Truncate back to source type.
54440 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54441 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54442
54443 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54444 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54445 return DAG.getNode(ISD::SRL, DL, VT, Res,
54446 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54447}
54448
54449// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54450// from one vector with signed bytes from another vector, adds together
54451// adjacent pairs of 16-bit products, and saturates the result before
54452// truncating to 16-bits.
54453//
54454// Which looks something like this:
54455// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54456// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54457 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54458                                const X86Subtarget &Subtarget,
54459 const SDLoc &DL) {
54460 if (!VT.isVector() || !Subtarget.hasSSSE3())
54461 return SDValue();
54462
54463 unsigned NumElems = VT.getVectorNumElements();
54464 EVT ScalarVT = VT.getVectorElementType();
54465 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54466 return SDValue();
54467
54468 SDValue SSatVal = detectSSatPattern(In, VT);
54469 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54470 return SDValue();
54471
54472 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54473 // of multiplies from even/odd elements.
54474 SDValue N0 = SSatVal.getOperand(0);
54475 SDValue N1 = SSatVal.getOperand(1);
54476
54477 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54478 return SDValue();
54479
54480 SDValue N00 = N0.getOperand(0);
54481 SDValue N01 = N0.getOperand(1);
54482 SDValue N10 = N1.getOperand(0);
54483 SDValue N11 = N1.getOperand(1);
54484
54485 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54486 // Canonicalize zero_extend to LHS.
54487 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54488 std::swap(N00, N01);
54489 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54490 std::swap(N10, N11);
54491
54492 // Ensure we have a zero_extend and a sign_extend.
54493 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54494 N01.getOpcode() != ISD::SIGN_EXTEND ||
54495 N10.getOpcode() != ISD::ZERO_EXTEND ||
54496 N11.getOpcode() != ISD::SIGN_EXTEND)
54497 return SDValue();
54498
54499 // Peek through the extends.
54500 N00 = N00.getOperand(0);
54501 N01 = N01.getOperand(0);
54502 N10 = N10.getOperand(0);
54503 N11 = N11.getOperand(0);
54504
54505 // Ensure the extend is from vXi8.
54506 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54507 N01.getValueType().getVectorElementType() != MVT::i8 ||
54508 N10.getValueType().getVectorElementType() != MVT::i8 ||
54509 N11.getValueType().getVectorElementType() != MVT::i8)
54510 return SDValue();
54511
54512 // All inputs should be build_vectors.
54513 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54514 N01.getOpcode() != ISD::BUILD_VECTOR ||
54515 N10.getOpcode() != ISD::BUILD_VECTOR ||
54516       N11.getOpcode() != ISD::BUILD_VECTOR)
54517     return SDValue();
54518
54519 // N00/N10 are zero extended. N01/N11 are sign extended.
54520
54521 // For each element, we need to ensure we have an odd element from one vector
54522 // multiplied by the odd element of another vector and the even element from
54523 // one of the same vectors being multiplied by the even element from the
54524 // other vector. So we need to make sure for each element i, this operator
54525 // is being performed:
54526 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54527 SDValue ZExtIn, SExtIn;
54528 for (unsigned i = 0; i != NumElems; ++i) {
54529 SDValue N00Elt = N00.getOperand(i);
54530 SDValue N01Elt = N01.getOperand(i);
54531 SDValue N10Elt = N10.getOperand(i);
54532 SDValue N11Elt = N11.getOperand(i);
54533 // TODO: Be more tolerant to undefs.
54534 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54535 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54536 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54537         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54538       return SDValue();
54539 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54540 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54541 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54542 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54543 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54544 return SDValue();
54545 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54546 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54547 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54548 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54549 // Add is commutative so indices can be reordered.
54550 if (IdxN00 > IdxN10) {
54551 std::swap(IdxN00, IdxN10);
54552 std::swap(IdxN01, IdxN11);
54553 }
54554     // N0 indices must be the even element. N1 indices must be the next odd element.
54555 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54556 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54557 return SDValue();
54558 SDValue N00In = N00Elt.getOperand(0);
54559 SDValue N01In = N01Elt.getOperand(0);
54560 SDValue N10In = N10Elt.getOperand(0);
54561 SDValue N11In = N11Elt.getOperand(0);
54562     // The first time we find an input, capture it.
54563 if (!ZExtIn) {
54564 ZExtIn = N00In;
54565 SExtIn = N01In;
54566 }
54567 if (ZExtIn != N00In || SExtIn != N01In ||
54568 ZExtIn != N10In || SExtIn != N11In)
54569 return SDValue();
54570 }
54571
54572 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54573 EVT ExtVT = Ext.getValueType();
54574 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54575 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54576 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54577 DAG.getVectorIdxConstant(0, DL));
54578 }
54579 };
54580 ExtractVec(ZExtIn);
54581 ExtractVec(SExtIn);
54582
54583 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54584                          ArrayRef<SDValue> Ops) {
54585     // Shrink by adding truncate nodes and let DAGCombine fold with the
54586 // sources.
54587 EVT InVT = Ops[0].getValueType();
54588 assert(InVT.getScalarType() == MVT::i8 &&
54589 "Unexpected scalar element type");
54590 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54591 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54592 InVT.getVectorNumElements() / 2);
54593 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54594 };
54595 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54596 PMADDBuilder);
54597}
54598
54599 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54600                                const X86Subtarget &Subtarget) {
54601 EVT VT = N->getValueType(0);
54602 SDValue Src = N->getOperand(0);
54603 SDLoc DL(N);
54604
54605 // Attempt to pre-truncate inputs to arithmetic ops instead.
54606 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54607 return V;
54608
54609 // Try to detect PMADD
54610 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54611 return PMAdd;
54612
54613 // Try to combine truncation with signed/unsigned saturation.
54614 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54615 return Val;
54616
54617 // Try to combine PMULHUW/PMULHW for vXi16.
54618 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54619 return V;
54620
54621 // The bitcast source is a direct mmx result.
54622   // Detect bitcasts between i32 and x86mmx.
54623 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54624 SDValue BCSrc = Src.getOperand(0);
54625 if (BCSrc.getValueType() == MVT::x86mmx)
54626 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54627 }
54628
54629 return SDValue();
54630}
54631
54634 EVT VT = N->getValueType(0);
54635 SDValue In = N->getOperand(0);
54636 SDLoc DL(N);
54637
54638 if (SDValue SSatVal = detectSSatPattern(In, VT))
54639 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54640 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54641 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54642
54643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54644 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54645 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54646 return SDValue(N, 0);
54647
54648 return SDValue();
54649}
54650
54651/// Returns the negated value if the node \p N flips sign of FP value.
54652///
54653/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54654/// or FSUB(0, x)
54655/// AVX512F does not have FXOR, so FNEG is lowered as
54656/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54657 /// In this case we go through all bitcasts.
54658/// This also recognizes splat of a negated value and returns the splat of that
54659/// value.
54660static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54661 if (N->getOpcode() == ISD::FNEG)
54662 return N->getOperand(0);
54663
54664 // Don't recurse exponentially.
54665   if (Depth > SelectionDAG::MaxRecursionDepth)
54666     return SDValue();
54667
54668 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54669
54670   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54671   EVT VT = Op->getValueType(0);
54672
54673 // Make sure the element size doesn't change.
54674 if (VT.getScalarSizeInBits() != ScalarSize)
54675 return SDValue();
54676
54677 unsigned Opc = Op.getOpcode();
54678 switch (Opc) {
54679 case ISD::VECTOR_SHUFFLE: {
54680 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54681 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54682 if (!Op.getOperand(1).isUndef())
54683 return SDValue();
54684 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54685 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54686 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54687 cast<ShuffleVectorSDNode>(Op)->getMask());
54688 break;
54689 }
54690   case ISD::INSERT_VECTOR_ELT: {
54691     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54692 // -V, INDEX).
54693 SDValue InsVector = Op.getOperand(0);
54694 SDValue InsVal = Op.getOperand(1);
54695 if (!InsVector.isUndef())
54696 return SDValue();
54697 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54698 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54699 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54700 NegInsVal, Op.getOperand(2));
54701 break;
54702 }
54703 case ISD::FSUB:
54704 case ISD::XOR:
54705 case X86ISD::FXOR: {
54706 SDValue Op1 = Op.getOperand(1);
54707 SDValue Op0 = Op.getOperand(0);
54708
54709 // For XOR and FXOR, we want to check if constant
54710 // bits of Op1 are sign bit masks. For FSUB, we
54711 // have to check if constant bits of Op0 are sign
54712 // bit masks and hence we swap the operands.
54713 if (Opc == ISD::FSUB)
54714 std::swap(Op0, Op1);
54715
54716 APInt UndefElts;
54717 SmallVector<APInt, 16> EltBits;
54718 // Extract constant bits and see if they are all
54719 // sign bit masks. Ignore the undef elements.
54720 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54721 /* AllowWholeUndefs */ true,
54722 /* AllowPartialUndefs */ false)) {
54723 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54724 if (!UndefElts[I] && !EltBits[I].isSignMask())
54725 return SDValue();
54726
54727 // Only allow bitcast from correctly-sized constant.
54728 Op0 = peekThroughBitcasts(Op0);
54729 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54730 return Op0;
54731 }
54732 break;
54733 } // case
54734 } // switch
54735
54736 return SDValue();
54737}
54738
54739static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54740 bool NegRes) {
54741 if (NegMul) {
54742 switch (Opcode) {
54743 // clang-format off
54744 default: llvm_unreachable("Unexpected opcode");
54745 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54746 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54747 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54748 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54749 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54750 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54751 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54752 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54753 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54754 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54755 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54756 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54757 // clang-format on
54758 }
54759 }
54760
54761 if (NegAcc) {
54762 switch (Opcode) {
54763 // clang-format off
54764 default: llvm_unreachable("Unexpected opcode");
54765 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54766 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54767 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54768 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54769 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54770 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54771 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54772 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54773 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54774 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54775 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54776 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54777 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54778 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54779 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54780 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54781 // clang-format on
54782 }
54783 }
54784
54785 if (NegRes) {
54786 switch (Opcode) {
54787     // For accuracy reasons, we never combine fneg and fma under strict FP.
54788 // clang-format off
54789 default: llvm_unreachable("Unexpected opcode");
54790 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54791 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54792 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54793 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54794 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54795 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54796 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54797 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54798 // clang-format on
54799 }
54800 }
54801
54802 return Opcode;
54803}
54804
54805/// Do target-specific dag combines on floating point negations.
54806 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54807                            TargetLowering::DAGCombinerInfo &DCI,
54808                            const X86Subtarget &Subtarget) {
54809 EVT OrigVT = N->getValueType(0);
54810 SDValue Arg = isFNEG(DAG, N);
54811 if (!Arg)
54812 return SDValue();
54813
54814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54815 EVT VT = Arg.getValueType();
54816 EVT SVT = VT.getScalarType();
54817 SDLoc DL(N);
54818
54819 // Let legalize expand this if it isn't a legal type yet.
54820 if (!TLI.isTypeLegal(VT))
54821 return SDValue();
54822
54823 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54824 // use of a constant by performing (-0 - A*B) instead.
54825 // FIXME: Check rounding control flags as well once it becomes available.
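  // i.e. (fneg (fmul A, B)) -> (FNMSUB A, B, +0.0), which computes -(A*B) - 0.0.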
54826 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54827 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54828 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54829 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54830 Arg.getOperand(1), Zero);
54831 return DAG.getBitcast(OrigVT, NewNode);
54832 }
54833
54834   bool CodeSize = DAG.shouldOptForSize();
54835   bool LegalOperations = !DCI.isBeforeLegalizeOps();
54836 if (SDValue NegArg =
54837 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54838 return DAG.getBitcast(OrigVT, NegArg);
54839
54840 return SDValue();
54841}
54842
54843 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54844                                              bool LegalOperations,
54845                                              bool ForCodeSize,
54846                                              NegatibleCost &Cost,
54847                                              unsigned Depth) const {
54848 // fneg patterns are removable even if they have multiple uses.
54849 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54850     Cost = NegatibleCost::Cheaper;
54851     return DAG.getBitcast(Op.getValueType(), Arg);
54852 }
54853
54854 EVT VT = Op.getValueType();
54855 EVT SVT = VT.getScalarType();
54856 unsigned Opc = Op.getOpcode();
54857 SDNodeFlags Flags = Op.getNode()->getFlags();
54858 switch (Opc) {
54859 case ISD::FMA:
54860 case X86ISD::FMSUB:
54861 case X86ISD::FNMADD:
54862 case X86ISD::FNMSUB:
54863 case X86ISD::FMADD_RND:
54864 case X86ISD::FMSUB_RND:
54865 case X86ISD::FNMADD_RND:
54866 case X86ISD::FNMSUB_RND: {
54867 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54868 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54869         !isOperationLegal(ISD::FMA, VT))
54870       break;
54871
54872 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54873 // if it may have signed zeros.
54874 if (!Flags.hasNoSignedZeros())
54875 break;
54876
54877 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54878 // keep temporary nodes alive.
54879 std::list<HandleSDNode> Handles;
54880
54881 // This is always negatible for free but we might be able to remove some
54882 // extra operand negations as well.
54883 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54884 for (int i = 0; i != 3; ++i) {
54885 NewOps[i] = getCheaperNegatedExpression(
54886 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54887 if (!!NewOps[i])
54888 Handles.emplace_back(NewOps[i]);
54889 }
54890
54891 bool NegA = !!NewOps[0];
54892 bool NegB = !!NewOps[1];
54893 bool NegC = !!NewOps[2];
54894 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54895
54896     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54897                                   : NegatibleCost::Neutral;
54898
54899 // Fill in the non-negated ops with the original values.
54900 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54901 if (!NewOps[i])
54902 NewOps[i] = Op.getOperand(i);
54903 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54904 }
54905 case X86ISD::FRCP:
54906 if (SDValue NegOp0 =
54907 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54908 ForCodeSize, Cost, Depth + 1))
54909 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54910 break;
54911 }
54912
54913 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54914 ForCodeSize, Cost, Depth);
54915}
54916
54918 const X86Subtarget &Subtarget) {
54919 MVT VT = N->getSimpleValueType(0);
54920 // If we have integer vector types available, use the integer opcodes.
54921 if (!VT.isVector() || !Subtarget.hasSSE2())
54922 return SDValue();
54923
54924 SDLoc dl(N);
54926 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54927 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54928 unsigned IntOpcode;
54929 switch (N->getOpcode()) {
54930 // clang-format off
54931 default: llvm_unreachable("Unexpected FP logic op");
54932 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54933 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54934 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54935 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54936 // clang-format on
54937 }
54938 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54939 return DAG.getBitcast(VT, IntOp);
54940}
54941
54942/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54943 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54944   if (N->getOpcode() != ISD::XOR)
54945 return SDValue();
54946
54947 SDValue LHS = N->getOperand(0);
54948 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54949 return SDValue();
54950
54951   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54952       X86::CondCode(LHS->getConstantOperandVal(0)));
54953 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54954}
54955
54956 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54957                                  const X86Subtarget &Subtarget) {
54958 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54959          "Invalid opcode for combining with CTLZ");
54960 if (Subtarget.hasFastLZCNT())
54961 return SDValue();
54962
54963 EVT VT = N->getValueType(0);
54964 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54965 (VT != MVT::i64 || !Subtarget.is64Bit()))
54966 return SDValue();
54967
54968 SDValue N0 = N->getOperand(0);
54969 SDValue N1 = N->getOperand(1);
54970
54971 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54972       N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54973     return SDValue();
54974
54975 SDValue OpCTLZ;
54976 SDValue OpSizeTM1;
54977
54978 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54979 OpCTLZ = N1;
54980 OpSizeTM1 = N0;
54981 } else if (N->getOpcode() == ISD::SUB) {
54982 return SDValue();
54983 } else {
54984 OpCTLZ = N0;
54985 OpSizeTM1 = N1;
54986 }
54987
54988 if (!OpCTLZ.hasOneUse())
54989 return SDValue();
54990 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54991 if (!C)
54992 return SDValue();
54993
54994 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54995 return SDValue();
54996 EVT OpVT = VT;
54997 SDValue Op = OpCTLZ.getOperand(0);
54998 if (VT == MVT::i8) {
54999 // Zero extend to i32 since there is not an i8 bsr.
55000 OpVT = MVT::i32;
55001 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55002 }
55003
55004 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55005 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55006 if (VT == MVT::i8)
55007 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55008
55009 return Op;
55010}
55011
55014 const X86Subtarget &Subtarget) {
55015 SDValue N0 = N->getOperand(0);
55016 SDValue N1 = N->getOperand(1);
55017 EVT VT = N->getValueType(0);
55018 SDLoc DL(N);
55019
55020 // If this is SSE1 only convert to FXOR to avoid scalarization.
55021 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55022 return DAG.getBitcast(MVT::v4i32,
55023 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55024 DAG.getBitcast(MVT::v4f32, N0),
55025 DAG.getBitcast(MVT::v4f32, N1)));
55026 }
55027
55028 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55029 return Cmp;
55030
55031 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55032 return R;
55033
55034 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55035 return R;
55036
55037 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55038 return R;
55039
55040 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55041 DAG, DCI, Subtarget))
55042 return FPLogic;
55043
55044 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55045 return R;
55046
55047 if (DCI.isBeforeLegalizeOps())
55048 return SDValue();
55049
55050 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55051 return SetCC;
55052
55053 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55054 return R;
55055
55056 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55057 return RV;
55058
55059 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55060 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55061 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55062 N0.getOperand(0).getValueType().isVector() &&
55063 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55064 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55065 return DAG.getBitcast(
55066 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55067 }
55068
55069 // Handle AVX512 mask widening.
55070 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55071 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55072 VT.getVectorElementType() == MVT::i1 &&
55073       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55074       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55075 return DAG.getNode(
55077 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55078 N0.getOperand(2));
55079 }
55080
55081 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55082 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55083 // TODO: Under what circumstances could this be performed in DAGCombine?
55084 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55085 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55086 SDValue TruncExtSrc = N0.getOperand(0);
55087 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55088 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55089 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55090 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55091 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55092 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55093 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55094 }
55095 }
55096
55097 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55098 return R;
55099
55100 return combineFneg(N, DAG, DCI, Subtarget);
55101}
55102
55103 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55104                                  TargetLowering::DAGCombinerInfo &DCI,
55105                                  const X86Subtarget &Subtarget) {
55106 SDValue N0 = N->getOperand(0);
55107 EVT VT = N->getValueType(0);
55108
55109 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55110 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55111 SDValue Src = N0.getOperand(0);
55112 EVT SrcVT = Src.getValueType();
55113 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55114 (DCI.isBeforeLegalize() ||
55115 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55116 Subtarget.hasSSSE3()) {
55117 unsigned NumElts = SrcVT.getVectorNumElements();
55118 SmallVector<int, 32> ReverseMask(NumElts);
55119 for (unsigned I = 0; I != NumElts; ++I)
55120 ReverseMask[I] = (NumElts - 1) - I;
55121 SDValue Rev =
55122 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55123 return DAG.getBitcast(VT, Rev);
55124 }
55125 }
55126
55127 return SDValue();
55128}
55129
55130// Various combines to try to convert to avgceilu.
55131 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55132 TargetLowering::DAGCombinerInfo &DCI,
55133 const X86Subtarget &Subtarget) {
55134 unsigned Opcode = N->getOpcode();
55135 SDValue N0 = N->getOperand(0);
55136 SDValue N1 = N->getOperand(1);
55137 EVT VT = N->getValueType(0);
55138 EVT SVT = VT.getScalarType();
55139 SDLoc DL(N);
55140
55141 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55142 // Only useful on vXi8 which doesn't have good SRA handling.
55143 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55144 APInt SignBit = APInt::getSignMask(8);
55145 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55146 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55147 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55148 return DAG.getNode(ISD::XOR, DL, VT,
55149 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55150 }
55151
55152 return SDValue();
55153}
55154
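// Do target-specific combines on BEXTR-style bit-extract nodes by simplifying
// the bits demanded from their operands; constant folding is still TODO.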
55155 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55156 TargetLowering::DAGCombinerInfo &DCI,
55157 const X86Subtarget &Subtarget) {
55158 EVT VT = N->getValueType(0);
55159 unsigned NumBits = VT.getSizeInBits();
55160
55161 // TODO - Constant Folding.
55162
55163 // Simplify the inputs.
55164 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55165 APInt DemandedMask(APInt::getAllOnes(NumBits));
55166 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55167 return SDValue(N, 0);
55168
55169 return SDValue();
55170}
55171
55172 static bool isNullFPScalarOrVectorConst(SDValue V) {
55173 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55174}
55175
55176/// If a value is a scalar FP zero or a vector FP zero (potentially including
55177/// undefined elements), return a zero constant that may be used to fold away
55178/// that value. In the case of a vector, the returned constant will not contain
55179/// undefined elements even if the input parameter does. This makes it suitable
55180/// to be used as a replacement operand with operations (eg, bitwise-and) where
55181/// an undef should not propagate.
55182 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55183 const X86Subtarget &Subtarget) {
55184 if (!isNullFPScalarOrVectorConst(V))
55185 return SDValue();
55186
55187 if (V.getValueType().isVector())
55188 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55189
55190 return V;
55191}
55192
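/// Fold FAND(FXOR(X,-1),Y) or FAND(X,FXOR(Y,-1)) into FANDN for scalar f32/f64
/// and SSE1-only v4f32 FP logic ops.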
55193 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55194 const X86Subtarget &Subtarget) {
55195 SDValue N0 = N->getOperand(0);
55196 SDValue N1 = N->getOperand(1);
55197 EVT VT = N->getValueType(0);
55198 SDLoc DL(N);
55199
55200 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55201 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55202 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55203 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55204 return SDValue();
55205
55206 auto isAllOnesConstantFP = [](SDValue V) {
55207 if (V.getSimpleValueType().isVector())
55208 return ISD::isBuildVectorAllOnes(V.getNode());
55209 auto *C = dyn_cast<ConstantFPSDNode>(V);
55210 return C && C->getConstantFPValue()->isAllOnesValue();
55211 };
55212
55213 // fand (fxor X, -1), Y --> fandn X, Y
55214 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55215 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55216
55217 // fand X, (fxor Y, -1) --> fandn Y, X
55218 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55219 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55220
55221 return SDValue();
55222}
55223
55224/// Do target-specific dag combines on X86ISD::FAND nodes.
55225 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55226 const X86Subtarget &Subtarget) {
55227 // FAND(0.0, x) -> 0.0
55228 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55229 return V;
55230
55231 // FAND(x, 0.0) -> 0.0
55232 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55233 return V;
55234
55235 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55236 return V;
55237
55238 return lowerX86FPLogicOp(N, DAG, Subtarget);
55239}
55240
55241/// Do target-specific dag combines on X86ISD::FANDN nodes.
55242 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55243 const X86Subtarget &Subtarget) {
55244 // FANDN(0.0, x) -> x
55245 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55246 return N->getOperand(1);
55247
55248 // FANDN(x, 0.0) -> 0.0
55249 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55250 return V;
55251
55252 return lowerX86FPLogicOp(N, DAG, Subtarget);
55253}
55254
55255/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55256 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55257 TargetLowering::DAGCombinerInfo &DCI,
55258 const X86Subtarget &Subtarget) {
55259 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55260
55261 // F[X]OR(0.0, x) -> x
55262 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55263 return N->getOperand(1);
55264
55265 // F[X]OR(x, 0.0) -> x
55266 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55267 return N->getOperand(0);
55268
55269 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55270 return NewVal;
55271
55272 return lowerX86FPLogicOp(N, DAG, Subtarget);
55273}
55274
55275/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55276 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55277 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55278
55279 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55280 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55281 !DAG.getTarget().Options.NoSignedZerosFPMath)
55282 return SDValue();
55283
55284 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55285 // into FMINC and FMAXC, which are Commutative operations.
55286 unsigned NewOp = 0;
55287 switch (N->getOpcode()) {
55288 default: llvm_unreachable("unknown opcode");
55289 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55290 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55291 }
55292
55293 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55294 N->getOperand(0), N->getOperand(1));
55295}
55296
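/// Lower ISD::FMAXNUM/FMINNUM to X86ISD::FMAX/FMIN, either directly when NaNs
/// can be ignored or by explicitly selecting away a NaN input.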
55297 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55298 const X86Subtarget &Subtarget) {
55299 EVT VT = N->getValueType(0);
55300 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55301 return SDValue();
55302
55303 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55304
55305 auto IsMinMaxLegal = [&](EVT VT) {
55306 if (!TLI.isTypeLegal(VT))
55307 return false;
55308 return VT.getScalarType() != MVT::f16 ||
55309 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55310 };
55311
55312 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55313 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55314 (Subtarget.hasFP16() && VT == MVT::f16) ||
55315 (VT.isVector() && IsMinMaxLegal(VT))))
55316 return SDValue();
55317
55318 SDValue Op0 = N->getOperand(0);
55319 SDValue Op1 = N->getOperand(1);
55320 SDLoc DL(N);
55321 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55322
55323 // If we don't have to respect NaN inputs, this is a direct translation to x86
55324 // min/max instructions.
55325 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55326 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55327
55328 // If one of the operands is known non-NaN use the native min/max instructions
55329 // with the non-NaN input as second operand.
55330 if (DAG.isKnownNeverNaN(Op1))
55331 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55332 if (DAG.isKnownNeverNaN(Op0))
55333 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55334
55335 // If we have to respect NaN inputs, this takes at least 3 instructions.
55336 // Favor a library call when operating on a scalar and minimizing code size.
55337 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55338 return SDValue();
55339
55340 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55341 VT);
55342
55343 // There are 4 possibilities involving NaN inputs, and these are the required
55344 // outputs:
55345 // Op1
55346 // Num NaN
55347 // ----------------
55348 // Num | Max | Op0 |
55349 // Op0 ----------------
55350 // NaN | Op1 | NaN |
55351 // ----------------
55352 //
55353 // The SSE FP max/min instructions were not designed for this case, but rather
55354 // to implement:
55355 // Min = Op1 < Op0 ? Op1 : Op0
55356 // Max = Op1 > Op0 ? Op1 : Op0
55357 //
55358 // So they always return Op0 if either input is a NaN. However, we can still
55359 // use those instructions for fmaxnum by selecting away a NaN input.
55360
55361 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55362 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55363 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55364
55365 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55366 // are NaN, the NaN value of Op1 is the result.
55367 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55368}
55369
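// Combine X86 vector int-to-fp conversion nodes: simplify the demanded vector
// elements and narrow a full vector load to a vzload when only the low
// elements are converted.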
55370 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55371 TargetLowering::DAGCombinerInfo &DCI) {
55372 EVT VT = N->getValueType(0);
55373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55374
55375 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55376 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55377 return SDValue(N, 0);
55378
55379 // Convert a full vector load into vzload when not all bits are needed.
55380 SDValue In = N->getOperand(0);
55381 MVT InVT = In.getSimpleValueType();
55382 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55383 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55384 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55385 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55386 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55387 MVT MemVT = MVT::getIntegerVT(NumBits);
55388 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55389 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55390 SDLoc dl(N);
55391 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55392 DAG.getBitcast(InVT, VZLoad));
55393 DCI.CombineTo(N, Convert);
55394 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55395 DCI.recursivelyDeleteUnusedNodes(LN);
55396 return SDValue(N, 0);
55397 }
55398 }
55399
55400 return SDValue();
55401}
55402
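// Same idea for X86 vector fp-to-int conversion nodes (and their strict
// variants): narrow a full vector load to a vzload when not all input elements
// are needed.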
55403 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55404 TargetLowering::DAGCombinerInfo &DCI) {
55405 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55406 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55407 EVT VT = N->getValueType(0);
55408
55409 // Convert a full vector load into vzload when not all bits are needed.
55410 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55411 MVT InVT = In.getSimpleValueType();
55412 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55413 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55414 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55415 LoadSDNode *LN = cast<LoadSDNode>(In);
55416 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55417 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55418 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55419 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55420 SDLoc dl(N);
55421 if (IsStrict) {
55422 SDValue Convert =
55423 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55424 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55425 DCI.CombineTo(N, Convert, Convert.getValue(1));
55426 } else {
55427 SDValue Convert =
55428 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55429 DCI.CombineTo(N, Convert);
55430 }
55431 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55432 DCI.recursivelyDeleteUnusedNodes(LN);
55433 return SDValue(N, 0);
55434 }
55435 }
55436
55437 return SDValue();
55438}
55439
55440/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55441 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55442 TargetLowering::DAGCombinerInfo &DCI,
55443 const X86Subtarget &Subtarget) {
55444 SDValue N0 = N->getOperand(0);
55445 SDValue N1 = N->getOperand(1);
55446 MVT VT = N->getSimpleValueType(0);
55447 int NumElts = VT.getVectorNumElements();
55448 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55450 SDLoc DL(N);
55451
55452 // ANDNP(undef, x) -> 0
55453 // ANDNP(x, undef) -> 0
55454 if (N0.isUndef() || N1.isUndef())
55455 return DAG.getConstant(0, DL, VT);
55456
55457 // ANDNP(0, x) -> x
55458 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55459 return N1;
55460
55461 // ANDNP(x, 0) -> 0
55462 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55463 return DAG.getConstant(0, DL, VT);
55464
55465 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55466 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55467 return DAG.getNOT(DL, N0, VT);
55468
55469 // Turn ANDNP back to AND if input is inverted.
55470 if (SDValue Not = IsNOT(N0, DAG))
55471 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55472
55473 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55474 // to make use of predicated selects.
55475 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55476 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55477 SDValue Src = N0.getOperand(0);
55478 EVT SrcVT = Src.getValueType();
55479 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55480 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55481 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55482 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55483 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55484 getZeroVector(VT, Subtarget, DAG, DL));
55485 }
55486
55487 // Constant Folding
55488 APInt Undefs0, Undefs1;
55489 SmallVector<APInt> EltBits0, EltBits1;
55490 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55491 /*AllowWholeUndefs*/ true,
55492 /*AllowPartialUndefs*/ true)) {
55493 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55494 /*AllowWholeUndefs*/ true,
55495 /*AllowPartialUndefs*/ true)) {
55496 SmallVector<APInt> ResultBits;
55497 for (int I = 0; I != NumElts; ++I)
55498 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55499 return getConstVector(ResultBits, VT, DAG, DL);
55500 }
55501
55502 // Constant fold NOT(N0) to allow us to use AND.
55503 // Ensure this is only performed if we can confirm that the bitcasted source
55504 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
55505 if (N0->hasOneUse()) {
55506 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55507 if (BC0.getOpcode() != ISD::BITCAST) {
55508 for (APInt &Elt : EltBits0)
55509 Elt = ~Elt;
55510 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55511 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55512 }
55513 }
55514 }
55515
55516 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55517 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55518 SDValue Op(N, 0);
55519 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55520 return Res;
55521
55522 // If either operand is a constant mask, then only the elements that aren't
55523 // zero are actually demanded by the other operand.
55524 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55525 APInt UndefElts;
55526 SmallVector<APInt> EltBits;
55527 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55528 APInt DemandedElts = APInt::getAllOnes(NumElts);
55529 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55530 EltBits)) {
55531 DemandedBits.clearAllBits();
55532 DemandedElts.clearAllBits();
55533 for (int I = 0; I != NumElts; ++I) {
55534 if (UndefElts[I]) {
55535 // We can't assume an undef src element gives an undef dst - the
55536 // other src might be zero.
55537 DemandedBits.setAllBits();
55538 DemandedElts.setBit(I);
55539 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55540 (!Invert && !EltBits[I].isZero())) {
55541 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55542 DemandedElts.setBit(I);
55543 }
55544 }
55545 }
55546 return std::make_pair(DemandedBits, DemandedElts);
55547 };
55548 APInt Bits0, Elts0;
55549 APInt Bits1, Elts1;
55550 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55551 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55552
55553 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55554 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55555 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55556 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55557 if (N->getOpcode() != ISD::DELETED_NODE)
55558 DCI.AddToWorklist(N);
55559 return SDValue(N, 0);
55560 }
55561 }
55562
55563 // Folds for better commutativity:
55564 if (N1->hasOneUse()) {
55565 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55566 if (SDValue Not = IsNOT(N1, DAG))
55567 return DAG.getNOT(
55568 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55569
55570 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55571 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55572 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55573 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55574 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55575 EVT ShufVT = BC1.getValueType();
55576 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55577 DAG.getBitcast(ShufVT, N0));
55578 SDValue NewShuf =
55579 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55580 return DAG.getBitcast(VT, NewShuf);
55581 }
55582 }
55583 }
55584
55585 return SDValue();
55586}
55587
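// Do target-specific combines on X86ISD::BT nodes; only the low
// log2(BitWidth) bits of the bit-index operand are demanded.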
55588 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55589 TargetLowering::DAGCombinerInfo &DCI) {
55590 SDValue N1 = N->getOperand(1);
55591
55592 // BT ignores high bits in the bit index operand.
55593 unsigned BitWidth = N1.getValueSizeInBits();
55594 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55595 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55596 if (N->getOpcode() != ISD::DELETED_NODE)
55597 DCI.AddToWorklist(N);
55598 return SDValue(N, 0);
55599 }
55600
55601 return SDValue();
55602}
55603
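// Combine (STRICT_)CVTPH2PS nodes: for a v4f32 result only the low 4 of the 8
// i16 source elements are demanded, and a full vector load can be narrowed to
// a vzload.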
55604 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55605 TargetLowering::DAGCombinerInfo &DCI) {
55606 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55607 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55608
55609 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55610 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55611 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55612 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55613 if (N->getOpcode() != ISD::DELETED_NODE)
55614 DCI.AddToWorklist(N);
55615 return SDValue(N, 0);
55616 }
55617
55618 // Convert a full vector load into vzload when not all bits are needed.
55619 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55620 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55621 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55622 SDLoc dl(N);
55623 if (IsStrict) {
55624 SDValue Convert = DAG.getNode(
55625 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55626 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55627 DCI.CombineTo(N, Convert, Convert.getValue(1));
55628 } else {
55629 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55630 DAG.getBitcast(MVT::v8i16, VZLoad));
55631 DCI.CombineTo(N, Convert);
55632 }
55633
55634 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55635 DCI.recursivelyDeleteUnusedNodes(LN);
55636 return SDValue(N, 0);
55637 }
55638 }
55639 }
55640
55641 return SDValue();
55642}
55643
55644// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55645 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55646 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55647
55648 EVT DstVT = N->getValueType(0);
55649
55650 SDValue N0 = N->getOperand(0);
55651 SDValue N1 = N->getOperand(1);
55652 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55653
55654 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55655 return SDValue();
55656
55657 // Look through single use any_extends / truncs.
55658 SDValue IntermediateBitwidthOp;
55659 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55660 N0.hasOneUse()) {
55661 IntermediateBitwidthOp = N0;
55662 N0 = N0.getOperand(0);
55663 }
55664
55665 // See if we have a single use cmov.
55666 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55667 return SDValue();
55668
55669 SDValue CMovOp0 = N0.getOperand(0);
55670 SDValue CMovOp1 = N0.getOperand(1);
55671
55672 // Make sure both operands are constants.
55673 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55674 !isa<ConstantSDNode>(CMovOp1.getNode()))
55675 return SDValue();
55676
55677 SDLoc DL(N);
55678
55679 // If we looked through an any_extend/trunc above, add one to the constants.
55680 if (IntermediateBitwidthOp) {
55681 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55682 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55683 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55684 }
55685
55686 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55687 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55688
55689 EVT CMovVT = DstVT;
55690 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55691 if (DstVT == MVT::i16) {
55692 CMovVT = MVT::i32;
55693 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55694 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55695 }
55696
55697 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55698 N0.getOperand(2), N0.getOperand(3));
55699
55700 if (CMovVT != DstVT)
55701 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55702
55703 return CMov;
55704}
55705
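// Do target-specific combines on ISD::SIGN_EXTEND_INREG nodes.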
55706 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55707 const X86Subtarget &Subtarget) {
55708 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55709
55710 if (SDValue V = combineSextInRegCmov(N, DAG))
55711 return V;
55712
55713 EVT VT = N->getValueType(0);
55714 SDValue N0 = N->getOperand(0);
55715 SDValue N1 = N->getOperand(1);
55716 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55717 SDLoc dl(N);
55718
55719 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
55720 // SSE and AVX2 since there is no sign-extended shift right
55721 // operation on a vector with 64-bit elements.
55722 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55723 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55724 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55725 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55726 SDValue N00 = N0.getOperand(0);
55727
55728 // EXTLOAD has a better solution on AVX2,
55729 // it may be replaced with X86ISD::VSEXT node.
55730 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55731 if (!ISD::isNormalLoad(N00.getNode()))
55732 return SDValue();
55733
55734 // Attempt to promote any comparison mask ops before moving the
55735 // SIGN_EXTEND_INREG in the way.
55736 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55737 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55738
55739 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55740 SDValue Tmp =
55741 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55742 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55743 }
55744 }
55745 return SDValue();
55746}
55747
55748/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55749/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55750/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55751/// opportunities to combine math ops, use an LEA, or use a complex addressing
55752/// mode. This can eliminate extend, add, and shift instructions.
55753 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55754 const X86Subtarget &Subtarget) {
55755 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55756 Ext->getOpcode() != ISD::ZERO_EXTEND)
55757 return SDValue();
55758
55759 // TODO: This should be valid for other integer types.
55760 EVT VT = Ext->getValueType(0);
55761 if (VT != MVT::i64)
55762 return SDValue();
55763
55764 SDValue Add = Ext->getOperand(0);
55765 if (Add.getOpcode() != ISD::ADD)
55766 return SDValue();
55767
55768 SDValue AddOp0 = Add.getOperand(0);
55769 SDValue AddOp1 = Add.getOperand(1);
55770 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55771 bool NSW = Add->getFlags().hasNoSignedWrap();
55772 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55773 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55774 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55775
55776 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55777 // into the 'zext'
55778 if ((Sext && !NSW) || (!Sext && !NUW))
55779 return SDValue();
55780
55781 // Having a constant operand to the 'add' ensures that we are not increasing
55782 // the instruction count because the constant is extended for free below.
55783 // A constant operand can also become the displacement field of an LEA.
55784 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55785 if (!AddOp1C)
55786 return SDValue();
55787
55788 // Don't make the 'add' bigger if there's no hope of combining it with some
55789 // other 'add' or 'shl' instruction.
55790 // TODO: It may be profitable to generate simpler LEA instructions in place
55791 // of single 'add' instructions, but the cost model for selecting an LEA
55792 // currently has a high threshold.
55793 bool HasLEAPotential = false;
55794 for (auto *User : Ext->users()) {
55795 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55796 HasLEAPotential = true;
55797 break;
55798 }
55799 }
55800 if (!HasLEAPotential)
55801 return SDValue();
55802
55803 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55804 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55805 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55806 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55807
55808 // The wider add is guaranteed to not wrap because both operands are
55809 // sign-extended.
55810 SDNodeFlags Flags;
55811 Flags.setNoSignedWrap(NSW);
55812 Flags.setNoUnsignedWrap(NUW);
55813 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55814}
55815
55816// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55817// operands and the result of CMOV is not used anywhere else - promote CMOV
55818// itself instead of promoting its result. This could be beneficial, because:
55819// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55820// (or more) pseudo-CMOVs only when they go one-after-another and
55821// getting rid of result extension code after CMOV will help that.
55822// 2) Promotion of constant CMOV arguments is free, hence the
55823// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55824// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55825// promotion is also good in terms of code-size.
55826// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55827// promotion).
55828 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55829 SDValue CMovN = Extend->getOperand(0);
55830 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55831 return SDValue();
55832
55833 EVT TargetVT = Extend->getValueType(0);
55834 unsigned ExtendOpcode = Extend->getOpcode();
55835 SDLoc DL(Extend);
55836
55837 EVT VT = CMovN.getValueType();
55838 SDValue CMovOp0 = CMovN.getOperand(0);
55839 SDValue CMovOp1 = CMovN.getOperand(1);
55840
55841 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55842 !isa<ConstantSDNode>(CMovOp1.getNode()))
55843 return SDValue();
55844
55845 // Only extend to i32 or i64.
55846 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55847 return SDValue();
55848
55849 // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
55850 // are free.
55851 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55852 return SDValue();
55853
55854 // If this a zero extend to i64, we should only extend to i32 and use a free
55855 // zero extend to finish.
55856 EVT ExtendVT = TargetVT;
55857 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55858 ExtendVT = MVT::i32;
55859
55860 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55861 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55862
55863 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55864 CMovN.getOperand(2), CMovN.getOperand(3));
55865
55866 // Finish extending if needed.
55867 if (ExtendVT != TargetVT)
55868 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55869
55870 return Res;
55871}
55872
55873// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55874// result type.
55875 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55876 const X86Subtarget &Subtarget) {
55877 SDValue N0 = N->getOperand(0);
55878 EVT VT = N->getValueType(0);
55879 SDLoc dl(N);
55880
55881 // Only do this combine with AVX512 for vector extends.
55882 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55883 return SDValue();
55884
55885 // Only combine legal element types.
55886 EVT SVT = VT.getVectorElementType();
55887 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55888 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55889 return SDValue();
55890
55891 // We don't have a CMPP instruction for vXf16.
55892 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55893 return SDValue();
55894 // We can only do this if the vector size is 256 bits or less.
55895 unsigned Size = VT.getSizeInBits();
55896 if (Size > 256 && Subtarget.useAVX512Regs())
55897 return SDValue();
55898
55899 EVT N00VT = N0.getOperand(0).getValueType();
55900
55901 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55902 // those are the only integer compares we have.
55903 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55904 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55905 return SDValue();
55906
55907 // Only do this combine if the extension will be fully consumed by the setcc.
55908 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55909 if (Size != MatchingVecType.getSizeInBits())
55910 return SDValue();
55911
55912 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55913
55914 if (N->getOpcode() == ISD::ZERO_EXTEND)
55915 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55916
55917 return Res;
55918}
55919
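// Do target-specific combines on ISD::SIGN_EXTEND nodes.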
55920 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55921 TargetLowering::DAGCombinerInfo &DCI,
55922 const X86Subtarget &Subtarget) {
55923 SDValue N0 = N->getOperand(0);
55924 EVT VT = N->getValueType(0);
55925 SDLoc DL(N);
55926
55927 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55928 if (!DCI.isBeforeLegalizeOps() &&
55929 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55930 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55931 N0->getOperand(1));
55932 bool ReplaceOtherUses = !N0.hasOneUse();
55933 DCI.CombineTo(N, Setcc);
55934 // Replace other uses with a truncate of the widened setcc_carry.
55935 if (ReplaceOtherUses) {
55936 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55937 N0.getValueType(), Setcc);
55938 DCI.CombineTo(N0.getNode(), Trunc);
55939 }
55940
55941 return SDValue(N, 0);
55942 }
55943
55944 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55945 return NewCMov;
55946
55947 if (!DCI.isBeforeLegalizeOps())
55948 return SDValue();
55949
55950 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55951 return V;
55952
55953 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55954 DAG, DCI, Subtarget))
55955 return V;
55956
55957 if (VT.isVector()) {
55958 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55959 return R;
55960
55962 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55963 }
55964
55965 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55966 return NewAdd;
55967
55968 return SDValue();
55969}
55970
55971// Inverting a constant vector is profitable if it can be eliminated and the
55972// inverted vector is already present in DAG. Otherwise, it will be loaded
55973// anyway.
55974//
55975// We determine which of the values can be completely eliminated and invert it.
55976// If both are eliminable, select a vector with the first negative element.
55977 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55978 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55979 "ConstantFP build vector expected");
55980 // Check if we can eliminate V. We assume that if a value is only used in
55981 // FMAs, we can eliminate it, since this function is invoked for each FMA
55982 // with this vector.
55983 auto IsNotFMA = [](SDNode *User) {
55984 return User->getOpcode() != ISD::FMA &&
55985 User->getOpcode() != ISD::STRICT_FMA;
55986 };
55987 if (llvm::any_of(V->users(), IsNotFMA))
55988 return SDValue();
55989
55990 SmallVector<SDValue, 8> Ops;
55991 EVT VT = V.getValueType();
55992 EVT EltVT = VT.getVectorElementType();
55993 for (const SDValue &Op : V->op_values()) {
55994 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55995 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55996 } else {
55997 assert(Op.isUndef());
55998 Ops.push_back(DAG.getUNDEF(EltVT));
55999 }
56000 }
56001
56002 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56003 if (!NV)
56004 return SDValue();
56005
56006 // If an inverted version cannot be eliminated, choose it instead of the
56007 // original version.
56008 if (llvm::any_of(NV->users(), IsNotFMA))
56009 return SDValue(NV, 0);
56010
56011 // If the inverted version also can be eliminated, we have to consistently
56012 // prefer one of the values. We prefer a constant with a negative value on
56013 // the first place.
56014 // N.B. We need to skip undefs that may precede a value.
56015 for (const SDValue &Op : V->op_values()) {
56016 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56017 if (Cst->isNegative())
56018 return SDValue();
56019 break;
56020 }
56021 }
56022 return SDValue(NV, 0);
56023}
56024
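// Combine FMA-family nodes: split into FMUL+FADD when FMA is not available
// and reassociation is allowed, and fold free negations of the operands into
// the matching negated FMA opcode.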
56025 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56026 TargetLowering::DAGCombinerInfo &DCI,
56027 const X86Subtarget &Subtarget) {
56028 SDLoc dl(N);
56029 EVT VT = N->getValueType(0);
56030 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56031 bool IsStrict = N->isTargetOpcode()
56032 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56033 : N->isStrictFPOpcode();
56034
56035 // Let legalize expand this if it isn't a legal type yet.
56036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56037 if (!TLI.isTypeLegal(VT))
56038 return SDValue();
56039
56040 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56041 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56042 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56043
56044 // If the operation allows fast-math and the target does not support FMA,
56045 // split this into mul+add to avoid libcall(s).
56046 SDNodeFlags Flags = N->getFlags();
56047 if (!IsStrict && Flags.hasAllowReassociation() &&
56048 TLI.isOperationExpand(ISD::FMA, VT)) {
56049 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56050 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56051 }
56052
56053 EVT ScalarVT = VT.getScalarType();
56054 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56055 !Subtarget.hasAnyFMA()) &&
56056 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56057 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56058 return SDValue();
56059
56060 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56061 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56062 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56063 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56064 CodeSize)) {
56065 V = NegV;
56066 return true;
56067 }
56068 // Look through extract_vector_elts. If it comes from an FNEG, create a
56069 // new extract from the FNEG input.
56070 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56071 isNullConstant(V.getOperand(1))) {
56072 SDValue Vec = V.getOperand(0);
56073 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56074 Vec, DAG, LegalOperations, CodeSize)) {
56075 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56076 NegV, V.getOperand(1));
56077 return true;
56078 }
56079 }
56080 // Lookup if there is an inverted version of constant vector V in DAG.
56081 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56082 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56083 V = NegV;
56084 return true;
56085 }
56086 }
56087 return false;
56088 };
56089
56090 // Do not convert the passthru input of scalar intrinsics.
56091 // FIXME: We could allow negations of the lower element only.
56092 bool NegA = invertIfNegative(A);
56093 // Create a dummy use for A so that in the process of negating B or C
56094 // recursively, it is not deleted.
56095 HandleSDNode NegAHandle(A);
56096 bool NegB = invertIfNegative(B);
56097 // Similar to A, get a handle on B.
56098 HandleSDNode NegBHandle(B);
56099 bool NegC = invertIfNegative(C);
56100
56101 if (!NegA && !NegB && !NegC)
56102 return SDValue();
56103
56104 unsigned NewOpcode =
56105 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56106
56107 // Propagate fast-math-flags to new FMA node.
56108 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56109 if (IsStrict) {
56110 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56111 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56112 {N->getOperand(0), A, B, C});
56113 } else {
56114 if (N->getNumOperands() == 4)
56115 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56116 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56117 }
56118}
56119
56120// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56121// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56122 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56123 TargetLowering::DAGCombinerInfo &DCI) {
56124 SDLoc dl(N);
56125 EVT VT = N->getValueType(0);
56126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56127 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56128 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56129
56130 SDValue N2 = N->getOperand(2);
56131
56132 SDValue NegN2 =
56133 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56134 if (!NegN2)
56135 return SDValue();
56136 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56137
56138 if (N->getNumOperands() == 4)
56139 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56140 NegN2, N->getOperand(3));
56141 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56142 NegN2);
56143}
56144
56145// Try to widen the build vector and bitcast it to the type of zext.
56146// This is a special case for the 128-bit vector types. Intention is to remove
56147 // the zext and replace it with a bitcast to the wider type. While lowering,
56148 // the bitcast is removed and the extra permutation due to the zext is avoided.
56149 // For example:
56150 // zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
56151 // bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
56152 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56153
56154 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56155 return SDValue();
56156
56157 EVT ExtendVT = Extend->getValueType(0);
56158
56159 SDValue BV = Extend->getOperand(0);
56160 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56161 return SDValue();
56162
56163 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56164 // If the build vector has undef elements, we cannot widen it.
56165 // The widening would create a vector with more undef elements, which
56166 // is not valid.
56167 return SDValue();
56168 }
56169
56170 if (!all_of(BV->op_values(),
56171 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56172 // If the build vector has any element other than an ISD::LOAD, we cannot
56173 // widen it.
56174 return SDValue();
56175 }
56176
56177 SDLoc dl(BV);
56178 EVT VT = BV.getValueType();
56179 EVT EltVT = BV.getOperand(0).getValueType();
56180 unsigned NumElts = VT.getVectorNumElements();
56181
56182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56183
56184 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56185 TargetLowering::TypeWidenVector)
56186 return SDValue();
56187
56188 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56189 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56190
56191 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56192 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56193 // Fill the new elements with Zero.
56194 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56195 // Compute the step to place the elements in the right place and control the
56196 // iteration.
56197 unsigned step = WidenNumElts / NumElts;
56198 if (WidenVT.is128BitVector()) {
56199 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56200 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56201 i--, j -= step) {
56202 SDValue temp = NewOps[i];
56203 NewOps[i] = NewOps[j];
56204 NewOps[j] = temp;
56205 }
56206 // Create new build vector with WidenVT and NewOps
56207 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56208 // Replace the old build vector with the new one. Bitcast the
56209 // new build vector to the type of the zext.
56210 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56211 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56212 return NewBV;
56213 }
56214 }
56215 return SDValue();
56216}
56217
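// Do target-specific combines on ISD::ZERO_EXTEND/ISD::ANY_EXTEND nodes.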
56218 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56219 TargetLowering::DAGCombinerInfo &DCI,
56220 const X86Subtarget &Subtarget) {
56221 SDLoc dl(N);
56222 SDValue N0 = N->getOperand(0);
56223 EVT VT = N->getValueType(0);
56224
56225 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56226 // FIXME: Is this needed? We don't seem to have any tests for it.
56227 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56228 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56229 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56230 N0->getOperand(1));
56231 bool ReplaceOtherUses = !N0.hasOneUse();
56232 DCI.CombineTo(N, Setcc);
56233 // Replace other uses with a truncate of the widened setcc_carry.
56234 if (ReplaceOtherUses) {
56235 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56236 N0.getValueType(), Setcc);
56237 DCI.CombineTo(N0.getNode(), Trunc);
56238 }
56239
56240 return SDValue(N, 0);
56241 }
56242
56243 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56244 return NewCMov;
56245
56246 if (DCI.isBeforeLegalizeOps())
56247 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56248 return V;
56249
56250 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56251 DAG, DCI, Subtarget))
56252 return V;
56253
56254 if (VT.isVector())
56255 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56256 return R;
56257
56258 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56259 return NewAdd;
56260
56261 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56262 return R;
56263
56264 // TODO: Combine with any target/faux shuffle.
56265 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56266 VT.getScalarSizeInBits() == 2 * N0.getScalarValueSizeInBits()) {
56267 SDValue N00 = N0.getOperand(0);
56268 SDValue N01 = N0.getOperand(1);
56269 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56270 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56271 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56272 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56273 return concatSubVectors(N00, N01, DAG, dl);
56274 }
56275 }
56276
56277 if (SDValue V = widenBuildVec(N, DAG))
56278 return V;
56279
56280 return SDValue();
56281}
56282
56283/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56284/// pre-promote its result type since vXi1 vectors don't get promoted
56285/// during type legalization.
56286 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56287 SDValue RHS, ISD::CondCode CC,
56288 const SDLoc &DL, SelectionDAG &DAG,
56289 const X86Subtarget &Subtarget) {
56290 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56291 VT.getVectorElementType() == MVT::i1 &&
56292 (OpVT.getVectorElementType() == MVT::i8 ||
56293 OpVT.getVectorElementType() == MVT::i16)) {
56294 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56295 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56296 }
56297 return SDValue();
56298}
56299
56300// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56301// eq/ne) is generated when using an integer as a mask. Instead of generating a
56302// broadcast + vptest, we can directly move the integer to a mask register.
56303 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56304 const SDLoc &DL, SelectionDAG &DAG,
56305 const X86Subtarget &Subtarget) {
56306 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56307 return SDValue();
56308
56309 if (!Subtarget.hasAVX512())
56310 return SDValue();
56311
56312 if (Op0.getOpcode() != ISD::AND)
56313 return SDValue();
56314
56315 SDValue Broadcast = Op0.getOperand(0);
56316 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56317 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56318 return SDValue();
56319
56320 SDValue Load = Op0.getOperand(1);
56321 EVT LoadVT = Load.getSimpleValueType();
56322
56323 APInt UndefElts;
56324 SmallVector<APInt, 32> EltBits;
56325 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56326 UndefElts, EltBits,
56327 /*AllowWholeUndefs*/ true,
56328 /*AllowPartialUndefs*/ false) ||
56329 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56330 return SDValue();
56331
56332 // Check if the constant pool contains only powers of 2 starting from some
56333 // 2^N. The table may also contain undefs because of widening of vector
56334 // operands.
56335 unsigned N = EltBits[0].logBase2();
56336 unsigned Len = UndefElts.getBitWidth();
56337 for (unsigned I = 1; I != Len; ++I) {
56338 if (UndefElts[I]) {
56339 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56340 return SDValue();
56341 break;
56342 }
56343
56344 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56345 return SDValue();
56346 }
56347
56348 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56349 SDValue BroadcastOp;
56350 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56351 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56352 Broadcast, DAG.getVectorIdxConstant(0, DL));
56353 } else {
56354 BroadcastOp = Broadcast.getOperand(0);
56355 if (BroadcastOp.getValueType().isVector())
56356 return SDValue();
56357 }
56358
56359 SDValue Masked = BroadcastOp;
56360 if (N != 0) {
56361 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56362 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56363
56364 if (NumDefinedElts > BroadcastOpBitWidth)
56365 return SDValue();
56366
56367 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56368 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56369 DAG.getConstant(N, DL, BroadcastOpVT));
56370 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56371 DAG.getConstant(Mask, DL, BroadcastOpVT));
56372 }
56373 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56374 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56375 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56376 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56377
56378 if (CC == ISD::SETEQ)
56379 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56380
56381 if (VT != MVT::v16i1)
56382 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56383 DAG.getVectorIdxConstant(0, DL));
56384
56385 return Bitcast;
56386}
56387
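// Do target-specific combines on ISD::SETCC nodes.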
56388 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56389 TargetLowering::DAGCombinerInfo &DCI,
56390 const X86Subtarget &Subtarget) {
56391 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56392 const SDValue LHS = N->getOperand(0);
56393 const SDValue RHS = N->getOperand(1);
56394 EVT VT = N->getValueType(0);
56395 EVT OpVT = LHS.getValueType();
56396 SDLoc DL(N);
56397
56398 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56399 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56400 Subtarget))
56401 return V;
56402 }
56403
56404 if (VT == MVT::i1) {
56405 X86::CondCode X86CC;
56406 if (SDValue V =
56407 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56408 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56409 }
56410
56411 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56412 if (OpVT.isScalarInteger()) {
56413 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56414 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56415 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56416 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56417 if (N0.getOperand(0) == N1)
56418 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56419 N0.getOperand(1));
56420 if (N0.getOperand(1) == N1)
56421 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56422 N0.getOperand(0));
56423 }
56424 return SDValue();
56425 };
56426 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56427 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56428 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56429 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56430
56431 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56432 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56433 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56434 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56435 if (N0.getOperand(0) == N1)
56436 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56437 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56438 if (N0.getOperand(1) == N1)
56439 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56440 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56441 }
56442 return SDValue();
56443 };
56444 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56445 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56446 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56447 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56448
56449 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56450 // cmpne(trunc(x),C) --> cmpne(x,C)
56451 // iff x upper bits are zero.
56452 if (LHS.getOpcode() == ISD::TRUNCATE &&
56453 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56454 isa<ConstantSDNode>(RHS) && !isNullConstant(RHS)) {
56455 EVT SrcVT = LHS.getOperand(0).getValueType();
56456 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56457 OpVT.getScalarSizeInBits());
56458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56459 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56460 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56461 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56462 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56463 }
56464
56465 // With C as a power of 2 and C != 0 and C != INT_MIN:
56466 // icmp eq Abs(X) C ->
56467 // (icmp eq A, C) | (icmp eq A, -C)
56468 // icmp ne Abs(X) C ->
56469 // (icmp ne A, C) & (icmp ne A, -C)
56470 // Both of these patterns can be better optimized in
56471 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56472 // integers which is checked above.
56473 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56474 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56475 const APInt &CInt = C->getAPIntValue();
56476 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56477 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56478 SDValue BaseOp = LHS.getOperand(0);
56479 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56480 SDValue SETCC1 = DAG.getSetCC(
56481 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56482 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56483 SETCC0, SETCC1);
56484 }
56485 }
56486 }
56487 }
56488 }
56489
56490 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56491 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56492 // Using temporaries to avoid messing up operand ordering for later
56493 // transformations if this doesn't work.
56494 SDValue Op0 = LHS;
56495 SDValue Op1 = RHS;
56496 ISD::CondCode TmpCC = CC;
56497 // Put build_vector on the right.
56498 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56499 std::swap(Op0, Op1);
56500 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56501 }
56502
56503 bool IsSEXT0 =
56504 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56505 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56506 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56507
56508 if (IsSEXT0 && IsVZero1) {
56509 assert(VT == Op0.getOperand(0).getValueType() &&
56510 "Unexpected operand type");
56511 if (TmpCC == ISD::SETGT)
56512 return DAG.getConstant(0, DL, VT);
56513 if (TmpCC == ISD::SETLE)
56514 return DAG.getConstant(1, DL, VT);
56515 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56516 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56517
56518 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56519 "Unexpected condition code!");
56520 return Op0.getOperand(0);
56521 }
56522
56523 if (IsVZero1)
56524 if (SDValue V =
56525 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56526 return V;
56527 }
56528
56529 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56530 // only signed comparisons (`PCMPGT`) are available, and on AVX512 it is often
56531 // better to use `PCMPGT` if the result is meant to stay in a vector (if it is
56532 // going to a mask, there are signed AVX512 comparisons).
56533 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56534 bool CanMakeSigned = false;
56535 if (ISD::isUnsignedIntSetCC(CC)) {
56536 KnownBits CmpKnown =
56537 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56538 // If we know LHS/RHS share the same sign bit at each element we can
56539 // make this signed.
56540 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56541 // across all lanes. So a pattern where the sign varies from lane to
56542 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56543 // missed. We could get around this by demanding each lane
56544 // independently, but this isn't the most important optimization and
56545 // that may eat into compile time.
56546 CanMakeSigned =
56547 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56548 }
56549 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56550 SDValue LHSOut = LHS;
56551 SDValue RHSOut = RHS;
56552 ISD::CondCode NewCC = CC;
56553 switch (CC) {
56554 case ISD::SETGE:
56555 case ISD::SETUGE:
56556 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56557 /*NSW*/ true))
56558 LHSOut = NewLHS;
56559 else if (SDValue NewRHS = incDecVectorConstant(
56560 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56561 RHSOut = NewRHS;
56562 else
56563 break;
56564
56565 [[fallthrough]];
56566 case ISD::SETUGT:
56567 NewCC = ISD::SETGT;
56568 break;
56569
56570 case ISD::SETLE:
56571 case ISD::SETULE:
56572 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56573 /*NSW*/ true))
56574 LHSOut = NewLHS;
56575 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56576 /*NSW*/ true))
56577 RHSOut = NewRHS;
56578 else
56579 break;
56580
56581 [[fallthrough]];
56582 case ISD::SETULT:
56583 // Will be swapped to SETGT in LowerVSETCC*.
56584 NewCC = ISD::SETLT;
56585 break;
56586 default:
56587 break;
56588 }
56589 if (NewCC != CC) {
56590 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56591 NewCC, DL, DAG, Subtarget))
56592 return R;
56593 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56594 }
56595 }
56596 }
56597
56598 if (SDValue R =
56599 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56600 return R;
56601
56602 // In the middle end transforms:
56603 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56604 // -> `(icmp ult (add x, -C), 2)`
56605 // Likewise inverted cases with `ugt`.
56606 //
56607 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56608 // in worse codegen. So, undo the middle-end transform and go back to `(or
56609 // (icmp eq), (icmp eq))` form.
56610 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56611 // the xmm approach.
56612 //
56613 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56614 // ne))` as it doesn't end up reducing the instruction count.
56615 // TODO: We might want to do this for avx512 as well if we `sext` the result.
56616 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56617 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56618 !Subtarget.hasAVX512() &&
56619 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56620 Subtarget.hasAVX2()) &&
56621 LHS.hasOneUse()) {
56622
56623 APInt CmpC;
56624 SDValue AddC = LHS.getOperand(1);
56625 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56627 // See which form we have depending on the constant/condition.
56628 SDValue C0 = SDValue();
56629 SDValue C1 = SDValue();
56630
56631 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56632 // we will end up generating an additional constant. Keeping it in the
56633 // current form has a slight latency cost, but it is probably worth saving a
56634 // constant.
56637 // Pass
56638 }
56639 // Normal Cases
56640 else if ((CC == ISD::SETULT && CmpC == 2) ||
56641 (CC == ISD::SETULE && CmpC == 1)) {
56642 // These will constant fold.
56643 C0 = DAG.getNegative(AddC, DL, OpVT);
56644 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56645 DAG.getAllOnesConstant(DL, OpVT));
56646 }
56647 // Inverted Cases
56648 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56649 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56650 // These will constant fold.
56651 C0 = DAG.getNOT(DL, AddC, OpVT);
56652 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56653 DAG.getAllOnesConstant(DL, OpVT));
56654 }
56655 if (C0 && C1) {
56656 SDValue NewLHS =
56657 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56658 SDValue NewRHS =
56659 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56660 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56661 }
56662 }
56663 }
56664
56665 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56666 // to avoid scalarization via legalization because v4i32 is not a legal type.
56667 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56668 LHS.getValueType() == MVT::v4f32)
56669 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56670
56671 // X pred 0.0 --> X pred -X
56672 // If the negation of X already exists, use it in the comparison. This removes
56673 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56674 // instructions in patterns with a 'select' node.
56676 SDVTList FNegVT = DAG.getVTList(OpVT);
56677 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56678 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56679 }
56680
56681 return SDValue();
56682}
56683
56686 const X86Subtarget &Subtarget) {
56687 SDValue Src = N->getOperand(0);
56688 MVT SrcVT = Src.getSimpleValueType();
56689 MVT VT = N->getSimpleValueType(0);
56690 unsigned NumBits = VT.getScalarSizeInBits();
56691 unsigned NumElts = SrcVT.getVectorNumElements();
56692 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56693 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56694
56695 // Perform constant folding.
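// For example (illustrative): a constant v4i32 source of {-1, 0, -1, 0} has
// the sign bit set in elements 0 and 2, so the MOVMSK folds to 0b0101.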
56696 APInt UndefElts;
56697 SmallVector<APInt, 32> EltBits;
56698 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56699 /*AllowWholeUndefs*/ true,
56700 /*AllowPartialUndefs*/ true)) {
56701 APInt Imm(32, 0);
56702 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56703 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56704 Imm.setBit(Idx);
56705
56706 return DAG.getConstant(Imm, SDLoc(N), VT);
56707 }
56708
56709 // Look through int->fp bitcasts that don't change the element width.
56710 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56711 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56712 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56713 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56714
56715 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56716 // with scalar comparisons.
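// For example (illustrative): for a v4i32 source, NotMask is the low 4 bits,
// so movmsk(not(x)) becomes xor(movmsk(x), 0xF).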
56717 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56718 SDLoc DL(N);
56719 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56720 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56721 return DAG.getNode(ISD::XOR, DL, VT,
56722 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56723 DAG.getConstant(NotMask, DL, VT));
56724 }
56725
56726 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56727 // results with scalar comparisons.
56728 if (Src.getOpcode() == X86ISD::PCMPGT &&
56729 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56730 SDLoc DL(N);
56731 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56732 return DAG.getNode(ISD::XOR, DL, VT,
56733 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56734 DAG.getConstant(NotMask, DL, VT));
56735 }
56736
56737 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56738 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56739 // iff pow2splat(c1).
56740 // Use KnownBits to determine if only a single bit is non-zero
56741 // in each element (pow2 or zero), and shift that bit to the msb.
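// For example (illustrative): with a hypothetical v16i8 x and c1 = splat(0x40),
// only bit 6 of each element can be set, so ShiftAmt = 1 and the shift moves
// that bit into the sign bit before taking the MOVMSK.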
56742 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56743 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56744 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56745 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56746 if (KnownLHS.countMaxPopulation() == 1 &&
56747 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56748 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56749 SDLoc DL(N);
56750 MVT ShiftVT = SrcVT;
56751 SDValue ShiftLHS = Src.getOperand(0);
56752 SDValue ShiftRHS = Src.getOperand(1);
56753 if (ShiftVT.getScalarType() == MVT::i8) {
56754 // vXi8 shifts - we only care about the signbit so we can use PSLLW.
56755 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56756 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56757 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56758 }
56759 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56760 ShiftLHS, ShiftAmt, DAG);
56761 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56762 ShiftRHS, ShiftAmt, DAG);
56763 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56764 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56765 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56766 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56767 }
56768 }
56769
56770 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56771 if (N->isOnlyUserOf(Src.getNode())) {
56773 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56774 APInt UndefElts;
56775 SmallVector<APInt, 32> EltBits;
56776 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56777 UndefElts, EltBits)) {
56778 APInt Mask = APInt::getZero(NumBits);
56779 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56780 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56781 Mask.setBit(Idx);
56782 }
56783 SDLoc DL(N);
56784 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56785 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56786 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56787 DAG.getConstant(Mask, DL, VT));
56788 }
56789 }
56790 }
56791
56792 // Simplify the inputs.
56793 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56794 APInt DemandedMask(APInt::getAllOnes(NumBits));
56795 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56796 return SDValue(N, 0);
56797
56798 return SDValue();
56799}
56800
56803 const X86Subtarget &Subtarget) {
56804 MVT VT = N->getSimpleValueType(0);
56805 unsigned NumBits = VT.getScalarSizeInBits();
56806
56807 // Simplify the inputs.
56808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56809 APInt DemandedMask(APInt::getAllOnes(NumBits));
56810 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56811 return SDValue(N, 0);
56812
56813 return SDValue();
56814}
56815
56819 SDValue Mask = MemOp->getMask();
56820
56821 // With vector masks we only demand the upper bit of the mask.
56822 if (Mask.getScalarValueSizeInBits() != 1) {
56823 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56824 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56825 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56826 if (N->getOpcode() != ISD::DELETED_NODE)
56827 DCI.AddToWorklist(N);
56828 return SDValue(N, 0);
56829 }
56830 }
56831
56832 return SDValue();
56833}
56834
56836 SDValue Index, SDValue Base, SDValue Scale,
56837 SelectionDAG &DAG) {
56838 SDLoc DL(GorS);
56839
56840 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56841 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56842 Gather->getMask(), Base, Index, Scale } ;
56843 return DAG.getMaskedGather(Gather->getVTList(),
56844 Gather->getMemoryVT(), DL, Ops,
56845 Gather->getMemOperand(),
56846 Gather->getIndexType(),
56847 Gather->getExtensionType());
56848 }
56849 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56850 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56851 Scatter->getMask(), Base, Index, Scale };
56852 return DAG.getMaskedScatter(Scatter->getVTList(),
56853 Scatter->getMemoryVT(), DL,
56854 Ops, Scatter->getMemOperand(),
56855 Scatter->getIndexType(),
56856 Scatter->isTruncatingStore());
56857}
56858
56861 SDLoc DL(N);
56862 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56863 SDValue Index = GorS->getIndex();
56864 SDValue Base = GorS->getBasePtr();
56865 SDValue Scale = GorS->getScale();
56866 EVT IndexVT = Index.getValueType();
56867 EVT IndexSVT = IndexVT.getVectorElementType();
56868 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56869 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56870 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56871
56872 if (DCI.isBeforeLegalize()) {
56873 // Attempt to move a shifted index into the address scale, which allows
56874 // further index truncation below.
56875 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56876 isa<ConstantSDNode>(Scale)) {
56877 unsigned ScaleAmt = Scale->getAsZExtVal();
56878 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56879 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56880 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56881 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56882 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56883 if (N->getOpcode() != ISD::DELETED_NODE)
56884 DCI.AddToWorklist(N);
56885 return SDValue(N, 0);
56886 }
56887 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56888 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56889 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56890 SDValue ShAmt = Index.getOperand(1);
56891 SDValue NewShAmt =
56892 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56893 DAG.getConstant(1, DL, ShAmt.getValueType()));
56894 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56895 Index.getOperand(0), NewShAmt);
56896 SDValue NewScale =
56897 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56898 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56899 }
56900 }
56901 }
56902
56903 // Shrink indices if they are larger than 32-bits.
56904 // Only do this before legalize types since v2i64 could become v2i32.
56905 // FIXME: We could check that the type is legal if we're after legalize
56906 // types, but then we would need to construct test cases where that happens.
56907 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56908 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56909
56910 // FIXME: We could support more than just constant fold, but we need to be
56911 // careful with costing. A truncate that can be optimized out would be
56912 // fine. Otherwise we might only want to create a truncate if it avoids
56913 // a split.
56914 if (SDValue TruncIndex =
56915 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56916 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56917
56918 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
56919 // there are sufficient sign bits. Only do this before legalize types to
56920 // avoid creating illegal types in truncate.
56921 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56922 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56923 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56924 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56925 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56926 }
56927
56928 // Shrink if we remove an illegal type.
56929 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56930 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56931 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56932 }
56933 }
56934 }
56935
56936 // Try to move splat adders from the index operand to the base pointer
56937 // operand, taking care to multiply by the scale. We can only do this when
56938 // the index element type is the same as the pointer type.
56939 // Otherwise we need to be sure the math doesn't wrap before the scale.
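// For example (illustrative): a gather with index = add(idx, splat(16)) and
// scale = 4 can instead use base + 64 with index = idx, since the splat
// constant must be pre-multiplied by the scale.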
56940 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56941 isa<ConstantSDNode>(Scale)) {
56942 uint64_t ScaleAmt = Scale->getAsZExtVal();
56943
56944 for (unsigned I = 0; I != 2; ++I)
56945 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56946 BitVector UndefElts;
56947 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56948 if (UndefElts.none()) {
56949 // If the splat value is constant we can add the scaled splat value
56950 // to the existing base.
56951 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56952 APInt Adder = C->getAPIntValue() * ScaleAmt;
56953 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56954 DAG.getConstant(Adder, DL, PtrVT));
56955 SDValue NewIndex = Index.getOperand(1 - I);
56956 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56957 }
56958 // For a non-constant splat, limit this to the non-scaled case.
56959 if (ScaleAmt == 1) {
56960 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56961 SDValue NewIndex = Index.getOperand(1 - I);
56962 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56963 }
56964 }
56965 }
56966 // It's also possible base is just a constant. In that case, just
56967 // replace it with 0 and move the displacement into the index.
56968 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56969 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56970 // Combine the constant build_vector and the constant base.
56971 Splat =
56972 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56973 // Add to the other half of the original Index add.
56974 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56975 Index.getOperand(1 - I), Splat);
56976 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56977 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56978 }
56979 }
56980 }
56981
56982 if (DCI.isBeforeLegalizeOps()) {
56983 // Make sure the index is either i32 or i64
56984 if (IndexWidth != 32 && IndexWidth != 64) {
56985 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56986 IndexVT = IndexVT.changeVectorElementType(EltVT);
56987 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56988 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56989 }
56990 }
56991
56992 // With vector masks we only demand the upper bit of the mask.
56993 SDValue Mask = GorS->getMask();
56994 if (Mask.getScalarValueSizeInBits() != 1) {
56995 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56996 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56997 if (N->getOpcode() != ISD::DELETED_NODE)
56998 DCI.AddToWorklist(N);
56999 return SDValue(N, 0);
57000 }
57001 }
57002
57003 return SDValue();
57004}
57005
57006// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57008 const X86Subtarget &Subtarget) {
57009 SDLoc DL(N);
57010 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57011 SDValue EFLAGS = N->getOperand(1);
57012
57013 // Try to simplify the EFLAGS and condition code operands.
57014 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57015 return getSETCC(CC, Flags, DL, DAG);
57016
57017 return SDValue();
57018}
57019
57020/// Optimize branch condition evaluation.
57022 const X86Subtarget &Subtarget) {
57023 SDLoc DL(N);
57024 SDValue EFLAGS = N->getOperand(3);
57025 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57026
57027 // Try to simplify the EFLAGS and condition code operands.
57028 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57029 // RAUW them under us.
57030 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57031 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57032 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57033 N->getOperand(1), Cond, Flags);
57034 }
57035
57036 return SDValue();
57037}
57038
57039// TODO: Could we move this to DAGCombine?
57041 SelectionDAG &DAG) {
57042 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57043 // to optimize away operation when it's from a constant.
57044 //
57045 // The general transformation is:
57046 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57047 // AND(VECTOR_CMP(x,y), constant2)
57048 // constant2 = UNARYOP(constant)
57049
57050 // Early exit if this isn't a vector operation, the operand of the
57051 // unary operation isn't a bitwise AND, or if the sizes of the operations
57052 // aren't the same.
57053 EVT VT = N->getValueType(0);
57054 bool IsStrict = N->isStrictFPOpcode();
57055 unsigned NumEltBits = VT.getScalarSizeInBits();
57056 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57057 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57058 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57059 VT.getSizeInBits() != Op0.getValueSizeInBits())
57060 return SDValue();
57061
57062 // Now check that the other operand of the AND is a constant. We could
57063 // make the transformation for non-constant splats as well, but it's unclear
57064 // that would be a benefit as it would not eliminate any operations, just
57065 // perform one more step in scalar code before moving to the vector unit.
57066 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57067 // Bail out if the vector isn't a constant.
57068 if (!BV->isConstant())
57069 return SDValue();
57070
57071 // Everything checks out. Build up the new and improved node.
57072 SDLoc DL(N);
57073 EVT IntVT = BV->getValueType(0);
57074 // Create a new constant of the appropriate type for the transformed
57075 // DAG.
57076 SDValue SourceConst;
57077 if (IsStrict)
57078 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57079 {N->getOperand(0), SDValue(BV, 0)});
57080 else
57081 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57082 // The AND node needs bitcasts to/from an integer vector type around it.
57083 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57084 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57085 MaskConst);
57086 SDValue Res = DAG.getBitcast(VT, NewAnd);
57087 if (IsStrict)
57088 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57089 return Res;
57090 }
57091
57092 return SDValue();
57093}
57094
57095/// If we are converting a value to floating-point, try to replace scalar
57096/// truncate of an extracted vector element with a bitcast. This tries to keep
57097/// the sequence on XMM registers rather than moving between vector and GPRs.
57099 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57100 // to allow being called by any similar cast opcode.
57101 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57102 SDValue Trunc = N->getOperand(0);
57103 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57104 return SDValue();
57105
57106 SDValue ExtElt = Trunc.getOperand(0);
57107 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57108 !isNullConstant(ExtElt.getOperand(1)))
57109 return SDValue();
57110
57111 EVT TruncVT = Trunc.getValueType();
57112 EVT SrcVT = ExtElt.getValueType();
57113 unsigned DestWidth = TruncVT.getSizeInBits();
57114 unsigned SrcWidth = SrcVT.getSizeInBits();
57115 if (SrcWidth % DestWidth != 0)
57116 return SDValue();
57117
57118 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57119 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57120 unsigned VecWidth = SrcVecVT.getSizeInBits();
57121 unsigned NumElts = VecWidth / DestWidth;
57122 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57123 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57124 SDLoc DL(N);
57125 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57126 BitcastVec, ExtElt.getOperand(1));
57127 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57128}
57129
57131 const X86Subtarget &Subtarget) {
57132 bool IsStrict = N->isStrictFPOpcode();
57133 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57134 EVT VT = N->getValueType(0);
57135 EVT InVT = Op0.getValueType();
57136
57137 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57138 // for it. Therefore, for type sizes of 32 or smaller, just go with i32.
57139 // if hasFP16 support:
57140 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57141 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57142 // else
57143 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57144 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57145 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57146 unsigned ScalarSize = InVT.getScalarSizeInBits();
57147 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57148 ScalarSize >= 64)
57149 return SDValue();
57150 SDLoc dl(N);
57151 EVT DstVT =
57153 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57154 : ScalarSize < 32 ? MVT::i32
57155 : MVT::i64,
57156 InVT.getVectorNumElements());
57157 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57158 if (IsStrict)
57159 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57160 {N->getOperand(0), P});
57161 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57162 }
57163
57164 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57165 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57166 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57167 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57168 VT.getScalarType() != MVT::f16) {
57169 SDLoc dl(N);
57170 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57171 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57172
57173 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57174 if (IsStrict)
57175 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57176 {N->getOperand(0), P});
57177 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57178 }
57179
57180 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57181 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57182 // the optimization here.
57183 SDNodeFlags Flags = N->getFlags();
57184 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57185 if (IsStrict)
57186 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57187 {N->getOperand(0), Op0});
57188 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57189 }
57190
57191 return SDValue();
57192}
57193
57196 const X86Subtarget &Subtarget) {
57197 // First try to optimize away the conversion entirely when it's
57198 // conditionally from a constant. Vectors only.
57199 bool IsStrict = N->isStrictFPOpcode();
57201 return Res;
57202
57203 // Now move on to more general possibilities.
57204 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57205 EVT VT = N->getValueType(0);
57206 EVT InVT = Op0.getValueType();
57207
57208 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57209 // for it. Therefore, for type sizes of 32 or smaller, just go with i32.
57210 // if hasFP16 support:
57211 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57212 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57213 // else
57214 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57215 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57216 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57217 unsigned ScalarSize = InVT.getScalarSizeInBits();
57218 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57219 ScalarSize >= 64)
57220 return SDValue();
57221 SDLoc dl(N);
57222 EVT DstVT =
57224 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57225 : ScalarSize < 32 ? MVT::i32
57226 : MVT::i64,
57227 InVT.getVectorNumElements());
57228 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57229 if (IsStrict)
57230 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57231 {N->getOperand(0), P});
57232 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57233 }
57234
57235 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57236 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57237 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57238 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57239 VT.getScalarType() != MVT::f16) {
57240 SDLoc dl(N);
57241 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57242 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57243 if (IsStrict)
57244 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57245 {N->getOperand(0), P});
57246 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57247 }
57248
57249 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57250 // vectors and scalars, see if we know that the upper bits are all the sign
57251 // bit, in which case we can truncate the input to i32 and convert from that.
57252 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57253 unsigned BitWidth = InVT.getScalarSizeInBits();
57254 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57255 if (NumSignBits >= (BitWidth - 31)) {
57256 EVT TruncVT = MVT::i32;
57257 if (InVT.isVector())
57258 TruncVT = InVT.changeVectorElementType(TruncVT);
57259 SDLoc dl(N);
57260 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57261 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57262 if (IsStrict)
57263 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57264 {N->getOperand(0), Trunc});
57265 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57266 }
57267 // If we're after legalize and the type is v2i32 we need to shuffle and
57268 // use CVTSI2P.
57269 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57270 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57271 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57272 { 0, 2, -1, -1 });
57273 if (IsStrict)
57274 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57275 {N->getOperand(0), Shuf});
57276 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57277 }
57278 }
57279
57280 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57281 // a 32-bit target where SSE doesn't support i64->FP operations.
57282 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57283 Op0.getOpcode() == ISD::LOAD) {
57284 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57285
57286 // This transformation is not supported if the result type is f16 or f128.
57287 if (VT == MVT::f16 || VT == MVT::f128)
57288 return SDValue();
57289
57290 // If we have AVX512DQ we can use packed conversion instructions unless
57291 // the VT is f80.
57292 if (Subtarget.hasDQI() && VT != MVT::f80)
57293 return SDValue();
57294
57295 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57296 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57297 std::pair<SDValue, SDValue> Tmp =
57298 Subtarget.getTargetLowering()->BuildFILD(
57299 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57300 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57301 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57302 return Tmp.first;
57303 }
57304 }
57305
57306 if (IsStrict)
57307 return SDValue();
57308
57309 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57310 return V;
57311
57312 return SDValue();
57313}
57314
57316 const X86Subtarget &Subtarget) {
57317 EVT VT = N->getValueType(0);
57318 SDValue Src = N->getOperand(0);
57319 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57320 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57321 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57322
57323 return SDValue();
57324}
57325
57326// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57328 const X86Subtarget &Subtarget) {
57329 if (!Subtarget.hasAVX10_2())
57330 return SDValue();
57331
57332 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57333 EVT SrcVT = N->getOperand(0).getValueType();
57334 EVT DstVT = N->getValueType(0);
57335 SDLoc dl(N);
57336
57337 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57338 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57339
57340 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57341 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57342 N->getOperand(0), V2F32Value);
57343
57344 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57345 if (IsSigned)
57346 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57347
57348 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57349 }
57350 return SDValue();
57351}
57352
57354 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57355
57356 for (const SDNode *User : Flags->users()) {
57357 X86::CondCode CC;
57358 switch (User->getOpcode()) {
57359 default:
57360 // Be conservative.
57361 return true;
57362 case X86ISD::SETCC:
57364 CC = (X86::CondCode)User->getConstantOperandVal(0);
57365 break;
57366 case X86ISD::BRCOND:
57367 case X86ISD::CMOV:
57368 CC = (X86::CondCode)User->getConstantOperandVal(2);
57369 break;
57370 }
57371
57372 switch (CC) {
57373 // clang-format off
57374 default: break;
57375 case X86::COND_A: case X86::COND_AE:
57376 case X86::COND_B: case X86::COND_BE:
57377 case X86::COND_O: case X86::COND_NO:
57378 case X86::COND_G: case X86::COND_GE:
57379 case X86::COND_L: case X86::COND_LE:
57380 return true;
57381 // clang-format on
57382 }
57383 }
57384
57385 return false;
57386}
57387
57388static bool onlyZeroFlagUsed(SDValue Flags) {
57389 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57390
57391 for (const SDNode *User : Flags->users()) {
57392 unsigned CCOpNo;
57393 switch (User->getOpcode()) {
57394 default:
57395 // Be conservative.
57396 return false;
57397 case X86ISD::SETCC:
57399 CCOpNo = 0;
57400 break;
57401 case X86ISD::BRCOND:
57402 case X86ISD::CMOV:
57403 CCOpNo = 2;
57404 break;
57405 }
57406
57407 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57408 if (CC != X86::COND_E && CC != X86::COND_NE)
57409 return false;
57410 }
57411
57412 return true;
57413}
57414
57417 const X86Subtarget &Subtarget) {
57418 // Only handle test patterns.
57419 if (!isNullConstant(N->getOperand(1)))
57420 return SDValue();
57421
57422 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57423 // and use its flags directly.
57424 // TODO: Maybe we should try promoting compares that only use the zero flag
57425 // first if we can prove the upper bits with computeKnownBits?
57426 SDLoc dl(N);
57427 SDValue Op = N->getOperand(0);
57428 EVT VT = Op.getValueType();
57429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57430
57431 if (SDValue CMP =
57432 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57433 return CMP;
57434
57435 // If we have a constant logical shift that's only used in a comparison
57436 // against zero turn it into an equivalent AND. This allows turning it into
57437 // a TEST instruction later.
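// For example (illustrative): (cmp (srl x, 8), 0) on i32 becomes
// (cmp (and x, 0xFFFFFF00), 0), which isel can lower to a TEST instruction.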
57438 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57439 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57440 onlyZeroFlagUsed(SDValue(N, 0))) {
57441 unsigned BitWidth = VT.getSizeInBits();
57442 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57443 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57444 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57445 APInt Mask = Op.getOpcode() == ISD::SRL
57446 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57447 : APInt::getLowBitsSet(BitWidth, MaskBits);
57448 if (Mask.isSignedIntN(32)) {
57449 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57450 DAG.getConstant(Mask, dl, VT));
57451 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57452 DAG.getConstant(0, dl, VT));
57453 }
57454 }
57455 }
57456
57457 // If we're extracting from an AVX512 bool vector and comparing against zero,
57458 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57459 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57460 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57461 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57462 SDValue Src = Op.getOperand(0);
57463 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57464 isNullConstant(Src.getOperand(1)) &&
57465 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57466 SDValue BoolVec = Src.getOperand(0);
57467 unsigned ShAmt = 0;
57468 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57469 ShAmt = BoolVec.getConstantOperandVal(1);
57470 BoolVec = BoolVec.getOperand(0);
57471 }
57472 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57473 EVT VecVT = BoolVec.getValueType();
57474 unsigned BitWidth = VecVT.getVectorNumElements();
57475 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57476 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57477 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57478 Op = DAG.getBitcast(BCVT, BoolVec);
57479 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57480 DAG.getConstant(Mask, dl, BCVT));
57481 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57482 DAG.getConstant(0, dl, BCVT));
57483 }
57484 }
57485 }
57486
57487 // Peek through any zero-extend if we're only testing for a zero result.
57488 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57489 SDValue Src = Op.getOperand(0);
57490 EVT SrcVT = Src.getValueType();
57491 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57492 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57493 DAG.getConstant(0, dl, SrcVT));
57494 }
57495
57496 // Look for a truncate.
57497 if (Op.getOpcode() != ISD::TRUNCATE)
57498 return SDValue();
57499
57500 SDValue Trunc = Op;
57501 Op = Op.getOperand(0);
57502
57503 // See if we can compare with zero against the truncation source,
57504 // which should help using the Z flag from many ops. Only do this for
57505 // i32 truncated op to prevent partial-reg compares of promoted ops.
57506 EVT OpVT = Op.getValueType();
57507 APInt UpperBits =
57509 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57510 onlyZeroFlagUsed(SDValue(N, 0))) {
57511 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57512 DAG.getConstant(0, dl, OpVT));
57513 }
57514
57515 // After this the truncate and arithmetic op must have a single use.
57516 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57517 return SDValue();
57518
57519 unsigned NewOpc;
57520 switch (Op.getOpcode()) {
57521 default: return SDValue();
57522 case ISD::AND:
57523 // Skip AND with a constant. We have special handling for AND with an
57524 // immediate during isel to generate TEST instructions.
57525 if (isa<ConstantSDNode>(Op.getOperand(1)))
57526 return SDValue();
57527 NewOpc = X86ISD::AND;
57528 break;
57529 case ISD::OR: NewOpc = X86ISD::OR; break;
57530 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57531 case ISD::ADD:
57532 // If the carry or overflow flag is used, we can't truncate.
57534 return SDValue();
57535 NewOpc = X86ISD::ADD;
57536 break;
57537 case ISD::SUB:
57538 // If the carry or overflow flag is used, we can't truncate.
57540 return SDValue();
57541 NewOpc = X86ISD::SUB;
57542 break;
57543 }
57544
57545 // We found an op we can narrow. Truncate its inputs.
57546 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57547 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57548
57549 // Use a X86 specific opcode to avoid DAG combine messing with it.
57550 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57551 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57552
57553 // For AND, keep a CMP so that we can match the test pattern.
57554 if (NewOpc == X86ISD::AND)
57555 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57556 DAG.getConstant(0, dl, VT));
57557
57558 // Return the flags.
57559 return Op.getValue(1);
57560}
57561
57564 const X86Subtarget &ST) {
57565 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57566 "Expected X86ISD::ADD or X86ISD::SUB");
57567
57568 SDLoc DL(N);
57569 SDValue LHS = N->getOperand(0);
57570 SDValue RHS = N->getOperand(1);
57571 MVT VT = LHS.getSimpleValueType();
57572 bool IsSub = X86ISD::SUB == N->getOpcode();
57573 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57574
57575 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57576 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57577 return CMP;
57578
57579 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57580 if (!N->hasAnyUseOfValue(1)) {
57581 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57582 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57583 }
57584
57585 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57586 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57587 SDValue Ops[] = {N0, N1};
57588 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57589 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57590 SDValue Op(N, 0);
57591 if (Negate) {
57592 // Bail if this is only used by a user of the x86 add/sub.
57593 if (GenericAddSub->hasOneUse() &&
57594 GenericAddSub->user_begin()->isOnlyUserOf(N))
57595 return;
57596 Op = DAG.getNegative(Op, DL, VT);
57597 }
57598 DCI.CombineTo(GenericAddSub, Op);
57599 }
57600 };
57601 MatchGeneric(LHS, RHS, false);
57602 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57603
57604 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57605 // EFLAGS result doesn't change.
57606 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57607 /*ZeroSecondOpOnly*/ true);
57608}
57609
57611 SDValue LHS = N->getOperand(0);
57612 SDValue RHS = N->getOperand(1);
57613 SDValue BorrowIn = N->getOperand(2);
57614
57615 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57616 MVT VT = N->getSimpleValueType(0);
57617 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57618 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57619 }
57620
57621 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57622 // iff the flag result is dead.
57623 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57624 !N->hasAnyUseOfValue(1))
57625 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57626 LHS.getOperand(1), BorrowIn);
57627
57628 return SDValue();
57629}
57630
57631// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57634 SDValue LHS = N->getOperand(0);
57635 SDValue RHS = N->getOperand(1);
57636 SDValue CarryIn = N->getOperand(2);
57637 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57638 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57639
57640 // Canonicalize constant to RHS.
57641 if (LHSC && !RHSC)
57642 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57643 CarryIn);
57644
57645 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57646 // the result is either zero or one (depending on the input carry bit).
57647 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
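// For example (illustrative): adding two zeros with an incoming carry can only
// produce 0 or 1, so the result is just the carry bit itself and the outgoing
// carry is always 0.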
57648 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57649 // We don't have a good way to replace an EFLAGS use, so only do this when
57650 // it is dead right now.
57651 SDValue(N, 1).use_empty()) {
57652 SDLoc DL(N);
57653 EVT VT = N->getValueType(0);
57654 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57655 SDValue Res1 = DAG.getNode(
57656 ISD::AND, DL, VT,
57658 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57659 DAG.getConstant(1, DL, VT));
57660 return DCI.CombineTo(N, Res1, CarryOut);
57661 }
57662
57663 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57664 // iff the flag result is dead.
57665 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57666 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57667 SDLoc DL(N);
57668 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57669 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57670 DAG.getConstant(0, DL, LHS.getValueType()),
57671 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57672 }
57673
57674 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57675 MVT VT = N->getSimpleValueType(0);
57676 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57677 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57678 }
57679
57680 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57681 // iff the flag result is dead.
57682 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57683 !N->hasAnyUseOfValue(1))
57684 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57685 LHS.getOperand(1), CarryIn);
57686
57687 return SDValue();
57688}
57689
57691 const SDLoc &DL, EVT VT,
57692 const X86Subtarget &Subtarget) {
57693 using namespace SDPatternMatch;
57694
57695 // Example of pattern we try to detect:
57696 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57697 //(add (build_vector (extract_elt t, 0),
57698 // (extract_elt t, 2),
57699 // (extract_elt t, 4),
57700 // (extract_elt t, 6)),
57701 // (build_vector (extract_elt t, 1),
57702 // (extract_elt t, 3),
57703 // (extract_elt t, 5),
57704 // (extract_elt t, 7)))
57705
57706 if (!Subtarget.hasSSE2())
57707 return SDValue();
57708
57709 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57710 VT.getVectorNumElements() < 4 ||
57712 return SDValue();
57713
57714 SDValue Op0, Op1, Accum;
57719 m_Value(Op1))))))
57720 return SDValue();
57721
57722 // Check if one of Op0,Op1 is of the form:
57723 // (build_vector (extract_elt Mul, 0),
57724 // (extract_elt Mul, 2),
57725 // (extract_elt Mul, 4),
57726 // ...
57727 // the other is of the form:
57728 // (build_vector (extract_elt Mul, 1),
57729 // (extract_elt Mul, 3),
57730 // (extract_elt Mul, 5),
57731 // ...
57732 // and identify Mul.
57733 SDValue Mul;
57734 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57735 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57736 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57737 // TODO: Be more tolerant to undefs.
57738 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57739 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57740 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57741 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57742 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57743 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57744 return SDValue();
57745 // Commutativity of mul allows factors of a product to reorder.
57746 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57747 std::swap(Idx0L, Idx1L);
57748 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57749 std::swap(Idx0H, Idx1H);
57750 // Commutativity of add allows pairs of factors to reorder.
57751 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57752 std::swap(Idx0L, Idx0H);
57753 std::swap(Idx1L, Idx1H);
57754 }
57755 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57756 Idx1H != 2 * i + 3)
57757 return SDValue();
57758 if (!Mul) {
57759 // First time an extract_elt's source vector is visited. Must be a MUL
57760 // with 2X the number of vector elements of the BUILD_VECTOR.
57761 // Both extracts must be from same MUL.
57762 Mul = Vec0L;
57763 if (Mul.getOpcode() != ISD::MUL ||
57764 Mul.getValueType().getVectorNumElements() != 2 * e)
57765 return SDValue();
57766 }
57767 // Check that the extract is from the same MUL previously seen.
57768 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57769 return SDValue();
57770 }
57771
57772 // Check if the Mul source can be safely shrunk.
57774 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57776 return SDValue();
57777
57778 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57779 VT.getVectorNumElements() * 2);
57780 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57781 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57782
57783 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57785 EVT InVT = Ops[0].getValueType();
57786 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57787 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57788 InVT.getVectorNumElements() / 2);
57789 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57790 };
57791 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57792 if (Accum)
57793 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57794 return R;
57795}
57796
57797// Attempt to turn this pattern into PMADDWD.
57798// (add (mul (sext (build_vector)), (sext (build_vector))),
57799// (mul (sext (build_vector)), (sext (build_vector)))
57801 const SDLoc &DL, EVT VT,
57802 const X86Subtarget &Subtarget) {
57803 using namespace SDPatternMatch;
57804
57805 if (!Subtarget.hasSSE2())
57806 return SDValue();
57807
57808 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57809 VT.getVectorNumElements() < 4 ||
57811 return SDValue();
57812
57813 // All inputs need to be sign extends.
57814 // TODO: Support ZERO_EXTEND from known positive?
57815 SDValue N00, N01, N10, N11;
57816 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57817 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57818 return SDValue();
57819
57820 // Must be extending from vXi16.
57821 EVT InVT = N00.getValueType();
57822 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57823 N10.getValueType() != InVT || N11.getValueType() != InVT)
57824 return SDValue();
57825
57826 // All inputs should be build_vectors.
57827 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57828 N01.getOpcode() != ISD::BUILD_VECTOR ||
57829 N10.getOpcode() != ISD::BUILD_VECTOR ||
57831 return SDValue();
57832
57833 // For each element, we need to ensure we have an odd element from one vector
57834 // multiplied by the odd element of another vector and the even element from
57835 // one of the same vectors being multiplied by the even element from the
57836 // other vector. So for each element i we need to make sure this operation
57837 // is being performed:
57838 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57839 SDValue In0, In1;
57840 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57841 SDValue N00Elt = N00.getOperand(i);
57842 SDValue N01Elt = N01.getOperand(i);
57843 SDValue N10Elt = N10.getOperand(i);
57844 SDValue N11Elt = N11.getOperand(i);
57845 // TODO: Be more tolerant to undefs.
57846 SDValue N00In, N01In, N10In, N11In;
57847 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57848 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57849 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57850 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57851 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57852 return SDValue();
57853 // Add is commutative so indices can be reordered.
57854 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57855 std::swap(IdxN00, IdxN10);
57856 std::swap(IdxN01, IdxN11);
57857 }
57858 // N0 indices must be the even element. N1 indices must be the next odd element.
57859 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57860 IdxN11 != 2 * i + 1)
57861 return SDValue();
57862
57863 // First time we find an input capture it.
57864 if (!In0) {
57865 In0 = N00In;
57866 In1 = N01In;
57867
57868 // The input vectors must be at least as wide as the output.
57869 // If they are larger than the output, we extract a subvector below.
57870 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57871 In1.getValueSizeInBits() < VT.getSizeInBits())
57872 return SDValue();
57873 }
57874 // Mul is commutative so the input vectors can be in any order.
57875 // Canonicalize to make the compares easier.
57876 if (In0 != N00In)
57877 std::swap(N00In, N01In);
57878 if (In0 != N10In)
57879 std::swap(N10In, N11In);
57880 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57881 return SDValue();
57882 }
57883
57884 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57886 EVT OpVT = Ops[0].getValueType();
57887 assert(OpVT.getScalarType() == MVT::i16 &&
57888 "Unexpected scalar element type");
57889 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57890 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57891 OpVT.getVectorNumElements() / 2);
57892 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57893 };
57894
57895 // If the output is narrower than an input, extract the low part of the input
57896 // vector.
57897 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57898 VT.getVectorNumElements() * 2);
57899 if (OutVT16.bitsLT(In0.getValueType())) {
57900 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57901 DAG.getVectorIdxConstant(0, DL));
57902 }
57903 if (OutVT16.bitsLT(In1.getValueType())) {
57904 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57905 DAG.getVectorIdxConstant(0, DL));
57906 }
57907 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57908 PMADDBuilder);
57909}
57910
57911// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57912 // If the upper element in each pair of both VPMADDWD operands is zero then
57913 // we can merge the operand elements and use the implicit add of VPMADDWD.
57914// TODO: Add support for VPMADDUBSW (which isn't commutable).
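// For example (illustrative): if the odd i16 element of each pair is zero in
// one operand of each VPMADDWD, every i32 lane is a single 16x16 product, so
// interleaving the low elements lets a single VPMADDWD's implicit pairwise add
// perform the outer ADD.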
57916 const SDLoc &DL, EVT VT) {
57917 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57918 return SDValue();
57919
57920 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57921 if (VT.getSizeInBits() > 128)
57922 return SDValue();
57923
57924 unsigned NumElts = VT.getVectorNumElements();
57925 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57927 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57928
57929 bool Op0HiZero =
57930 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57931 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57932 bool Op1HiZero =
57933 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57934 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57935
57936 // TODO: Check for zero lower elements once we have actual codegen that
57937 // creates them.
57938 if (!Op0HiZero || !Op1HiZero)
57939 return SDValue();
57940
57941 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57942 SmallVector<int> Mask;
57943 for (int i = 0; i != (int)NumElts; ++i) {
57944 Mask.push_back(2 * i);
57945 Mask.push_back(2 * (i + NumElts));
57946 }
57947
57948 SDValue LHS =
57949 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57950 SDValue RHS =
57951 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57952 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57953}
57954
57955/// CMOV of constants requires materializing constant operands in registers.
57956/// Try to fold those constants into an 'add' instruction to reduce instruction
57957 /// count. We do this with CMOV rather than the generic 'select' because there are
57958/// earlier folds that may be used to turn select-of-constants into logic hacks.
57960 SelectionDAG &DAG,
57961 const X86Subtarget &Subtarget) {
57962 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57963 // better because we eliminate 1-2 instructions. This transform is still
57964 // an improvement without zero operands because we trade 2 move constants and
57965 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57966 // immediate asm operands (fit in 32-bits).
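// For example (illustrative): add (cmov 0, 42), x becomes cmov x, (add x, 42);
// the zero operand disappears and 42 becomes an add immediate instead of a
// separately materialized constant.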
57967 auto isSuitableCmov = [](SDValue V) {
57968 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57969 return false;
57970 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57971 !isa<ConstantSDNode>(V.getOperand(1)))
57972 return false;
57973 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57974 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57975 V.getConstantOperandAPInt(1).isSignedIntN(32));
57976 };
57977
57978 // Match an appropriate CMOV as the first operand of the add.
57979 SDValue Cmov = N->getOperand(0);
57980 SDValue OtherOp = N->getOperand(1);
57981 if (!isSuitableCmov(Cmov))
57982 std::swap(Cmov, OtherOp);
57983 if (!isSuitableCmov(Cmov))
57984 return SDValue();
57985
57986 // Don't remove a load folding opportunity for the add. That would neutralize
57987 // any improvements from removing constant materializations.
57988 if (X86::mayFoldLoad(OtherOp, Subtarget))
57989 return SDValue();
57990
57991 EVT VT = N->getValueType(0);
57992 SDValue FalseOp = Cmov.getOperand(0);
57993 SDValue TrueOp = Cmov.getOperand(1);
57994
57995 // We will push the add through the select, but we can potentially do better
57996 // if we know there is another add in the sequence and this is pointer math.
57997 // In that case, we can absorb an add into the trailing memory op and avoid
57998 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57999 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58000 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58001 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58002 all_of(N->users(), [&](SDNode *Use) {
58003 auto *MemNode = dyn_cast<MemSDNode>(Use);
58004 return MemNode && MemNode->getBasePtr().getNode() == N;
58005 })) {
58006 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58007 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58008 // it is possible that choosing op1 might be better.
58009 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58010 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58011 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58012 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58013 Cmov.getOperand(2), Cmov.getOperand(3));
58014 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58015 }
58016
58017 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58018 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58019 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58020 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58021 Cmov.getOperand(3));
58022}
58023
58024 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58025 // when the upper 12 bits of x, y, and MUL(x, y) are known to be 0.
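// For example (illustrative): if x is known to fit in 20 bits and y in 32
// bits, the product fits in 52 bits, so the low-52-bit multiply-add of
// VPMADD52L matches the full 64-bit MUL + ADD.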
58027 EVT VT, const X86Subtarget &Subtarget) {
58028 using namespace SDPatternMatch;
58029 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58030 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58031 return SDValue();
58032
58033 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58034 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58035 VT.getSizeInBits() < 512)
58036 return SDValue();
58037
58038 const auto TotalSize = VT.getSizeInBits();
58039 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58040 return SDValue();
58041
58042 SDValue X, Y, Acc;
58043 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58044 return SDValue();
58045
58046 KnownBits KnownX = DAG.computeKnownBits(X);
58047 if (KnownX.countMinLeadingZeros() < 12)
58048 return SDValue();
58049 KnownBits KnownY = DAG.computeKnownBits(Y);
58050 if (KnownY.countMinLeadingZeros() < 12)
58051 return SDValue();
58052 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58053 if (KnownMul.countMinLeadingZeros() < 12)
58054 return SDValue();
58055
58056 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58057 ArrayRef<SDValue> SubOps) {
58058 EVT SubVT = SubOps[0].getValueType();
58059 assert(SubVT.getScalarSizeInBits() == 64 &&
58060 "Unexpected element size, only supports 64bit size");
58061 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58062 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58063 };
58064
58065 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58066 /*CheckBWI*/ false,
58067 /*AllowAVX512*/ Subtarget.hasIFMA());
58068}
58069
58072 const X86Subtarget &Subtarget) {
58073 using namespace SDPatternMatch;
58074 EVT VT = N->getValueType(0);
58075 SDValue Op0 = N->getOperand(0);
58076 SDValue Op1 = N->getOperand(1);
58077 SDLoc DL(N);
58078
58079 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58080 return Select;
58081
58082 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58083 return MAdd;
58084 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58085 return MAdd;
58086 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58087 return MAdd;
58088
58089 // Try to synthesize horizontal adds from adds of shuffles.
58090 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58091 return V;
58092
58093 // Canonicalize hidden LEA pattern:
58094 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58095 // iff c < 4
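// For example (illustrative): (add (sub (shl x, 2), y), z) becomes
// (sub (add (shl x, 2), z), y); the inner add can then lower to
// lea r, [z + x*4], leaving only the subtract of y.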
58096 if (VT == MVT::i32 || VT == MVT::i64) {
58097 SDValue Y, Z, Shift;
58098 APInt Amt;
58099 if (sd_match(
58101 m_Shl(m_Value(), m_ConstInt(Amt))),
58102 m_Value(Y))),
58103 m_Value(Z))) &&
58104 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58105 return DAG.getNode(ISD::SUB, DL, VT,
58106 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58107 }
58108 }
58109
58110 SDValue X, Y;
58111
58112 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58113 // iff X and Y won't overflow.
58114 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58116 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58117 MVT OpVT = X.getSimpleValueType();
58118 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58119 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58120 getZeroVector(OpVT, Subtarget, DAG, DL));
58121 }
58122
58123 if (VT.isVector()) {
58124 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58125                                   VT.getVectorElementCount());
58126
58127 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58128 // (sub Y, (sext (vXi1 X))).
58129 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58130 // in generic DAG combine without a legal type check, but adding this there
58131 // caused regressions.
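    // sext(vXi1) is 0 or -1 per lane, so (sub Y, sext(X)) adds 0 or 1 exactly
    // like (add (zext X), Y).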
58132 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58134 m_Value(Y)))) {
58135 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58136 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58137 }
58138
58139 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58140 // canonicalisation as we don't have good vXi8 shifts.
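    // (srl Y, 7) extracts the i8 sign bit (0 or 1), and (0 > Y) sign-extends
    // to 0 or -1, so subtracting the compare result adds the same amount.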
58141 if (VT.getScalarType() == MVT::i8 &&
58142         sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58143       SDValue Cmp =
58144 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58145 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58146 }
58147 }
58148
58149   // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58150 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
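  // Without VLX only the 512-bit VNNI form is legal, so concatenate the two
  // 256-bit VPMADDWD halves and fold the accumulator add into one VPDPWSSD.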
58151 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58152 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58153 if (sd_match(N, m_Add(m_Value(Accum),
58156 m_Value(Lo1)),
58158 m_Value(Hi1)))))) {
58159 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58160 concatSubVectors(Lo0, Hi0, DAG, DL),
58161 concatSubVectors(Lo1, Hi1, DAG, DL));
58162 }
58163 }
58164
58165 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58166 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58167 X86::isZeroNode(Op0.getOperand(1))) {
58168 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58169 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58170 Op0.getOperand(0), Op0.getOperand(2));
58171 }
58172
58173 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58174 return IFMA52;
58175
58176 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58177}
58178
58179// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58180// condition comes from the subtract node that produced -X. This matches the
58181// cmov expansion for absolute value. By swapping the operands we convert abs
58182// to nabs.
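// In effect (Y - |X|) becomes (Y + -|X|) purely by swapping the CMOV arms,
// with no additional instructions.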
58183static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58184 SelectionDAG &DAG) {
58185 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58186 return SDValue();
58187
58188 SDValue Cond = N1.getOperand(3);
58189 if (Cond.getOpcode() != X86ISD::SUB)
58190 return SDValue();
58191 assert(Cond.getResNo() == 1 && "Unexpected result number");
58192
58193 SDValue FalseOp = N1.getOperand(0);
58194 SDValue TrueOp = N1.getOperand(1);
58195   X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58196
58197 // ABS condition should come from a negate operation.
58198 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58199 isNullConstant(Cond.getOperand(0))) {
58200 // Get the X and -X from the negate.
58201 SDValue NegX = Cond.getValue(0);
58202 SDValue X = Cond.getOperand(1);
58203
58204 // Cmov operands should be X and NegX. Order doesn't matter.
58205 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58206 return SDValue();
58207
58208 // Build a new CMOV with the operands swapped.
58209 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58210 N1.getOperand(2), Cond);
58211 // Convert sub to add.
58212 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58213 }
58214
58215 // Handle ABD special case:
58216 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58217 // ABD condition should come from a pair of matching subtracts.
58218 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58219 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58220 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58221 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58222 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58223 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58224 // Build a new CMOV with the operands swapped.
58225 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58226 Cond);
58227 }
58228
58229 return SDValue();
58230}
58231
58232 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58233   SDValue Op0 = N->getOperand(0);
58234 SDValue Op1 = N->getOperand(1);
58235
58236 // (sub C (zero_extend (setcc)))
58237 // =>
58238   // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58239 // Don't disturb (sub 0 setcc), which is easily done with neg.
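  // e.g. (sub 5, (zext (setcc cc))) -> (add (zext (setcc !cc)), 4), since the
  // zero-extended setcc is either 0 or 1.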
58240 EVT VT = N->getValueType(0);
58241 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58242 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58243 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58244 Op1.getOperand(0).hasOneUse()) {
58245 SDValue SetCC = Op1.getOperand(0);
58246     X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58247     X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58248     APInt NewImm = Op0C->getAPIntValue() - 1;
58249 SDLoc DL(Op1);
58250 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58251 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58252 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58253 DAG.getConstant(NewImm, DL, VT));
58254 }
58255
58256 return SDValue();
58257}
58258
58259 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58260   if (N->getConstantOperandVal(3) != X86::COND_NE)
58261 return SDValue();
58262
58263 SDValue Sub = N->getOperand(4);
58264 if (Sub.getOpcode() != X86ISD::SUB)
58265 return SDValue();
58266
58267 SDValue Op1 = Sub.getOperand(1);
58268
58269 if (!X86::isZeroNode(Sub.getOperand(0)))
58270 return SDValue();
58271
58272 SDLoc DL(N);
58273 SmallVector<SDValue, 5> Ops(N->op_values());
58274 if (Op1.getOpcode() == X86ISD::SETCC) {
58275 // res, flags2 = sub 0, (setcc cc, flag)
58276 // cload/cstore ..., cond_ne, flag2
58277 // ->
58278 // cload/cstore cc, flag
58279 Ops[3] = Op1.getOperand(0);
58280 Ops[4] = Op1.getOperand(1);
58281 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58282 SDValue Src = Op1;
58283 SDValue Op10 = Op1.getOperand(0);
58284 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58285 // res, flags2 = sub 0, (and (xor X, -1), Y)
58286 // cload/cstore ..., cond_ne, flag2
58287 // ->
58288 // res, flags2 = sub 0, (and X, Y)
58289 // cload/cstore ..., cond_e, flag2
58290 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58291 Op1.getOperand(1));
58292 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58293 }
58294 // res, flags2 = sub 0, (and X, Y)
58295 // cload/cstore ..., cc, flag2
58296 // ->
58297 // res, flags2 = cmp (and X, Y), 0
58298 // cload/cstore ..., cc, flag2
58299 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58300 } else {
58301 return SDValue();
58302 }
58303
58304 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58305 cast<MemSDNode>(N)->getMemoryVT(),
58306 cast<MemSDNode>(N)->getMemOperand());
58307}
58308
58309 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58310                           TargetLowering::DAGCombinerInfo &DCI,
58311                           const X86Subtarget &Subtarget) {
58312 EVT VT = N->getValueType(0);
58313 SDValue Op0 = N->getOperand(0);
58314 SDValue Op1 = N->getOperand(1);
58315 SDLoc DL(N);
58316
58317 auto IsNonOpaqueConstant = [&](SDValue Op) {
58318     return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58319                                                      /*AllowOpaques*/ false);
58320 };
58321
58322 // X86 can't encode an immediate LHS of a sub. See if we can push the
58323 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58324 // one use and a constant, invert the immediate, saving one register.
58325 // However, ignore cases where C1 is 0, as those will become a NEG.
58326 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
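  // Using -v == ~v + 1:  C1 - (X ^ C2) == ~(X ^ C2) + C1 + 1
  //                                    == (X ^ ~C2) + (C1 + 1).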
58327 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58328 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58329 Op1->hasOneUse()) {
58330 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58331 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58332 SDValue NewAdd =
58333 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58334 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58335 }
58336
58337 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58338 return V;
58339
58340 // Try to synthesize horizontal subs from subs of shuffles.
58341 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58342 return V;
58343
58344 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58345 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58346 X86::isZeroNode(Op1.getOperand(1))) {
58347 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58348 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58349 Op1.getOperand(0), Op1.getOperand(2));
58350 }
58351
58352 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58353 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58354 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58355 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58356 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58357 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58358 Op1.getOperand(1), Op1.getOperand(2));
58359 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58360 }
58361
58362 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58363 return V;
58364
58365 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58366 return V;
58367
58368 return combineSubSetcc(N, DAG);
58369}
58370
58371 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58372                                     const X86Subtarget &Subtarget) {
58373 unsigned Opcode = N->getOpcode();
58374 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58375 "Unknown PCMP opcode");
58376
58377 SDValue LHS = N->getOperand(0);
58378 SDValue RHS = N->getOperand(1);
58379 MVT VT = N->getSimpleValueType(0);
58380 unsigned EltBits = VT.getScalarSizeInBits();
58381 unsigned NumElts = VT.getVectorNumElements();
58382 SDLoc DL(N);
58383
58384 if (LHS == RHS)
58385 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58386 : DAG.getConstant(0, DL, VT);
58387
58388 // Constant Folding.
58389 // PCMPEQ(X,UNDEF) -> UNDEF
58390 // PCMPGT(X,UNDEF) -> 0
58391 // PCMPGT(UNDEF,X) -> 0
58392 APInt LHSUndefs, RHSUndefs;
58393 SmallVector<APInt> LHSBits, RHSBits;
58394 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58395 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58396 APInt Ones = APInt::getAllOnes(EltBits);
58397 APInt Zero = APInt::getZero(EltBits);
58398 SmallVector<APInt> Results(NumElts);
58399 for (unsigned I = 0; I != NumElts; ++I) {
58400 if (Opcode == X86ISD::PCMPEQ) {
58401 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58402 } else {
58403 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58404 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58405 }
58406 }
58407 if (Opcode == X86ISD::PCMPEQ)
58408 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58409 return getConstVector(Results, VT, DAG, DL);
58410 }
58411
58412 return SDValue();
58413}
58414
58415// Helper to determine if we can convert an integer comparison to a float
58416 // comparison by casting the operands.
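// e.g. v8i32 operands with at most 24 significant bits are exactly
// representable in f32 (24-bit significand), so EQ/GT compare identically
// after SINT_TO_FP.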
58417static std::optional<unsigned>
58418CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58419 unsigned NumSignificantBitsRHS) {
58420 MVT SVT = VT.getScalarType();
58421 assert(SVT == MVT::f32 && "Only tested for float so far");
58422 const fltSemantics &Sem = SVT.getFltSemantics();
58423 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58424 "Only PCMPEQ/PCMPGT currently supported");
58425
58426 // TODO: Handle bitcastable integers.
58427
58428 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58429 // a fp value.
58430 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58431 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58432 return ISD::SINT_TO_FP;
58433
58434 return std::nullopt;
58435}
58436
58437/// Helper that combines an array of subvector ops as if they were the operands
58438/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58439/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
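/// Returns an empty SDValue if no profitable combined node could be formed.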
58440 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58441                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58442                                       const X86Subtarget &Subtarget,
58443 unsigned Depth) {
58444 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58445 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58446
58447 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58448 return DAG.getUNDEF(VT);
58449
58450 if (llvm::all_of(Ops, [](SDValue Op) {
58451 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58452 }))
58453 return getZeroVector(VT, Subtarget, DAG, DL);
58454
58454
58455   if (Depth >= SelectionDAG::MaxRecursionDepth)
58456     return SDValue(); // Limit search depth.
58457
58458 SDValue Op0 = Ops[0];
58459 bool IsSplat = llvm::all_equal(Ops);
58460 unsigned NumOps = Ops.size();
58461 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58462 LLVMContext &Ctx = *DAG.getContext();
58463
58464 // Repeated subvectors.
58465 if (IsSplat &&
58466 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58467 // If this broadcast is inserted into both halves, use a larger broadcast.
58468 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58469 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58470
58471 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58472 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58473 (Subtarget.hasAVX2() ||
58474          X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58475                                               VT.getScalarType(), Subtarget)))
58476 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58477 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58478 Op0.getOperand(0),
58479 DAG.getVectorIdxConstant(0, DL)));
58480
58481 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58482 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58483 (Subtarget.hasAVX2() ||
58484 (EltSizeInBits >= 32 &&
58485 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58486 Op0.getOperand(0).getValueType() == VT.getScalarType())
58487 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58488
58489 // concat_vectors(extract_subvector(splat(x)),
58490 // extract_subvector(splat(x))) -> splat(x)
58491 // concat_vectors(extract_subvector(subv_broadcast(x)),
58492 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58493 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58494 Op0.getOperand(0).getValueType() == VT) {
58495 SDValue SrcVec = Op0.getOperand(0);
58496 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58497 return SrcVec;
58498 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58499 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58500 return SrcVec;
58501 }
58502
58503 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58504 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58505 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58506 return DAG.getNode(Op0.getOpcode(), DL, VT,
58507                          DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58508                                      Op0.getOperand(0), Op0.getOperand(0)),
58509 Op0.getOperand(1));
58510 }
58511
58512 // TODO: This should go in combineX86ShufflesRecursively eventually.
58513 if (NumOps == 2) {
58514 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58515 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58516 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58517         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58518       EVT SrcVT0 = Src0.getOperand(0).getValueType();
58519 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58520 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58521 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58522 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58523 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58524 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58525 // Only concat of subvector high halves which vperm2x128 is best at or if
58526 // it should fold into a subvector broadcast.
58527 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58528 SrcVT1.is256BitVector()) {
58529 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58530 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58531 "Bad subvector index");
58532 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58533 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58534 unsigned Index = 0;
58535 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58536 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58537 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58538 DAG.getBitcast(VT, Src0.getOperand(0)),
58539 DAG.getBitcast(VT, Src1.getOperand(0)),
58540 DAG.getTargetConstant(Index, DL, MVT::i8));
58541 }
58542 }
58543 // Widen extract_subvector
58544 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58545 // --> extract_subvector(x,lo)
58546 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58547 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58548 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58549 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58550 return DAG.getBitcast(VT,
58551                               extractSubVector(Src0.getOperand(0),
58552                                                Src0.getConstantOperandVal(1),
58553 DAG, DL, VT.getSizeInBits()));
58554 }
58555 }
58556 }
58557
58558 // Repeated opcode.
58559 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58560 // but it currently struggles with different vector widths.
58561 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58562 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58563 })) {
58564 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58565       SmallVector<SDValue> Subs;
58566       for (SDValue SubOp : SubOps)
58567 Subs.push_back(SubOp.getOperand(I));
58568 // Attempt to peek through bitcasts and concat the original subvectors.
58569 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58570 if (SubVT.isSimple() && SubVT.isVector()) {
58571 MVT ConcatVT =
58572             MVT::getVectorVT(SubVT.getSimpleVT().getScalarType(),
58573                              SubVT.getVectorElementCount() * Subs.size());
58574 for (SDValue &Sub : Subs)
58575 Sub = DAG.getBitcast(SubVT, Sub);
58576 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58577 Subtarget, Depth + 1))
58578 return DAG.getBitcast(VT, ConcatSrc);
58579 return DAG.getBitcast(
58580 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58581 }
58582 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58583 };
58584 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58585 bool AllConstants = true;
58586 bool AllSubs = true;
58587 unsigned VecSize = VT.getSizeInBits();
58588 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58589 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58590 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58591 }))
58592 return true;
58593 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58594 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58595 unsigned SubSize = BC.getValueSizeInBits();
58596 unsigned EltSize = BC.getScalarValueSizeInBits();
58597 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58598                       ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58599       AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58600 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58601 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58602 }
58603 return AllConstants || AllSubs;
58604 };
58605 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58606 bool AllConstants = true;
58607       SmallVector<SDValue> Subs;
58608       for (SDValue SubOp : SubOps) {
58609 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58610 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58611                         ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58612         Subs.push_back(SubOp.getOperand(I));
58613 }
58614 if (AllConstants)
58615 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58616 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58617 };
58618
58619 unsigned Opcode = Op0.getOpcode();
58620 switch (Opcode) {
58621 case ISD::BITCAST: {
58622 // TODO: Support AVX1/AVX2 bitcasts.
58623       SmallVector<SDValue> SubOps;
58624       for (SDValue SubOp : Ops)
58625 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58626 EVT InnerVT = SubOps[0].getValueType();
58627 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58628 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58629 (Subtarget.hasBWI() ||
58630 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58631 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58632 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58633 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58634 return Op.getValueType() == InnerVT;
58635 })) {
58636 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58637 MVT ConcatVT = MVT::getVectorVT(
58638 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58639 if (SDValue ConcatSrc = combineConcatVectorOps(
58640 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58641 return DAG.getBitcast(VT, ConcatSrc);
58642 }
58643 break;
58644 }
58645 case ISD::VECTOR_SHUFFLE: {
58646 // TODO: Generalize NumOps support.
58647 if (!IsSplat && NumOps == 2 &&
58648 ((VT.is256BitVector() &&
58649 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58650 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58651 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58652 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58653 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58654 if (Concat0 || Concat1 ||
58655 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58656 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58657 Subtarget.hasVBMI())) {
58658 int NumSubElts = Op0.getValueType().getVectorNumElements();
58659 SmallVector<int> NewMask;
58660 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58661 M = M >= NumSubElts ? M + NumSubElts : M;
58662 NewMask.push_back(M);
58663 }
58664 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58665 if (0 <= M)
58666 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58667 NewMask.push_back(M);
58668 }
58669 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58670 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58671 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58672 }
58673 }
58674 break;
58675 }
58676 case X86ISD::VBROADCAST: {
58677 // TODO: 512-bit VBROADCAST concatenation.
58678 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58679 return Op.getOperand(0).getValueType().is128BitVector();
58680 })) {
58681 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58682 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58683 ConcatSubOperand(VT, Ops, 0),
58684 ConcatSubOperand(VT, Ops, 0));
58685 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58686 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58687 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58688                                               : X86ISD::PSHUFD,
58689                              DL, VT, ConcatSubOperand(VT, Ops, 0),
58690 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58691 }
58692 break;
58693 }
58694 case X86ISD::MOVDDUP:
58695 case X86ISD::MOVSHDUP:
58696 case X86ISD::MOVSLDUP: {
58697 if (!IsSplat && (VT.is256BitVector() ||
58698 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58699 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58700 break;
58701 }
58702 case X86ISD::SHUFP: {
58703 if (!IsSplat &&
58704 (VT == MVT::v8f32 ||
58705 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58706 llvm::all_of(Ops, [Op0](SDValue Op) {
58707 return Op.getOperand(2) == Op0.getOperand(2);
58708 })) {
58709 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58710 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58711 if (Concat0 || Concat1)
58712 return DAG.getNode(Opcode, DL, VT,
58713 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58714 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58715 Op0.getOperand(2));
58716 }
58717 break;
58718 }
58719 case X86ISD::UNPCKH:
58720 case X86ISD::UNPCKL: {
58721 // TODO: UNPCK should use CombineSubOperand
58722 // Don't concatenate build_vector patterns.
58723 if (!IsSplat &&
58724 ((VT.is256BitVector() &&
58725 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58726 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58727 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58728 none_of(Ops, [](SDValue Op) {
58729 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58730                        ISD::BUILD_VECTOR ||
58731                    peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58732                        ISD::BUILD_VECTOR;
58733           })) {
58734 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58735 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58736 if (Concat0 || Concat1 ||
58737 (Subtarget.hasInt256() && EltSizeInBits == 64))
58738 return DAG.getNode(Opcode, DL, VT,
58739 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58740 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58741 }
58742 break;
58743 }
58744 case X86ISD::PSHUFHW:
58745 case X86ISD::PSHUFLW:
58746 case X86ISD::PSHUFD:
58747 if (!IsSplat &&
58748 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58749 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58750 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58751 llvm::all_of(Ops, [Op0](SDValue Op) {
58752 return Op.getOperand(1) == Op0.getOperand(1);
58753 })) {
58754 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58755 Op0.getOperand(1));
58756 }
58757 [[fallthrough]];
58758 case X86ISD::VPERMILPI:
58759 if (!IsSplat && EltSizeInBits == 32 &&
58760 (VT.is256BitVector() ||
58761 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58762 all_of(Ops, [&Op0](SDValue Op) {
58763 return Op0.getOperand(1) == Op.getOperand(1);
58764 })) {
58765 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58766 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58767 Res =
58768 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58769 return DAG.getBitcast(VT, Res);
58770 }
58771 break;
58772 case X86ISD::VPERMILPV:
58773 if (!IsSplat && (VT.is256BitVector() ||
58774 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58775 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58776 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58777 if (Concat0 || Concat1)
58778 return DAG.getNode(Opcode, DL, VT,
58779 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58780 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58781 }
58782 break;
58783 case X86ISD::PSHUFB:
58784 case X86ISD::PSADBW:
58785 case X86ISD::VPMADDUBSW:
58786 case X86ISD::VPMADDWD:
58787 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58788 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58789 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58790 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58791 NumOps * SrcVT.getVectorNumElements());
58792 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58793 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58794 if (Concat0 || Concat1)
58795 return DAG.getNode(
58796 Opcode, DL, VT,
58797 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58798 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58799 }
58800 break;
58801 case X86ISD::VPERMV:
58802 // TODO: Handle 256-bit and NumOps == 4 cases.
58803 if (!IsSplat && NumOps == 2 &&
58804 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58805 MVT OpVT = Op0.getSimpleValueType();
58806 int NumSrcElts = OpVT.getVectorNumElements();
58807 SmallVector<int, 64> ConcatMask;
58808 for (unsigned i = 0; i != NumOps; ++i) {
58809 SmallVector<int, 64> SubMask;
58810         SmallVector<SDValue, 2> SubOps;
58811         if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58812 break;
58813 for (int M : SubMask) {
58814 if (0 <= M)
58815 M += i * NumSrcElts;
58816 ConcatMask.push_back(M);
58817 }
58818 }
58819 if (ConcatMask.size() == (NumOps * NumSrcElts))
58820 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58821 ConcatSubOperand(VT, Ops, 1),
58822 DAG.getUNDEF(VT), Subtarget, DAG);
58823 }
58824 break;
58825 case X86ISD::VPERMV3:
58826 // TODO: Handle 256-bit and NumOps == 4 cases.
58827 if (!IsSplat && NumOps == 2 &&
58828 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58829 MVT OpVT = Op0.getSimpleValueType();
58830 int NumSrcElts = OpVT.getVectorNumElements();
58831 SmallVector<int, 64> ConcatMask;
58832 for (unsigned i = 0; i != NumOps; ++i) {
58833 SmallVector<int, 64> SubMask;
58834         SmallVector<SDValue, 2> SubOps;
58835         if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58836 break;
58837 for (int M : SubMask) {
58838 if (0 <= M) {
58839 int Src = M < NumSrcElts ? 0 : 2;
58840 M += M < NumSrcElts ? 0 : NumSrcElts;
58841
58842 // Reference the lowest sub if the upper sub is the same.
58843 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58844 M += i * NumSrcElts;
58845 }
58846 ConcatMask.push_back(M);
58847 }
58848 }
58849 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58850 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58851 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58852 if (Concat0 || Concat1)
58853 return lowerShuffleWithPERMV(
58854 DL, VT, ConcatMask,
58855 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58856 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58857 DAG);
58858 }
58859 }
58860 break;
58861 case X86ISD::VPERM2X128: {
58862 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58863 assert(NumOps == 2 && "Bad concat_vectors operands");
58864 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58865 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58866 // TODO: Handle zero'd subvectors.
58867 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58868         int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
58869                        (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
58870 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58871 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58872 Ops[0].getOperand(1), DAG, DL);
58873 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58874 Ops[1].getOperand(1), DAG, DL);
58875 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58876 DAG.getBitcast(ShuffleVT, LHS),
58877 DAG.getBitcast(ShuffleVT, RHS),
58878 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58879 return DAG.getBitcast(VT, Res);
58880 }
58881 }
58882 break;
58883 }
58884 case X86ISD::SHUF128: {
58885 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58886 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58887 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58888 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58889 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58890 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58891 Ops[0].getOperand(1), DAG, DL);
58892 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58893 Ops[1].getOperand(1), DAG, DL);
58894 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58895 DAG.getTargetConstant(Imm, DL, MVT::i8));
58896 }
58897 break;
58898 }
58899 case ISD::TRUNCATE:
58900 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58901 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58902 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58903 SrcVT == Ops[1].getOperand(0).getValueType() &&
58904 Subtarget.useAVX512Regs() &&
58905 Subtarget.getPreferVectorWidth() >= 512 &&
58906 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58907 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58908 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58909 ConcatSubOperand(NewSrcVT, Ops, 0));
58910 }
58911 }
58912 break;
58913 case ISD::ANY_EXTEND:
58914 case ISD::SIGN_EXTEND:
58915 case ISD::ZERO_EXTEND:
58916 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58917 if (!IsSplat && NumOps == 2 &&
58918 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58919 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58920 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58921 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58922 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58923 SrcVT == Ops[1].getOperand(0).getValueType()) {
58924 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58925 return DAG.getNode(Opcode, DL, VT,
58926 ConcatSubOperand(NewSrcVT, Ops, 0));
58927 }
58928 }
58929 break;
58930     case ISD::ANY_EXTEND_VECTOR_INREG:
58931     case ISD::SIGN_EXTEND_VECTOR_INREG:
58932     case ISD::ZERO_EXTEND_VECTOR_INREG: {
58933     // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58934 if (!IsSplat && NumOps == 2 &&
58935 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58936 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58937 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58939 Op0.getOperand(0).getValueType() ==
58940 Ops[0].getOperand(0).getValueType()) {
58941 EVT SrcVT = Op0.getOperand(0).getValueType();
58942 unsigned NumElts = VT.getVectorNumElements();
58943 MVT UnpackSVT =
58944 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58945 MVT UnpackVT =
58946 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58947 SDValue Unpack =
58948 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58949 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58950 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58951 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58952 DAG.getBitcast(SrcVT, Unpack), DAG);
58953 }
58954 break;
58955 }
58956 case X86ISD::VSHLI:
58957 case X86ISD::VSRLI:
58958 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58959 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58960 llvm::all_of(Ops, [](SDValue Op) {
58961 return Op.getConstantOperandAPInt(1) == 32;
58962 })) {
58963 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58964 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58965 Res = DAG.getBitcast(MVT::v8i32, Res);
58966 if (Opcode == X86ISD::VSHLI) {
58967 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58968 {8, 0, 8, 2, 8, 4, 8, 6});
58969 } else {
58970 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58971 {1, 8, 3, 8, 5, 8, 7, 8});
58972 }
58973 return DAG.getBitcast(VT, Res);
58974 }
58975 }
58976 [[fallthrough]];
58977 case X86ISD::VSRAI:
58978 case X86ISD::VSHL:
58979 case X86ISD::VSRL:
58980 case X86ISD::VSRA:
58981 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58982 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58983 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58984 llvm::all_of(Ops, [Op0](SDValue Op) {
58985 return Op0.getOperand(1) == Op.getOperand(1);
58986 })) {
58987 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58988 Op0.getOperand(1));
58989 }
58990 break;
58991 case X86ISD::VPERMI:
58992 case X86ISD::VROTLI:
58993 case X86ISD::VROTRI:
58994 if (!IsSplat &&
58995 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58996 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58997 llvm::all_of(Ops, [Op0](SDValue Op) {
58998 return Op0.getOperand(1) == Op.getOperand(1);
58999 })) {
59000 assert(!(Opcode == X86ISD::VPERMI &&
59001 Op0.getValueType().is128BitVector()) &&
59002 "Illegal 128-bit X86ISD::VPERMI nodes");
59003 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59004 Op0.getOperand(1));
59005 }
59006 break;
59007 case ISD::AND:
59008 case ISD::OR:
59009 case ISD::XOR:
59010 case X86ISD::ANDNP:
59011 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59012 if (!IsSplat && (VT.is256BitVector() ||
59013 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59014 // Don't concatenate root AVX1 NOT patterns.
59015 // TODO: Allow NOT folding if Concat0 succeeds.
59016 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59017 llvm::all_of(Ops, [](SDValue X) {
59018 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59019 }))
59020 break;
59021 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59022 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59023 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59024 return DAG.getNode(Opcode, DL, VT,
59025 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59026 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59027 }
59028 break;
59029 case X86ISD::PCMPEQ:
59030 case X86ISD::PCMPGT:
59031 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59032 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59033 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59034 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59035 if (Concat0 || Concat1)
59036 return DAG.getNode(Opcode, DL, VT,
59037 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59038 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59039 break;
59040 }
59041
59042 if (!IsSplat && VT == MVT::v8i32) {
59043 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59044 // TODO: Handle v4f64 as well?
59045 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59046 for (unsigned I = 0; I != NumOps; ++I) {
59047 MaxSigBitsLHS =
59048 std::max(MaxSigBitsLHS,
59049 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59050 MaxSigBitsRHS =
59051 std::max(MaxSigBitsRHS,
59052 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59053 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59054 break;
59055 }
59056
59057 ISD::CondCode ICC =
59058 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59059 ISD::CondCode FCC =
59060           Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59061
59062 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59063 MVT FpVT = VT.changeVectorElementType(FpSVT);
59064
59065 if (std::optional<unsigned> CastOpc =
59066 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59067 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59068 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59069 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59070 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59071 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59072 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59073
59074 bool IsAlwaysSignaling;
59075 unsigned FSETCC =
59076 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59077 return DAG.getBitcast(
59078 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59079 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59080 }
59081 }
59082 break;
59083 case ISD::CTPOP:
59084 case ISD::CTTZ:
59085 case ISD::CTLZ:
59086   case ISD::CTTZ_ZERO_UNDEF:
59087   case ISD::CTLZ_ZERO_UNDEF:
59088       if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59089 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59090 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59091 }
59092 break;
59093   case X86ISD::GF2P8AFFINEQB:
59094       // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59095 if (!IsSplat &&
59096 (VT.is256BitVector() ||
59097 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59098 llvm::all_of(Ops, [Op0](SDValue Op) {
59099 return Op0.getOperand(2) == Op.getOperand(2);
59100 })) {
59101 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59102 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59103 }
59104 break;
59105 case ISD::ADD:
59106 case ISD::SUB:
59107 case ISD::MUL:
59108 // TODO: Add more integer binops?
59109 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59110 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59111 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59112 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59113 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59114 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59115 return Op.getOperand(0) == Op.getOperand(1);
59116 }))
59117 return DAG.getNode(Opcode, DL, VT,
59118 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59119 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59120 }
59121 break;
59122   // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59123   // their latency is short, we don't replace them here unless doing so won't
59124   // introduce an extra VINSERT.
59125 case ISD::FADD:
59126 case ISD::FSUB:
59127 case ISD::FMUL:
59128 if (!IsSplat && (VT.is256BitVector() ||
59129 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59130 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59131 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59132 if (Concat0 || Concat1)
59133 return DAG.getNode(Opcode, DL, VT,
59134 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59135 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59136 }
59137 break;
59138 // Always prefer to concatenate high latency FDIV instructions.
59139 case ISD::FDIV:
59140 if (!IsSplat && (VT.is256BitVector() ||
59141 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59142 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59143 ConcatSubOperand(VT, Ops, 1));
59144 }
59145 break;
59146 case X86ISD::HADD:
59147 case X86ISD::HSUB:
59148 case X86ISD::FHADD:
59149 case X86ISD::FHSUB:
59150 if (!IsSplat && VT.is256BitVector() &&
59151 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59152 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59153 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59154 if (Concat0 || Concat1)
59155 return DAG.getNode(Opcode, DL, VT,
59156 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59157 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59158 }
59159 break;
59160 case X86ISD::PACKSS:
59161 case X86ISD::PACKUS:
59162 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59163 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59164 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59165 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59166 NumOps * SrcVT.getVectorNumElements());
59167 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59168 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59169 if (Concat0 || Concat1)
59170 return DAG.getNode(
59171 Opcode, DL, VT,
59172 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59173 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59174 }
59175 break;
59176 case X86ISD::VSHLD:
59177 case X86ISD::VSHRD:
59178 case X86ISD::PALIGNR:
59179 if (!IsSplat &&
59180 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59181 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59182 llvm::all_of(Ops, [Op0](SDValue Op) {
59183 return Op0.getOperand(2) == Op.getOperand(2);
59184 })) {
59185 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59186 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59187 if (Concat0 || Concat1)
59188 return DAG.getNode(Opcode, DL, VT,
59189 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59190 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59191 Op0.getOperand(2));
59192 }
59193 break;
59194 case X86ISD::BLENDI:
59195 if (VT.is256BitVector() && NumOps == 2 &&
59196 (EltSizeInBits >= 32 ||
59197 (Subtarget.hasInt256() &&
59198 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59199 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59200 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59201 if (Concat0 || Concat1) {
59202 unsigned NumElts = VT.getVectorNumElements();
59203 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59204 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59205 Mask = Mask.zextOrTrunc(8);
59206 return DAG.getNode(Opcode, DL, VT,
59207 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59208 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59209 DAG.getTargetConstant(Mask, DL, MVT::i8));
59210 }
59211 }
59212 // TODO: BWI targets should only use CombineSubOperand.
59213 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59214 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59215 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59216 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59217 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59218 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59219 unsigned NumElts = VT.getVectorNumElements();
59220 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59221 for (unsigned I = 1; I != NumOps; ++I)
59222 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59223 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59224 Mask = Mask.zextOrTrunc(NumMaskBits);
59225 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59226 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59227 SDValue Sel =
59228 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59229 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59230 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59231 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59232 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59233 }
59234 }
59235 break;
59236 case ISD::VSELECT:
59237 // TODO: VSELECT should use CombineSubOperand.
59238 if (!IsSplat && Subtarget.hasAVX512() &&
59239 (VT.is256BitVector() ||
59240 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59241 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59242 EVT SelVT = Ops[0].getOperand(0).getValueType();
59243 if (SelVT.getVectorElementType() == MVT::i1) {
59244 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59245 NumOps * SelVT.getVectorNumElements());
59246 if (TLI.isTypeLegal(SelVT))
59247 return DAG.getNode(
59248 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59249 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59250 }
59251 }
59252 [[fallthrough]];
59253 case X86ISD::BLENDV:
59254 // TODO: BLENDV should use CombineSubOperand.
59255 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59256 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59257 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59258 EVT SelVT = Ops[0].getOperand(0).getValueType();
59259 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59260 if (TLI.isTypeLegal(SelVT))
59261 return DAG.getNode(
59262 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59263 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59264 }
59265 break;
59266 }
59267 }
59268
59269 // Fold subvector loads into one.
59270 // If needed, look through bitcasts to get to the load.
59271 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59272 unsigned Fast;
59273 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59274 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59275 *FirstLd->getMemOperand(), &Fast) &&
59276 Fast) {
59277 if (SDValue Ld =
59278 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59279 return Ld;
59280 }
59281 }
59282
59283 // Attempt to fold target constant loads.
59284 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59285 SmallVector<APInt> EltBits;
59286 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59287 for (unsigned I = 0; I != NumOps; ++I) {
59288 APInt OpUndefElts;
59289 SmallVector<APInt> OpEltBits;
59290 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59291 OpEltBits, /*AllowWholeUndefs*/ true,
59292 /*AllowPartialUndefs*/ false))
59293 break;
59294 EltBits.append(OpEltBits);
59295 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59296 }
59297 if (EltBits.size() == VT.getVectorNumElements()) {
59298 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59299 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59300 SDValue CV = DAG.getConstantPool(C, PVT);
59301       MachineFunction &MF = DAG.getMachineFunction();
59302       MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
59303       SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59304 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59305       DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59306       return Ld;
59307 }
59308 }
59309
59310 // If this simple subvector or scalar/subvector broadcast_load is inserted
59311 // into both halves, use a larger broadcast_load. Update other uses to use
59312 // an extracted subvector.
59313 if (IsSplat &&
59314 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59315 if (ISD::isNormalLoad(Op0.getNode()) ||
59316         Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59317         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59318       auto *Mem = cast<MemSDNode>(Op0);
59319 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59320                          ? X86ISD::VBROADCAST_LOAD
59321                          : X86ISD::SUBV_BROADCAST_LOAD;
59322       if (SDValue BcastLd =
59323 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59324 SDValue BcastSrc =
59325 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59326 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59327 return BcastLd;
59328 }
59329 }
59330 }
59331
59332 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59333 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59334 Subtarget.useAVX512Regs()) {
59335 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59336 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59337 Res = DAG.getBitcast(ShuffleVT, Res);
59338 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59339 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59340 return DAG.getBitcast(VT, Res);
59341 }
59342
59343 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59344 if (!IsSplat &&
59345 ((NumOps == 2 && VT == MVT::v4f64) ||
59346 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59347 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59348 // Collect the individual per-lane v2f64/v4f64 shuffles.
59349 MVT OpVT = Ops[0].getSimpleValueType();
59350 unsigned NumOpElts = OpVT.getVectorNumElements();
59353 if (all_of(seq<int>(NumOps), [&](int I) {
59354 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59355 Depth + 1) &&
59356 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59357 none_of(SrcMasks[I], isUndefOrZero) &&
59358 SrcMasks[I].size() == NumOpElts &&
59359 all_of(SrcOps[I], [&OpVT](SDValue V) {
59360 return V.getValueType() == OpVT;
59361 });
59362 })) {
59363 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59364 bool Unary = true;
59365 unsigned SHUFPDMask = 0;
59366       SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59367       for (unsigned I = 0; I != NumOps; ++I) {
59368 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59369 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59370 Unary &= LHS[I] == RHS[I];
59371 for (unsigned J = 0; J != NumOpElts; ++J)
59372 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59373 }
59374 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59375 // PERMILPD mask and we can always profitably concatenate them.
59376 SDValue Concat0 =
59377 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59378 SDValue Concat1 =
59379 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59380 if (Unary || Concat0 || Concat1) {
59381 Concat0 =
59382 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59383 Concat1 =
59384 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59385 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59386 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59387 }
59388 }
59389 }
59390
59391 return SDValue();
59392}
59393
59394 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59395                                      TargetLowering::DAGCombinerInfo &DCI,
59396                                      const X86Subtarget &Subtarget) {
59397 EVT VT = N->getValueType(0);
59398 EVT SrcVT = N->getOperand(0).getValueType();
59399 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59400   SmallVector<SDValue, 4> Ops(N->ops());
59401
59402 if (VT.getVectorElementType() == MVT::i1) {
59403 // Attempt to constant fold.
59404 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59405     APInt Constant = APInt::getZero(VT.getSizeInBits());
59406     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59407       auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59408       if (!C) break;
59409 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59410 if (I == (E - 1)) {
59411 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59412 if (TLI.isTypeLegal(IntVT))
59413 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59414 }
59415 }
59416
59417 // Don't do anything else for i1 vectors.
59418 return SDValue();
59419 }
59420
59421 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59422 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59423 Subtarget))
59424 return R;
59425 }
59426
59427 return SDValue();
59428}
59429
59432 const X86Subtarget &Subtarget) {
59433 if (DCI.isBeforeLegalizeOps())
59434 return SDValue();
59435
59436 MVT OpVT = N->getSimpleValueType(0);
59437
59438 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59439
59440 SDLoc dl(N);
59441 SDValue Vec = N->getOperand(0);
59442 SDValue SubVec = N->getOperand(1);
59443
59444 uint64_t IdxVal = N->getConstantOperandVal(2);
59445 MVT SubVecVT = SubVec.getSimpleValueType();
59446 int VecNumElts = OpVT.getVectorNumElements();
59447 int SubVecNumElts = SubVecVT.getVectorNumElements();
59448
59449 if (Vec.isUndef() && SubVec.isUndef())
59450 return DAG.getUNDEF(OpVT);
59451
59452 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59453 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59454 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59455 return getZeroVector(OpVT, Subtarget, DAG, dl);
59456
59457   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
59458   // If we're inserting into a zero vector and then into a larger zero vector,
59459 // just insert into the larger zero vector directly.
59460 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59461       ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59462     uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59463 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59464 getZeroVector(OpVT, Subtarget, DAG, dl),
59465 SubVec.getOperand(1),
59466 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59467 }
59468
59469 // If we're inserting into a zero vector and our input was extracted from an
59470 // insert into a zero vector of the same type and the extraction was at
59471 // least as large as the original insertion. Just insert the original
59472 // subvector into a zero vector.
59473 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59474 isNullConstant(SubVec.getOperand(1)) &&
59475       SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59476     SDValue Ins = SubVec.getOperand(0);
59477 if (isNullConstant(Ins.getOperand(2)) &&
59478 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59479 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59480 SubVecVT.getFixedSizeInBits())
59481 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59482 getZeroVector(OpVT, Subtarget, DAG, dl),
59483 Ins.getOperand(1), N->getOperand(2));
59484 }
59485 }
59486
59487 // Stop here if this is an i1 vector.
59488 if (IsI1Vector)
59489 return SDValue();
59490
59491 // Eliminate an intermediate vector widening:
59492 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59493 // insert_subvector X, Y, Idx
59494 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59495 // there?
59496 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59497 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59498 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59499 SubVec.getOperand(1), N->getOperand(2));
59500
59501 // If this is an insert of an extract, combine to a shuffle. Don't do this
59502 // if the insert or extract can be represented with a subregister operation.
59503 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59504 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59505 (IdxVal != 0 ||
59506 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59507 SDValue ExtSrc = SubVec.getOperand(0);
59508 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59509 // Create a shuffle mask matching the extraction and insertion.
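    // e.g. inserting the upper v4i32 half of ExtSrc (ExtIdxVal = 4) at
    // element 4 of a v8i32 Vec yields mask {0,1,2,3,12,13,14,15}, where
    // elements 8..15 index ExtSrc.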
59510 SmallVector<int, 64> Mask(VecNumElts);
59511 std::iota(Mask.begin(), Mask.end(), 0);
59512 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59513 ExtIdxVal + VecNumElts);
59514 if (ExtIdxVal != 0)
59515 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59516 // See if we can use a blend instead of extract/insert pair.
59517 SmallVector<int, 64> BlendMask(VecNumElts);
59518 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59519 std::iota(BlendMask.begin() + IdxVal,
59520 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59521 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59522 VecNumElts == (2 * SubVecNumElts)) {
59523 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59524 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59525 SDValue Blend = DAG.getNode(
59526 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59527 DAG.getBitcast(MVT::v8f32, ExtSrc),
59528 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59529 return DAG.getBitcast(OpVT, Blend);
59530 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59531 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59532 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59533 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59534 SDValue Shuffle =
59535 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59536 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59537 return DAG.getBitcast(OpVT, Shuffle);
59538 }
59539 }
59540 }
59541
59542 // Match concat_vector style patterns.
59543 SmallVector<SDValue, 2> SubVectorOps;
59544 if (collectConcatOps(N, SubVectorOps, DAG)) {
59545 if (SDValue Fold =
59546 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59547 return Fold;
59548
59549 // If we're inserting all zeros into the upper half, change this to
59550 // a concat with zero. We will match this to a move
59551 // with implicit upper bit zeroing during isel.
59552 // We do this here because we don't want combineConcatVectorOps to
59553 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59554 if (SubVectorOps.size() == 2 &&
59555 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59556 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59557 getZeroVector(OpVT, Subtarget, DAG, dl),
59558 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59559
59560 // Attempt to recursively combine to a shuffle.
59561 if (all_of(SubVectorOps, [](SDValue SubOp) {
59562           return isTargetShuffle(SubOp.getOpcode());
59563         })) {
59564 SDValue Op(N, 0);
59565 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59566 return Res;
59567 }
59568 }
59569
59570 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59571 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59572 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59573
59574 // If this is a broadcast load inserted into an upper undef, use a larger
59575 // broadcast load.
59576 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59577 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59578 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59579     return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59580                              MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59581 }
59582
59583 // If we're splatting the lower half subvector of a full vector load into the
59584 // upper half, attempt to create a subvector broadcast.
59585 if ((int)IdxVal == (VecNumElts / 2) &&
59586 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59587 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59588 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59589 if (VecLd && SubLd &&
59590         DAG.areNonVolatileConsecutiveLoads(
59591             SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59592       SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl,
59593                                           SubVecVT, SubLd, 0, DAG);
59594 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59595 BcastLd, DAG.getVectorIdxConstant(0, dl));
59596 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59597 return BcastLd;
59598 }
59599 }
59600
59601 // Attempt to constant fold (if we're not widening).
59602 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59603 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59604 APInt VecUndefElts, SubUndefElts;
59605 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59606 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59607 VecEltBits) &&
59608 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59609 SubEltBits)) {
59610 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59611 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59612 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59613 }
59614 }
59615
59616 // Attempt to recursively combine to a shuffle.
59619 SDValue Op(N, 0);
59620 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59621 return Res;
59622 }
59623
59624 // Match insertion of subvector load that perfectly aliases a base load.
59625 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59626 ISD::isNormalLoad(SubVec.getNode()) &&
59627       DAG.areNonVolatileConsecutiveLoads(
59628           cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59629 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59630 return Vec;
59631
59632 return SDValue();
59633}
59634
59635/// If we are extracting a subvector of a vector select and the select condition
59636/// is composed of concatenated vectors, try to narrow the select width. This
59637/// is a common pattern for AVX1 integer code because 256-bit selects may be
59638/// legal, but there is almost no integer math/logic available for 256-bit.
59639/// This function should only be called with legal types (otherwise, the calls
59640/// to get simple value types will assert).
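/// Illustrative sketch of the rewrite (the types below are only an example):
///   (v4i32 (extract_subvector (vselect (v8i32 (concat C0, C1)), X, Y), 0))
/// can become
///   (vselect C0, (extract_subvector X, 0), (extract_subvector Y, 0))
/// so the select runs on 128-bit vectors that do have integer ops on AVX1.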
59641 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59642                                            SelectionDAG &DAG) {
59643 SDValue Sel = Ext->getOperand(0);
59644 if (Sel.getOpcode() != ISD::VSELECT ||
59645 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59646 return SDValue();
59647
59648 // Note: We assume simple value types because this should only be called with
59649 // legal operations/types.
59650 // TODO: This can be extended to handle extraction to 256-bits.
59651 MVT VT = Ext->getSimpleValueType(0);
59652 if (!VT.is128BitVector())
59653 return SDValue();
59654
59655 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59656 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59657 return SDValue();
59658
59659 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59660 MVT SelVT = Sel.getSimpleValueType();
59661 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59662 "Unexpected vector type with legal operations");
59663
59664 unsigned SelElts = SelVT.getVectorNumElements();
59665 unsigned CastedElts = WideVT.getVectorNumElements();
59666 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59667 if (SelElts % CastedElts == 0) {
59668 // The select has the same or more (narrower) elements than the extract
59669 // operand. The extraction index gets scaled by that factor.
59670 ExtIdx *= (SelElts / CastedElts);
59671 } else if (CastedElts % SelElts == 0) {
59672     // The select has fewer (wider) elements than the extract operand. Make sure
59673 // that the extraction index can be divided evenly.
59674 unsigned IndexDivisor = CastedElts / SelElts;
59675 if (ExtIdx % IndexDivisor != 0)
59676 return SDValue();
59677 ExtIdx /= IndexDivisor;
59678 } else {
59679     llvm_unreachable("Element counts of simple vector types are not divisible?");
59680 }
59681
59682 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59683 unsigned NarrowElts = SelElts / NarrowingFactor;
59684 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59685 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59686 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59687 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59688 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59689 return DAG.getBitcast(VT, NarrowSel);
59690}
59691
59692 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59693                                         TargetLowering::DAGCombinerInfo &DCI,
59694                                         const X86Subtarget &Subtarget) {
59695 if (!N->getValueType(0).isSimple())
59696 return SDValue();
59697
59698 MVT VT = N->getSimpleValueType(0);
59699 SDValue InVec = N->getOperand(0);
59700 unsigned IdxVal = N->getConstantOperandVal(1);
59701 EVT InVecVT = InVec.getValueType();
59702 unsigned SizeInBits = VT.getSizeInBits();
59703 unsigned InSizeInBits = InVecVT.getSizeInBits();
59704 unsigned NumSubElts = VT.getVectorNumElements();
59705 unsigned NumInElts = InVecVT.getVectorNumElements();
59706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59707 SDLoc DL(N);
59708
59709 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59710 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59711 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59712 // We let generic combining take over from there to simplify the
59713 // insert/extract and 'not'.
59714 // This pattern emerges during AVX1 legalization. We handle it before lowering
59715 // to avoid complications like splitting constant vector loads.
59716 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59717 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59718 auto isConcatenatedNot = [](SDValue V) {
59719 V = peekThroughBitcasts(V);
59720 if (!isBitwiseNot(V))
59721 return false;
59722 SDValue NotOp = V->getOperand(0);
59723       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59724     };
59725 if (isConcatenatedNot(InVec.getOperand(0)) ||
59726 isConcatenatedNot(InVec.getOperand(1))) {
59727 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59728 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59729 splitVectorIntBinary(InVec, DAG, DL),
59730 N->getOperand(1));
59731 }
59732 }
59733
59734 if (DCI.isBeforeLegalizeOps())
59735 return SDValue();
59736
59737 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59738 return V;
59739
59740   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59741     return getZeroVector(VT, Subtarget, DAG, DL);
59742
59743 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59744 if (VT.getScalarType() == MVT::i1)
59745 return DAG.getConstant(1, DL, VT);
59746 return getOnesVector(VT, DAG, DL);
59747 }
59748
59749 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59750 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59751
59752 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59753 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59754 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59755 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59756 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59757 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59758 }
59759
59760 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59761 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59762 // iff SUB is entirely contained in the extraction.
59763 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59764 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59765 SDValue Src = InVec.getOperand(0);
59766 SDValue Sub = InVec.getOperand(1);
59767 EVT SubVT = Sub.getValueType();
59768 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59769 if (IdxVal <= InsIdx &&
59770 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59771 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59772 DAG.getVectorIdxConstant(IdxVal, DL));
59773 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59774 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59775 }
59776 }
59777
59778   // If we're extracting an upper subvector, see if we'd get the same elements
59779   // if we extracted the lowest subvector instead, which should allow
59780   // SimplifyDemandedVectorElts to do more simplifications.
59781 if (IdxVal != 0) {
59782 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59783 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59784 });
59785 if (AllEquiv)
59786 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59787 }
59788
59789 // Check if we're extracting a whole broadcasted subvector.
59790 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59791 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59792 EVT MemVT = MemIntr->getMemoryVT();
59793 if (MemVT == VT) {
59794 // If this is the only use, we can replace with a regular load (this may
59795 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59796 // memory chain).
59797 if (InVec.hasOneUse()) {
59798 SDValue Ld =
59799 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59800 MemIntr->getMemOperand());
59801 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59802 return Ld;
59803 }
59804 }
59805 }
59806
59807 // Attempt to extract from the source of a shuffle vector.
59808 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59809 SmallVector<int, 32> ShuffleMask;
59810 SmallVector<int, 32> ScaledMask;
59811 SmallVector<SDValue, 2> ShuffleInputs;
59812 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59813     // Decode the shuffle mask and scale it so it's shuffling subvectors.
59814 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59815 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59816 unsigned SubVecIdx = IdxVal / NumSubElts;
59817 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59818 return DAG.getUNDEF(VT);
59819 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59820 return getZeroVector(VT, Subtarget, DAG, DL);
59821 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59822 if (Src.getValueSizeInBits() == InSizeInBits) {
59823 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59824 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59825 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59826 DL, SizeInBits);
59827 }
59828 }
59829 }
59830
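  // IsExtractFree (below) treats an extract as cheap when its source is a
  // one-use load, a constant build vector, or undef, since narrowing those
  // needs no extra instructions.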
59831 auto IsExtractFree = [](SDValue V) {
59832 if (V.hasOneUse()) {
59833       V = peekThroughOneUseBitcasts(V);
59834       if (V.getOpcode() == ISD::LOAD)
59835 return true;
59836 }
59837 V = peekThroughBitcasts(V);
59838 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59839 return true;
59840     if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59841       return true;
59842 return V.isUndef();
59843 };
59844
59845 // If we're extracting the lowest subvector and we're the only user,
59846 // we may be able to perform this with a smaller vector width.
59847 unsigned InOpcode = InVec.getOpcode();
59848 if (InVec.hasOneUse()) {
59849 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59850 // v2f64 CVTDQ2PD(v4i32).
59851 if (InOpcode == ISD::SINT_TO_FP &&
59852 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59853 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59854 }
59855 // v2f64 CVTUDQ2PD(v4i32).
59856 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59857 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59858 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59859 }
59860 // v2f64 CVTPS2PD(v4f32).
59861 if (InOpcode == ISD::FP_EXTEND &&
59862 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59863 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59864 }
59865 }
59866 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59867 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59868 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59869 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59870 Subtarget.hasVLX())) &&
59871 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59872 SDValue Src = InVec.getOperand(0);
59873 if (Src.getValueType().getScalarSizeInBits() == 32)
59874 return DAG.getNode(InOpcode, DL, VT,
59875 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59876 }
59877 if (IdxVal == 0 &&
59878 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59879 (SizeInBits == 128 || SizeInBits == 256) &&
59880 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59881 SDValue Ext = InVec.getOperand(0);
59882 if (Ext.getValueSizeInBits() > SizeInBits)
59883 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59884 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59885 return DAG.getNode(ExtOp, DL, VT, Ext);
59886 }
59887 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59888 InVec.getOperand(0).getValueType().is256BitVector() &&
59889 InVec.getOperand(1).getValueType().is256BitVector() &&
59890 InVec.getOperand(2).getValueType().is256BitVector()) {
59891 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59892 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59893 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59894 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59895 }
59896 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59897 (SizeInBits == 128 || SizeInBits == 256)) {
59898 SDValue InVecSrc = InVec.getOperand(0);
59899 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59900 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59901 return DAG.getNode(InOpcode, DL, VT, Ext);
59902 }
59903
59904 if (SizeInBits == 128 || SizeInBits == 256) {
59905 switch (InOpcode) {
59906 case X86ISD::MOVDDUP:
59907 return DAG.getNode(
59908 InOpcode, DL, VT,
59909 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59910 case X86ISD::PSHUFD:
59911 case X86ISD::VPERMILPI:
59912 if (InVec.getOperand(0).hasOneUse()) {
59913 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59914 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59915 return DAG.getNode(InOpcode, DL, VT,
59916 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59917 DL, SizeInBits),
59918 DAG.getTargetConstant(M, DL, MVT::i8));
59919 }
59920 break;
59921 case X86ISD::PCMPEQ:
59922 case X86ISD::PCMPGT:
59923 case X86ISD::UNPCKH:
59924 case X86ISD::UNPCKL:
59925 if (IsExtractFree(InVec.getOperand(0)) ||
59926 IsExtractFree(InVec.getOperand(1)))
59927 return DAG.getNode(InOpcode, DL, VT,
59928 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59929 DL, SizeInBits),
59930 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59931 DL, SizeInBits));
59932 break;
59933 case X86ISD::CMPP:
59934 if (IsExtractFree(InVec.getOperand(0)) ||
59935 IsExtractFree(InVec.getOperand(1)))
59936 return DAG.getNode(InOpcode, DL, VT,
59937 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59938 DL, SizeInBits),
59939 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59940 DL, SizeInBits),
59941 InVec.getOperand(2));
59942 break;
59943 case X86ISD::BLENDI:
59944 if (IsExtractFree(InVec.getOperand(0)) ||
59945 IsExtractFree(InVec.getOperand(1))) {
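        // vXi16 blends use a per-128-bit-lane immediate, so the mask is reused
        // as-is; for wider elements the immediate covers the whole vector and
        // has to be shifted down to the extracted lanes.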
59946 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59947 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59948 return DAG.getNode(InOpcode, DL, VT,
59949 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59950 DL, SizeInBits),
59951 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59952 DL, SizeInBits),
59953 DAG.getTargetConstant(M, DL, MVT::i8));
59954 }
59955 break;
59956 case X86ISD::VPERMV:
59957 if (IdxVal != 0) {
59958 SDValue Mask = InVec.getOperand(0);
59959 SDValue Src = InVec.getOperand(1);
59960 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59961 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59962 DL, InSizeInBits);
59963 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59964 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59965 }
59966 break;
59967 case X86ISD::VPERMV3:
59968 if (IdxVal != 0) {
59969 SDValue Src0 = InVec.getOperand(0);
59970 SDValue Mask = InVec.getOperand(1);
59971 SDValue Src1 = InVec.getOperand(2);
59972 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59973 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59974 DL, InSizeInBits);
59975 SDValue Shuffle =
59976 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59977 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59978 }
59979 break;
59980 }
59981 }
59982 }
59983
59984 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
59985 // as this is very likely to fold into a shuffle/truncation.
59986 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59987 InVecVT.getScalarSizeInBits() == 64 &&
59988 InVec.getConstantOperandAPInt(1) == 32) {
59989 SDValue Ext =
59990 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59991 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59992 }
59993
59994 return SDValue();
59995}
59996
59997 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59998                                        const X86Subtarget &Subtarget) {
59999 using namespace SDPatternMatch;
60000 EVT VT = N->getValueType(0);
60001 SDValue Src = N->getOperand(0);
60002 SDLoc DL(N);
60003
60004   // If this is a SCALAR_TO_VECTOR to v1i1 from an AND with 1, bypass the AND.
60005 // This occurs frequently in our masked scalar intrinsic code and our
60006 // floating point select lowering with AVX512.
60007 // TODO: SimplifyDemandedBits instead?
60008 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60009 isOneConstant(Src.getOperand(1)))
60010 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60011
60012 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60013 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60014 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60015 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60016 isNullConstant(Src.getOperand(1)))
60017 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60018 Src.getOperand(1));
60019
60020   // Reduce v2i64 to v4i32 if the upper bits aren't needed or are known zero.
60021 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60022 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
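    // IsExt64 (below) reports whether an i64 value really only carries 32 bits
    // of payload: an any/zero-extend of a narrower value, a narrow extending
    // load, or a value whose upper 32 bits are known to be zero.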
60023 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60024 if (Op.getValueType() != MVT::i64)
60025 return SDValue();
60026 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60027 if (Op.getOpcode() == Opc &&
60028 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60029 return Op.getOperand(0);
60030 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60031 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60032 if (Ld->getExtensionType() == Ext &&
60033 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60034 return Op;
60035 if (IsZeroExt) {
60036 KnownBits Known = DAG.computeKnownBits(Op);
60037 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60038 return Op;
60039 }
60040 return SDValue();
60041 };
60042
60043 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60044 return DAG.getBitcast(
60045 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60046 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60047
60048 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60049 return DAG.getBitcast(
60050 VT,
60051 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60052 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60053 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60054 }
60055
60056 if (Src.getOpcode() == ISD::BITCAST) {
60057 SDValue SrcOp = Src.getOperand(0);
60058 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60059 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60060 return DAG.getBitcast(
60061 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60062 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60063 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60064 return DAG.getBitcast(
60065 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60066 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60067 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60068 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60069 }
60070
60071 if (VT == MVT::v4i32) {
60072 SDValue HalfSrc;
60073 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60074 // to remove XMM->GPR->XMM moves.
60075 if (sd_match(Src, m_AnyExt(m_BitCast(
60076 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60077 return DAG.getBitcast(
60078 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60079 }
60080
60081 // See if we're broadcasting the scalar value, in which case just reuse that.
60082 // Ensure the same SDValue from the SDNode use is being used.
60083 if (VT.getScalarType() == Src.getValueType())
60084 for (SDNode *User : Src->users())
60085 if (User->getOpcode() == X86ISD::VBROADCAST &&
60086 Src == User->getOperand(0)) {
60087 unsigned SizeInBits = VT.getFixedSizeInBits();
60088 unsigned BroadcastSizeInBits =
60089 User->getValueSizeInBits(0).getFixedValue();
60090 if (BroadcastSizeInBits == SizeInBits)
60091 return SDValue(User, 0);
60092 if (BroadcastSizeInBits > SizeInBits)
60093 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60094 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60095 // coverage.
60096 }
60097
60098 // Check for cases where we've ended up with a scalarized shift, typically
60099 // during type legalization.
60100 switch (Src.getOpcode()) {
60101 case ISD::SHL:
60102 case ISD::SRL:
60103 case ISD::SRA:
60104 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60105 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60106 Src.hasOneUse()) {
60107 SDValue SrcVec =
60108 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60109 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60110 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60111 Amt->getZExtValue(), DAG);
60112 }
60113 }
60114 break;
60115 case ISD::FSHL:
60116 case ISD::FSHR:
60117 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60118 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60119 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60120 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60121 Src.hasOneUse()) {
60122 uint64_t AmtVal =
60123 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60124 SDValue SrcVec0 =
60125 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60126 SDValue SrcVec1 =
60127 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60128 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60129 DAG.getConstant(AmtVal, DL, VT));
60130 }
60131 }
60132 break;
60133 }
60134
60135 return SDValue();
60136}
60137
60138// Simplify PMULDQ and PMULUDQ operations.
60139 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60140                              TargetLowering::DAGCombinerInfo &DCI,
60141                              const X86Subtarget &Subtarget) {
60142 SDValue LHS = N->getOperand(0);
60143 SDValue RHS = N->getOperand(1);
60144
60145 // Canonicalize constant to RHS.
60146   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60147       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60148     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60149
60150 // Multiply by zero.
60151 // Don't return RHS as it may contain UNDEFs.
60152 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60153 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60154
60155 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60156 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60157 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60158 return SDValue(N, 0);
60159
60160 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60161 // convert it to any_extend_invec, due to the LegalOperations check, do the
60162 // conversion directly to a vector shuffle manually. This exposes combine
60163 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60164 // combineX86ShufflesRecursively on SSE4.1 targets.
60165 // FIXME: This is basically a hack around several other issues related to
60166 // ANY_EXTEND_VECTOR_INREG.
60167 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60168 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60169 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60170 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60171 SDLoc dl(N);
60172 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60173 LHS.getOperand(0), { 0, -1, 1, -1 });
60174 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60175 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60176 }
60177 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60178 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60179 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60180 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60181 SDLoc dl(N);
60182 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60183 RHS.getOperand(0), { 0, -1, 1, -1 });
60184 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60185 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60186 }
60187
60188 return SDValue();
60189}
60190
60191// Simplify VPMADDUBSW/VPMADDWD operations.
60192 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60193                              TargetLowering::DAGCombinerInfo &DCI) {
60194   MVT VT = N->getSimpleValueType(0);
60195 SDValue LHS = N->getOperand(0);
60196 SDValue RHS = N->getOperand(1);
60197 unsigned Opc = N->getOpcode();
60198 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60199   assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
60200          "Unexpected PMADD opcode");
60201
60202 // Multiply by zero.
60203 // Don't return LHS/RHS as it may contain UNDEFs.
60204 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60205       ISD::isBuildVectorAllZeros(RHS.getNode()))
60206     return DAG.getConstant(0, SDLoc(N), VT);
60207
60208 // Constant folding.
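  // Each result element folds a pair of adjacent source elements: VPMADDWD
  // sign-extends both products and adds them, VPMADDUBSW multiplies unsigned
  // LHS bytes by signed RHS bytes and combines them with signed saturation.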
60209 APInt LHSUndefs, RHSUndefs;
60210 SmallVector<APInt> LHSBits, RHSBits;
60211 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60212 unsigned DstEltBits = VT.getScalarSizeInBits();
60213 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60214 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60215 SmallVector<APInt> Result;
60216 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60217 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60218 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60219 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60220 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60221 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60222 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60223 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60224 Result.push_back(Res);
60225 }
60226 return getConstVector(Result, VT, DAG, SDLoc(N));
60227 }
60228
60229 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60230 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60231 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60232 return SDValue(N, 0);
60233
60234 return SDValue();
60235}
60236
60237// Simplify VPMADD52L/VPMADD52H operations.
60238 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60239                                  TargetLowering::DAGCombinerInfo &DCI) {
60240   MVT VT = N->getSimpleValueType(0);
60241
60242 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60243 SDValue Op0 = N->getOperand(0);
60244 SDValue Op1 = N->getOperand(1);
60245 SDValue Op2 = N->getOperand(2);
60246 SDLoc DL(N);
60247
60248 APInt C0, C1;
60249 bool HasC0 = X86::isConstantSplat(Op0, C0),
60250 HasC1 = X86::isConstantSplat(Op1, C1);
60251
60252 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60253 if (HasC0 && !HasC1)
60254 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60255
60256 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60257 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60258 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
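    // At least 12 leading zero bits means the value fits in 52 bits, so the
    // low 52 bits of Op0 * 1 are just Op0 and a plain ADD is enough.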
60259 if (KnownOp0.countMinLeadingZeros() >= 12)
60260 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60261 }
60262
60263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60264 unsigned NumEltBits = VT.getScalarSizeInBits();
60265 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60266 DCI))
60267 return SDValue(N, 0);
60268
60269 return SDValue();
60270}
60271
60272 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60273                                           TargetLowering::DAGCombinerInfo &DCI,
60274                                           const X86Subtarget &Subtarget) {
60275 EVT VT = N->getValueType(0);
60276 SDValue In = N->getOperand(0);
60277 unsigned Opcode = N->getOpcode();
60278 unsigned InOpcode = In.getOpcode();
60279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60280 SDLoc DL(N);
60281
60282 // Try to merge vector loads and extend_inreg to an extload.
60283 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60284 In.hasOneUse()) {
60285 auto *Ld = cast<LoadSDNode>(In);
60286 if (Ld->isSimple()) {
60287 MVT SVT = In.getSimpleValueType().getVectorElementType();
60288       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60289                                  ? ISD::SEXTLOAD
60290                                  : ISD::ZEXTLOAD;
60291 EVT MemVT = VT.changeVectorElementType(SVT);
60292 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60293 SDValue Load = DAG.getExtLoad(
60294 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60295 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60296 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60297 return Load;
60298 }
60299 }
60300 }
60301
60302 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60303 if (Opcode == InOpcode)
60304 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60305
60306 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60307 // -> EXTEND_VECTOR_INREG(X).
60308 // TODO: Handle non-zero subvector indices.
60309 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60310 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60311 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60312 In.getValueSizeInBits())
60313 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60314
60315 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60316 // TODO: Move to DAGCombine?
60317 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60318 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60319 In.getValueSizeInBits() == VT.getSizeInBits()) {
60320 unsigned NumElts = VT.getVectorNumElements();
60321 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60322 EVT EltVT = In.getOperand(0).getValueType();
60323 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60324 for (unsigned I = 0; I != NumElts; ++I)
60325 Elts[I * Scale] = In.getOperand(I);
60326 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60327 }
60328
60329 // Attempt to combine as a shuffle on SSE41+ targets.
60330 if (Subtarget.hasSSE41()) {
60331 SDValue Op(N, 0);
60332 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60333 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60334 return Res;
60335 }
60336
60337 return SDValue();
60338}
60339
60340 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60341                              TargetLowering::DAGCombinerInfo &DCI) {
60342   EVT VT = N->getValueType(0);
60343 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60344 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60345 return DAG.getConstant(0, SDLoc(N), VT);
60346
60347 // Fold kshiftr(extract_subvector(X,C1),C2)
60348 // --> extract_subvector(kshiftr(X,C1+C2),0)
60349 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60350 if (N->getOpcode() == X86ISD::KSHIFTR) {
60351 SDLoc DL(N);
60352 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60353 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60354 SDValue Src = N->getOperand(0).getOperand(0);
60355 uint64_t Amt = N->getConstantOperandVal(1) +
60356 N->getOperand(0).getConstantOperandVal(1);
60357 EVT SrcVT = Src.getValueType();
60358 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60359 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60360 DAG.getTargetConstant(Amt, DL, MVT::i8));
60361 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60362 DAG.getVectorIdxConstant(0, DL));
60363 }
60364 }
60365 }
60366
60367 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60368 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60369 return SDValue(N, 0);
60370
60371 return SDValue();
60372}
60373
60374// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60375 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
60376 // produce extra instructions between the conversions due to going to scalar and back.
60377 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60378                                  const X86Subtarget &Subtarget) {
60379 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60380 return SDValue();
60381
60382 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60383 return SDValue();
60384
60385 if (N->getValueType(0) != MVT::f32 ||
60386 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60387 return SDValue();
60388
60389 SDLoc dl(N);
60390 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60391 N->getOperand(0).getOperand(0));
60392 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60393 DAG.getTargetConstant(4, dl, MVT::i32));
60394 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60395 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60396 DAG.getVectorIdxConstant(0, dl));
60397}
60398
60399 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60400                                 TargetLowering::DAGCombinerInfo &DCI,
60401                                 const X86Subtarget &Subtarget) {
60402 EVT VT = N->getValueType(0);
60403 bool IsStrict = N->isStrictFPOpcode();
60404 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60405 EVT SrcVT = Src.getValueType();
60406
60407 SDLoc dl(N);
60408 if (SrcVT.getScalarType() == MVT::bf16) {
60409 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60410 !IsStrict && Src.getOperand(0).getValueType() == VT)
60411 return Src.getOperand(0);
60412
60413 if (!SrcVT.isVector())
60414 return SDValue();
60415
60416 assert(!IsStrict && "Strict FP doesn't support BF16");
60417 if (VT.getVectorElementType() == MVT::f64) {
60418 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60419 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60420 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60421 }
60422 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60423 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60424 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60425 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60426 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60427 return DAG.getBitcast(VT, Src);
60428 }
60429
60430 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60431 return SDValue();
60432
60433 if (Subtarget.hasFP16())
60434 return SDValue();
60435
60436 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60437 return SDValue();
60438
60439 if (VT.getVectorElementType() != MVT::f32 &&
60440 VT.getVectorElementType() != MVT::f64)
60441 return SDValue();
60442
60443 unsigned NumElts = VT.getVectorNumElements();
60444 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60445 return SDValue();
60446
60447 // Convert the input to vXi16.
60448 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60449 Src = DAG.getBitcast(IntVT, Src);
60450
60451 // Widen to at least 8 input elements.
60452 if (NumElts < 8) {
60453 unsigned NumConcats = 8 / NumElts;
60454 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60455 : DAG.getConstant(0, dl, IntVT);
60456 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60457 Ops[0] = Src;
60458 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60459 }
60460
60461 // Destination is vXf32 with at least 4 elements.
60462 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60463 std::max(4U, NumElts));
60464 SDValue Cvt, Chain;
60465 if (IsStrict) {
60466 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60467 {N->getOperand(0), Src});
60468 Chain = Cvt.getValue(1);
60469 } else {
60470 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60471 }
60472
60473 if (NumElts < 4) {
60474 assert(NumElts == 2 && "Unexpected size");
60475 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60476 DAG.getVectorIdxConstant(0, dl));
60477 }
60478
60479 if (IsStrict) {
60480 // Extend to the original VT if necessary.
60481 if (Cvt.getValueType() != VT) {
60482 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60483 {Chain, Cvt});
60484 Chain = Cvt.getValue(1);
60485 }
60486 return DAG.getMergeValues({Cvt, Chain}, dl);
60487 }
60488
60489 // Extend to the original VT if necessary.
60490 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60491}
60492
60493// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60494 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60495                                      TargetLowering::DAGCombinerInfo &DCI) {
60496   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60497 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60498 "Unknown broadcast load type");
60499
60500 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60501 SDValue Ptr = MemIntrin->getBasePtr();
60502 SDValue Chain = MemIntrin->getChain();
60503 EVT VT = N->getSimpleValueType(0);
60504 EVT MemVT = MemIntrin->getMemoryVT();
60505
60506 // Look at other users of our base pointer and try to find a wider broadcast.
60507 // The input chain and the size of the memory VT must match.
60508 for (SDNode *User : Ptr->users())
60509 if (User != N && User->getOpcode() == N->getOpcode() &&
60510 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60511 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60512 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60513 MemVT.getSizeInBits() &&
60514 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60515       assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60516              MemIntr->isSimple() && "Illegal broadcast load type");
60518 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60519 VT.getSizeInBits());
60520 Extract = DAG.getBitcast(VT, Extract);
60521 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60522 return Extract;
60523 }
60524
60525 return SDValue();
60526}
60527
60528 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60529                                const X86Subtarget &Subtarget) {
60530 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60531 return SDValue();
60532
60533 bool IsStrict = N->isStrictFPOpcode();
60534 EVT VT = N->getValueType(0);
60535 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60536 EVT SrcVT = Src.getValueType();
60537
60538 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60539 SrcVT.getVectorElementType() != MVT::f32)
60540 return SDValue();
60541
60542 SDLoc dl(N);
60543
60544 SDValue Cvt, Chain;
60545 unsigned NumElts = VT.getVectorNumElements();
60546 if (Subtarget.hasFP16()) {
60547 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60548 // v4f32 (xint_to_fp v4i64))))
60549 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60550 // v8f16 (CVTXI2P v4i64)))
60551 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60552 Src.getNumOperands() == 2) {
60553 SDValue Cvt0, Cvt1;
60554 SDValue Op0 = Src.getOperand(0);
60555 SDValue Op1 = Src.getOperand(1);
60556 bool IsOp0Strict = Op0->isStrictFPOpcode();
60557 if (Op0.getOpcode() != Op1.getOpcode() ||
60558 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60559 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60560 return SDValue();
60561 }
60562 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60563 if (IsStrict) {
60564 assert(IsOp0Strict && "Op0 must be strict node");
60565 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60566                            ? X86ISD::STRICT_CVTSI2P
60567                            : X86ISD::STRICT_CVTUI2P;
60568         Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60569 {Op0.getOperand(0), Op0.getOperand(1)});
60570 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60571 {Op1.getOperand(0), Op1.getOperand(1)});
60572 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60573 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60574 }
60575 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60576                                                         : X86ISD::CVTUI2P;
60577       Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60578 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60579 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60580 }
60581 return SDValue();
60582 }
60583
60584 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60585 return SDValue();
60586
60587 // Widen to at least 4 input elements.
60588 if (NumElts < 4)
60589 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60590 DAG.getConstantFP(0.0, dl, SrcVT));
60591
60592 // Destination is v8i16 with at least 8 elements.
60593 EVT CvtVT =
60594 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60595 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60596 if (IsStrict) {
60597 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60598 {N->getOperand(0), Src, Rnd});
60599 Chain = Cvt.getValue(1);
60600 } else {
60601 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60602 }
60603
60604 // Extract down to real number of elements.
60605 if (NumElts < 8) {
60606     EVT IntVT = VT.changeVectorElementTypeToInteger();
60607     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60608 DAG.getVectorIdxConstant(0, dl));
60609 }
60610
60611 Cvt = DAG.getBitcast(VT, Cvt);
60612
60613 if (IsStrict)
60614 return DAG.getMergeValues({Cvt, Chain}, dl);
60615
60616 return Cvt;
60617}
60618
60619 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60620   SDValue Src = N->getOperand(0);
60621
60622 // Turn MOVDQ2Q+simple_load into an mmx load.
60623 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60624 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60625
60626 if (LN->isSimple()) {
60627 SDValue NewLd =
60628 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60629 LN->getPointerInfo(), LN->getBaseAlign(),
60630 LN->getMemOperand()->getFlags());
60631 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60632 return NewLd;
60633 }
60634 }
60635
60636 return SDValue();
60637}
60638
60639 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60640                            TargetLowering::DAGCombinerInfo &DCI) {
60641   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60642 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60643 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60644 return SDValue(N, 0);
60645
60646 return SDValue();
60647}
60648
60649// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60650// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60651// use x86mmx instead.
60652 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60653   SDLoc dl(N);
60654
60655 bool MadeChange = false, CastReturnVal = false;
60656   SmallVector<SDValue, 8> Args;
60657   for (const SDValue &Arg : N->op_values()) {
60658 if (Arg.getValueType() == MVT::v1i64) {
60659 MadeChange = true;
60660 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60661 } else
60662 Args.push_back(Arg);
60663 }
60664 SDVTList VTs = N->getVTList();
60665 SDVTList NewVTs = VTs;
60666 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60667 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60668 NewVTArr[0] = MVT::x86mmx;
60669 NewVTs = DAG.getVTList(NewVTArr);
60670 MadeChange = true;
60671 CastReturnVal = true;
60672 }
60673
60674 if (MadeChange) {
60675 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60676 if (CastReturnVal) {
60677       SmallVector<SDValue, 2> Returns;
60678       for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60679 Returns.push_back(Result.getValue(i));
60680 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60681 return DAG.getMergeValues(Returns, dl);
60682 }
60683 return Result;
60684 }
60685 return SDValue();
60686}
60687 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60688                                          TargetLowering::DAGCombinerInfo &DCI) {
60689   if (!DCI.isBeforeLegalize())
60690 return SDValue();
60691
60692 unsigned IntNo = N->getConstantOperandVal(0);
60693 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60694
60695 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60696 return FixupMMXIntrinsicTypes(N, DAG);
60697
60698 return SDValue();
60699}
60700
60701 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60702                                         TargetLowering::DAGCombinerInfo &DCI) {
60703   if (!DCI.isBeforeLegalize())
60704 return SDValue();
60705
60706 unsigned IntNo = N->getConstantOperandVal(1);
60707 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60708
60709 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60710 return FixupMMXIntrinsicTypes(N, DAG);
60711
60712 return SDValue();
60713}
60714
60715 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60716                                      TargetLowering::DAGCombinerInfo &DCI) {
60717   if (!DCI.isBeforeLegalize())
60718 return SDValue();
60719
60720 unsigned IntNo = N->getConstantOperandVal(1);
60721 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60722
60723 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60724 return FixupMMXIntrinsicTypes(N, DAG);
60725
60726 return SDValue();
60727}
60728
60729 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60730                                              DAGCombinerInfo &DCI) const {
60731 SelectionDAG &DAG = DCI.DAG;
60732 switch (N->getOpcode()) {
60733 // clang-format off
60734 default: break;
60735   case ISD::SCALAR_TO_VECTOR:
60736     return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60737   case ISD::EXTRACT_VECTOR_ELT:
60738   case X86ISD::PEXTRW:
60739 case X86ISD::PEXTRB:
60740 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60741   case ISD::CONCAT_VECTORS:
60742     return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60743   case ISD::INSERT_SUBVECTOR:
60744     return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60745   case ISD::EXTRACT_SUBVECTOR:
60746     return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60747 case ISD::VSELECT:
60748 case ISD::SELECT:
60749 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60750 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60751 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60752 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60753 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60754 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60755 case X86ISD::ADD:
60756 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60757 case X86ISD::CLOAD:
60758 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60759 case X86ISD::SBB: return combineSBB(N, DAG);
60760 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60761 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60762 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60763 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60764 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60765 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60766 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60767 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60768 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60769 case ISD::AVGCEILS:
60770 case ISD::AVGCEILU:
60771 case ISD::AVGFLOORS:
60772 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60773 case X86ISD::BEXTR:
60774 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60775 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60776 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60777 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60778 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60779   case X86ISD::VEXTRACT_STORE:
60780     return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60781 case ISD::SINT_TO_FP:
60782   case ISD::STRICT_SINT_TO_FP:
60783     return combineSIntToFP(N, DAG, DCI, Subtarget);
60784 case ISD::UINT_TO_FP:
60785   case ISD::STRICT_UINT_TO_FP:
60786     return combineUIntToFP(N, DAG, Subtarget);
60787 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60788 case ISD::LRINT:
60789 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60790 case ISD::FADD:
60791 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60792 case X86ISD::VFCMULC:
60793 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60794 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60795 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60796 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60797 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60798 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60799 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60800 case X86ISD::FXOR:
60801 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60802 case X86ISD::FMIN:
60803 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60804 case ISD::FMINNUM:
60805 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60806 case X86ISD::CVTSI2P:
60807 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60808 case X86ISD::CVTP2SI:
60809 case X86ISD::CVTP2UI:
60810   case X86ISD::STRICT_CVTTP2SI:
60811   case X86ISD::CVTTP2SI:
60812   case X86ISD::STRICT_CVTTP2UI:
60813   case X86ISD::CVTTP2UI:
60814 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60815   case X86ISD::STRICT_CVTPH2PS:
60816   case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60817 case X86ISD::BT: return combineBT(N, DAG, DCI);
60818 case ISD::ANY_EXTEND:
60819 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60820 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60821 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60822   case ISD::ANY_EXTEND_VECTOR_INREG:
60823   case ISD::SIGN_EXTEND_VECTOR_INREG:
60824   case ISD::ZERO_EXTEND_VECTOR_INREG:
60825     return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60826 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60827 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60828 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60829 case X86ISD::PACKSS:
60830 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60831 case X86ISD::HADD:
60832 case X86ISD::HSUB:
60833 case X86ISD::FHADD:
60834 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60835 case X86ISD::VSHL:
60836 case X86ISD::VSRA:
60837 case X86ISD::VSRL:
60838 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60839 case X86ISD::VSHLI:
60840 case X86ISD::VSRAI:
60841 case X86ISD::VSRLI:
60842 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60843   case ISD::INSERT_VECTOR_ELT:
60844   case X86ISD::PINSRB:
60845 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60846 case X86ISD::SHUFP: // Handle all target specific shuffles
60847 case X86ISD::INSERTPS:
60848 case X86ISD::EXTRQI:
60849 case X86ISD::INSERTQI:
60850 case X86ISD::VALIGN:
60851 case X86ISD::PALIGNR:
60852 case X86ISD::VSHLDQ:
60853 case X86ISD::VSRLDQ:
60854 case X86ISD::BLENDI:
60855 case X86ISD::UNPCKH:
60856 case X86ISD::UNPCKL:
60857 case X86ISD::MOVHLPS:
60858 case X86ISD::MOVLHPS:
60859 case X86ISD::PSHUFB:
60860 case X86ISD::PSHUFD:
60861 case X86ISD::PSHUFHW:
60862 case X86ISD::PSHUFLW:
60863 case X86ISD::MOVSHDUP:
60864 case X86ISD::MOVSLDUP:
60865 case X86ISD::MOVDDUP:
60866 case X86ISD::MOVSS:
60867 case X86ISD::MOVSD:
60868 case X86ISD::MOVSH:
60869 case X86ISD::VBROADCAST:
60870 case X86ISD::VPPERM:
60871 case X86ISD::VPERMI:
60872 case X86ISD::VPERMV:
60873 case X86ISD::VPERMV3:
60874 case X86ISD::VPERMIL2:
60875 case X86ISD::VPERMILPI:
60876 case X86ISD::VPERMILPV:
60877 case X86ISD::VPERM2X128:
60878 case X86ISD::SHUF128:
60879 case X86ISD::VZEXT_MOVL:
60880 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60881 case X86ISD::FMADD_RND:
60882 case X86ISD::FMSUB:
60883   case X86ISD::STRICT_FMSUB:
60884   case X86ISD::FMSUB_RND:
60885 case X86ISD::FNMADD:
60886   case X86ISD::STRICT_FNMADD:
60887   case X86ISD::FNMADD_RND:
60888 case X86ISD::FNMSUB:
60889   case X86ISD::STRICT_FNMSUB:
60890   case X86ISD::FNMSUB_RND:
60891 case ISD::FMA:
60892 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60893   case X86ISD::FMADDSUB_RND:
60894   case X86ISD::FMSUBADD_RND:
60895   case X86ISD::FMADDSUB:
60896 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60897 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60898 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60899 case X86ISD::MGATHER:
60900 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60901 case ISD::MGATHER:
60902 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60903 case X86ISD::PCMPEQ:
60904 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60905 case X86ISD::PMULDQ:
60906 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60907 case X86ISD::VPMADDUBSW:
60908 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60909 case X86ISD::VPMADD52L:
60910 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60911 case X86ISD::KSHIFTL:
60912 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60913 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60914   case ISD::STRICT_FP_EXTEND:
60915   case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60916   case ISD::STRICT_FP_ROUND:
60917   case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60918   case X86ISD::VBROADCAST_LOAD:
60919   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60920 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60921 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60922 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60923 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60924 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60925   case ISD::FP_TO_SINT_SAT:
60926   case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60927 // clang-format on
60928 }
60929
60930 return SDValue();
60931}
60932
60933 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
60934   return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60935}
60936
60937// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60938 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60939                                                   EVT ExtVT) const {
60940 return Subtarget.hasAVX512() || !VT.isVector();
60941}
60942
60943 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60944   if (!isTypeLegal(VT))
60945 return false;
60946
60947 // There are no vXi8 shifts.
60948 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60949 return false;
60950
60951 // TODO: Almost no 8-bit ops are desirable because they have no actual
60952 // size/speed advantages vs. 32-bit ops, but they do have a major
60953 // potential disadvantage by causing partial register stalls.
60954 //
60955 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60956 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60957 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60958 // check for a constant operand to the multiply.
60959 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60960 return false;
60961
60962 // i16 instruction encodings are longer and some i16 instructions are slow,
60963 // so those are not desirable.
60964 if (VT == MVT::i16) {
60965 switch (Opc) {
60966 default:
60967 break;
60968 case ISD::LOAD:
60969 case ISD::SIGN_EXTEND:
60970 case ISD::ZERO_EXTEND:
60971 case ISD::ANY_EXTEND:
60972 case ISD::MUL:
60973 return false;
60974 case ISD::SHL:
60975 case ISD::SRA:
60976 case ISD::SRL:
60977 case ISD::SUB:
60978 case ISD::ADD:
60979 case ISD::AND:
60980 case ISD::OR:
60981 case ISD::XOR:
60982       // NDD instructions never have the "partial register write" issue, because
60983       // the destination register's upper bits [63:OSIZE] are zeroed even when
60984       // OSIZE=8/16.
60985 return Subtarget.hasNDD();
60986 }
60987 }
60988
60989 // Any legal type not explicitly accounted for above here is desirable.
60990 return true;
60991}
60992
60993 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60994                                                   SDValue Value, SDValue Addr,
60995 int JTI,
60996 SelectionDAG &DAG) const {
60997 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60998 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60999 if (IsCFProtectionSupported) {
61000 // In case control-flow branch protection is enabled, we need to add
61001 // notrack prefix to the indirect branch.
61002 // In order to do that we create NT_BRIND SDNode.
61003 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
61004 SDValue Chain = Value;
61005 // Jump table debug info is only needed if CodeView is enabled.
61006     if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
61007       Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61008 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61009 }
61010
61011 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61012}
61013
61014 TargetLowering::AndOrSETCCFoldKind
61015 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
61016     const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61017   using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
61018   EVT VT = LogicOp->getValueType(0);
61019 EVT OpVT = SETCC0->getOperand(0).getValueType();
61020   if (!VT.isInteger())
61021     return AndOrSETCCFoldKind::None;
61022
61023   if (VT.isVector())
61024     return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
61025                               (isOperationLegal(ISD::ABS, OpVT)
61026                                    ? AndOrSETCCFoldKind::ABS
61027                                    : AndOrSETCCFoldKind::None));
61028
61029 // Don't use `NotAnd`: even though `not` is generally shorter code size than
61030 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
61031 // where `NotAnd` applies, `AddAnd` does as well.
61032 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61033 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61035}
61036
61038 EVT VT = Op.getValueType();
61039 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61040 isa<ConstantSDNode>(Op.getOperand(1));
61041
61042 // i16 is legal, but undesirable since i16 instruction encodings are longer
61043 // and some i16 instructions are slow.
61044 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61045 // using LEA and/or other ALU ops.
61046 if (VT != MVT::i16 && !Is8BitMulByConstant)
61047 return false;
61048
61049 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61050 if (!Op.hasOneUse())
61051 return false;
61052 SDNode *User = *Op->user_begin();
61054 return false;
61055 auto *Ld = cast<LoadSDNode>(Load);
61056 auto *St = cast<StoreSDNode>(User);
61057 return Ld->getBasePtr() == St->getBasePtr();
61058 };
61059
61060 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61061 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61062 return false;
61063 if (!Op.hasOneUse())
61064 return false;
61065 SDNode *User = *Op->user_begin();
61066 if (User->getOpcode() != ISD::ATOMIC_STORE)
61067 return false;
61068 auto *Ld = cast<AtomicSDNode>(Load);
61069 auto *St = cast<AtomicSDNode>(User);
61070 return Ld->getBasePtr() == St->getBasePtr();
61071 };
61072
61073 auto IsFoldableZext = [](SDValue Op) {
61074 if (!Op.hasOneUse())
61075 return false;
61076 SDNode *User = *Op->user_begin();
61077 EVT VT = User->getValueType(0);
61078 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61079 (VT == MVT::i32 || VT == MVT::i64));
61080 };
61081
61082 bool Commute = false;
61083 switch (Op.getOpcode()) {
61084 default: return false;
61085 case ISD::SIGN_EXTEND:
61086 case ISD::ZERO_EXTEND:
61087 case ISD::ANY_EXTEND:
61088 break;
61089 case ISD::SHL:
61090 case ISD::SRA:
61091 case ISD::SRL: {
61092 SDValue N0 = Op.getOperand(0);
61093 // Look out for (store (shl (load), x)).
61094 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61095 return false;
61096 break;
61097 }
61098 case ISD::MUL:
61099 // When ZU is enabled, we prefer not to promote a MUL by a constant
61100 // when there is an opportunity to fold a zext with imulzu.
61101 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61102 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61103 isa<ConstantSDNode>(Op.getOperand(1))))
61104 return false;
61105 [[fallthrough]];
61106 case ISD::ADD:
61107 case ISD::AND:
61108 case ISD::OR:
61109 case ISD::XOR:
61110 Commute = true;
61111 [[fallthrough]];
61112 case ISD::SUB: {
61113 SDValue N0 = Op.getOperand(0);
61114 SDValue N1 = Op.getOperand(1);
61115 // Avoid disabling potential load folding opportunities.
61116 if (X86::mayFoldLoad(N1, Subtarget) &&
61117 (!Commute || !isa<ConstantSDNode>(N0) ||
61118 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61119 return false;
61120 if (X86::mayFoldLoad(N0, Subtarget) &&
61121 ((Commute && !isa<ConstantSDNode>(N1)) ||
61122 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61123 return false;
61124 if (IsFoldableAtomicRMW(N0, Op) ||
61125 (Commute && IsFoldableAtomicRMW(N1, Op)))
61126 return false;
61127 }
61128 }
61129
61130 PVT = MVT::i32;
61131 return true;
61132}
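
// Rough illustration (hypothetical C, not from this file) of the promotion
// decision above: a plain i16 add such as
//
//   short add16(short a, short b) { return a + b; }
//
// is usually promoted to a 32-bit add to avoid the 0x66 operand-size prefix
// and partial-register stalls, whereas a read-modify-write form such as
//
//   void add16_rmw(short *p, short x) { *p += x; }
//
// is left as a 16-bit memory op (e.g. `addw %si, (%rdi)`) so the load/store
// folding checked for above is not lost. Exact output depends on the
// subtarget and optimization level.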
61133
61134//===----------------------------------------------------------------------===//
61135// X86 Inline Assembly Support
61136//===----------------------------------------------------------------------===//
61137
61140 .Case("{@cca}", X86::COND_A)
61141 .Case("{@ccae}", X86::COND_AE)
61142 .Case("{@ccb}", X86::COND_B)
61143 .Case("{@ccbe}", X86::COND_BE)
61144 .Case("{@ccc}", X86::COND_B)
61145 .Case("{@cce}", X86::COND_E)
61146 .Case("{@ccz}", X86::COND_E)
61147 .Case("{@ccg}", X86::COND_G)
61148 .Case("{@ccge}", X86::COND_GE)
61149 .Case("{@ccl}", X86::COND_L)
61150 .Case("{@ccle}", X86::COND_LE)
61151 .Case("{@ccna}", X86::COND_BE)
61152 .Case("{@ccnae}", X86::COND_B)
61153 .Case("{@ccnb}", X86::COND_AE)
61154 .Case("{@ccnbe}", X86::COND_A)
61155 .Case("{@ccnc}", X86::COND_AE)
61156 .Case("{@ccne}", X86::COND_NE)
61157 .Case("{@ccnz}", X86::COND_NE)
61158 .Case("{@ccng}", X86::COND_LE)
61159 .Case("{@ccnge}", X86::COND_L)
61160 .Case("{@ccnl}", X86::COND_GE)
61161 .Case("{@ccnle}", X86::COND_G)
61162 .Case("{@ccno}", X86::COND_NO)
61163 .Case("{@ccnp}", X86::COND_NP)
61164 .Case("{@ccns}", X86::COND_NS)
61165 .Case("{@cco}", X86::COND_O)
61166 .Case("{@ccp}", X86::COND_P)
61167 .Case("{@ccs}", X86::COND_S)
61169 return Cond;
61170}
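
// Illustrative usage (hypothetical, not from this file): these strings are
// the IR-level spelling of the GCC/Clang "flag output" constraints, written
// in C source as "=@cc<cond>", e.g.
//
//   int is_below(unsigned a, unsigned b) {
//     int r;
//     asm("cmpl %2, %1" : "=@ccb"(r) : "r"(a), "r"(b));
//     return r;   // r == 1 iff the carry (below) condition was set
//   }
//
// which reaches this parser as the constraint string "{@ccb}" and maps to
// X86::COND_B above.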
61171
61172/// Given a constraint letter, return the type of constraint for this target.
61175 if (Constraint.size() == 1) {
61176 switch (Constraint[0]) {
61177 case 'R':
61178 case 'q':
61179 case 'Q':
61180 case 'f':
61181 case 't':
61182 case 'u':
61183 case 'y':
61184 case 'x':
61185 case 'v':
61186 case 'l':
61187 case 'k': // AVX512 masking registers.
61188 return C_RegisterClass;
61189 case 'a':
61190 case 'b':
61191 case 'c':
61192 case 'd':
61193 case 'S':
61194 case 'D':
61195 case 'A':
61196 return C_Register;
61197 case 'I':
61198 case 'J':
61199 case 'K':
61200 case 'N':
61201 case 'G':
61202 case 'L':
61203 case 'M':
61204 return C_Immediate;
61205 case 'C':
61206 case 'e':
61207 case 'Z':
61208 return C_Other;
61209 default:
61210 break;
61211 }
61212 }
61213 else if (Constraint.size() == 2) {
61214 switch (Constraint[0]) {
61215 default:
61216 break;
61217 case 'W':
61218 if (Constraint[1] != 's')
61219 break;
61220 return C_Other;
61221 case 'Y':
61222 switch (Constraint[1]) {
61223 default:
61224 break;
61225 case 'z':
61226 return C_Register;
61227 case 'i':
61228 case 'm':
61229 case 'k':
61230 case 't':
61231 case '2':
61232 return C_RegisterClass;
61233 }
61234 break;
61235 case 'j':
61236 switch (Constraint[1]) {
61237 default:
61238 break;
61239 case 'r':
61240 case 'R':
61241 return C_RegisterClass;
61242 }
61243 }
61244 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61245 return C_Other;
61246 return TargetLowering::getConstraintType(Constraint);
61247}
61248
61249/// Examine constraint type and operand type and determine a weight value.
61250/// This object must already have been set up with the operand type
61251/// and the current alternative constraint selected.
61254 AsmOperandInfo &Info, const char *Constraint) const {
61256 Value *CallOperandVal = Info.CallOperandVal;
61257 // If we don't have a value, we can't do a match,
61258 // but allow it at the lowest weight.
61259 if (!CallOperandVal)
61260 return CW_Default;
61261 Type *Ty = CallOperandVal->getType();
61262 // Look at the constraint type.
61263 switch (*Constraint) {
61264 default:
61266 [[fallthrough]];
61267 case 'R':
61268 case 'q':
61269 case 'Q':
61270 case 'a':
61271 case 'b':
61272 case 'c':
61273 case 'd':
61274 case 'S':
61275 case 'D':
61276 case 'A':
61277 if (CallOperandVal->getType()->isIntegerTy())
61278 Wt = CW_SpecificReg;
61279 break;
61280 case 'f':
61281 case 't':
61282 case 'u':
61283 if (Ty->isFloatingPointTy())
61284 Wt = CW_SpecificReg;
61285 break;
61286 case 'y':
61287 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61288 Wt = CW_SpecificReg;
61289 break;
61290 case 'Y':
61291 if (StringRef(Constraint).size() != 2)
61292 break;
61293 switch (Constraint[1]) {
61294 default:
61295 return CW_Invalid;
61296 // XMM0
61297 case 'z':
61298 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61299 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61300 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61301 return CW_SpecificReg;
61302 return CW_Invalid;
61303 // Conditional OpMask regs (AVX512)
61304 case 'k':
61305 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61306 return CW_Register;
61307 return CW_Invalid;
61308 // Any MMX reg
61309 case 'm':
61310 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61311 return CW_SpecificReg;
61312 return CW_Invalid;
61313 // Any SSE reg when ISA >= SSE2, same as 'x'
61314 case 'i':
61315 case 't':
61316 case '2':
61317 if (!Subtarget.hasSSE2())
61318 return CW_Invalid;
61319 break;
61320 }
61321 break;
61322 case 'j':
61323 if (StringRef(Constraint).size() != 2)
61324 break;
61325 switch (Constraint[1]) {
61326 default:
61327 return CW_Invalid;
61328 case 'r':
61329 case 'R':
61330 if (CallOperandVal->getType()->isIntegerTy())
61331 Wt = CW_SpecificReg;
61332 break;
61333 }
61334 break;
61335 case 'v':
61336 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61337 Wt = CW_Register;
61338 [[fallthrough]];
61339 case 'x':
61340 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61341 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61342 Wt = CW_Register;
61343 break;
61344 case 'k':
61345 // Enable conditional vector operations using %k<#> registers.
61346 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61347 Wt = CW_Register;
61348 break;
61349 case 'I':
61350 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61351 if (C->getZExtValue() <= 31)
61352 Wt = CW_Constant;
61353 break;
61354 case 'J':
61355 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61356 if (C->getZExtValue() <= 63)
61357 Wt = CW_Constant;
61358 break;
61359 case 'K':
61360 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61361 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61362 Wt = CW_Constant;
61363 break;
61364 case 'L':
61365 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61366 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61367 Wt = CW_Constant;
61368 break;
61369 case 'M':
61370 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61371 if (C->getZExtValue() <= 3)
61372 Wt = CW_Constant;
61373 break;
61374 case 'N':
61375 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61376 if (C->getZExtValue() <= 0xff)
61377 Wt = CW_Constant;
61378 break;
61379 case 'G':
61380 case 'C':
61381 if (isa<ConstantFP>(CallOperandVal))
61382 Wt = CW_Constant;
61383 break;
61384 case 'e':
61385 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61386 if ((C->getSExtValue() >= -0x80000000LL) &&
61387 (C->getSExtValue() <= 0x7fffffffLL))
61388 Wt = CW_Constant;
61389 break;
61390 case 'Z':
61391 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61392 if (C->getZExtValue() <= 0xffffffff)
61393 Wt = CW_Constant;
61394 break;
61395 }
61396 return Wt;
61397}
61398
61399/// Try to replace an X constraint, which matches anything, with another that
61400/// has more specific requirements based on the type of the corresponding
61401/// operand.
61403LowerXConstraint(EVT ConstraintVT) const {
61404 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61405 // 'f' like normal targets.
61406 if (ConstraintVT.isFloatingPoint()) {
61407 if (Subtarget.hasSSE1())
61408 return "x";
61409 }
61410
61411 return TargetLowering::LowerXConstraint(ConstraintVT);
61412}
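
// Illustrative note (hypothetical example, not from this file): for
//
//   double d = ...;
//   asm("" : "+X"(d));
//
// the catch-all "X" constraint is narrowed to "x" (an SSE register) when
// SSE1 is available; otherwise it falls back to the default handling, which
// for floating point means the x87 "f" constraint, as the comment above
// notes.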
61413
61414// Lower @cc targets via setcc.
61416 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61417 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61418 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61419 if (Cond == X86::COND_INVALID)
61420 return SDValue();
61421 // Check that return type is valid.
61422 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61423 OpInfo.ConstraintVT.getSizeInBits() < 8)
61424 report_fatal_error("Glue output operand is of invalid type");
61425
61426 // Get EFLAGS register. Only update chain when copyfrom is glued.
61427 if (Glue.getNode()) {
61428 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61429 Chain = Glue.getValue(1);
61430 } else
61431 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61432 // Extract CC code.
61433 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61434 // Zero-extend the condition result to the constraint type.
61435 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61436
61437 return Result;
61438}
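
// Rough sketch of the result (illustrative only): for an output constraint
// such as "=@ccz"(int r), the EFLAGS copy, SETCC, and zero-extension built
// above typically materialize as
//
//   sete   %al            # capture ZF
//   movzbl %al, %eax      # widen to the constraint type (i32 here)
//
// with the asm template itself deciding how the flag was set.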
61439
61440/// Lower the specified operand into the Ops vector.
61441/// If it is invalid, don't add anything to Ops.
61443 StringRef Constraint,
61444 std::vector<SDValue> &Ops,
61445 SelectionDAG &DAG) const {
61446 SDValue Result;
61447 char ConstraintLetter = Constraint[0];
61448 switch (ConstraintLetter) {
61449 default: break;
61450 case 'I':
61451 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61452 if (C->getZExtValue() <= 31) {
61453 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61454 Op.getValueType());
61455 break;
61456 }
61457 }
61458 return;
61459 case 'J':
61460 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61461 if (C->getZExtValue() <= 63) {
61462 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61463 Op.getValueType());
61464 break;
61465 }
61466 }
61467 return;
61468 case 'K':
61469 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61470 if (isInt<8>(C->getSExtValue())) {
61471 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61472 Op.getValueType());
61473 break;
61474 }
61475 }
61476 return;
61477 case 'L':
61478 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61479 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61480 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61481 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61482 Op.getValueType());
61483 break;
61484 }
61485 }
61486 return;
61487 case 'M':
61488 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61489 if (C->getZExtValue() <= 3) {
61490 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61491 Op.getValueType());
61492 break;
61493 }
61494 }
61495 return;
61496 case 'N':
61497 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61498 if (C->getZExtValue() <= 255) {
61499 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61500 Op.getValueType());
61501 break;
61502 }
61503 }
61504 return;
61505 case 'O':
61506 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61507 if (C->getZExtValue() <= 127) {
61508 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61509 Op.getValueType());
61510 break;
61511 }
61512 }
61513 return;
61514 case 'e': {
61515 // 32-bit signed value
61516 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61518 C->getSExtValue())) {
61519 // Widen to 64 bits here to get it sign extended.
61520 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61521 break;
61522 }
61523 // FIXME gcc accepts some relocatable values here too, but only in certain
61524 // memory models; it's complicated.
61525 }
61526 return;
61527 }
61528 case 'W': {
61529 assert(Constraint[1] == 's');
61530 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61531 // offset.
61532 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61533 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61534 BA->getValueType(0)));
61535 } else {
61536 int64_t Offset = 0;
61537 if (Op->getOpcode() == ISD::ADD &&
61538 isa<ConstantSDNode>(Op->getOperand(1))) {
61539 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61540 Op = Op->getOperand(0);
61541 }
61542 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61543 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61544 GA->getValueType(0), Offset));
61545 }
61546 return;
61547 }
61548 case 'Z': {
61549 // 32-bit unsigned value
61550 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61552 C->getZExtValue())) {
61553 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61554 Op.getValueType());
61555 break;
61556 }
61557 }
61558 // FIXME gcc accepts some relocatable values here too, but only in certain
61559 // memory models; it's complicated.
61560 return;
61561 }
61562 case 'i': {
61563 // Literal immediates are always ok.
61564 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61565 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61566 BooleanContent BCont = getBooleanContents(MVT::i64);
61567 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61569 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61570 : CST->getSExtValue();
61571 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61572 break;
61573 }
61574
61575 // In any sort of PIC mode addresses need to be computed at runtime by
61576 // adding in a register or some sort of table lookup. These can't
61577 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61578 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61580 return;
61581
61582 // If we are in non-pic codegen mode, we allow the address of a global (with
61583 // an optional displacement) to be used with 'i'.
61584 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61585 // If we require an extra load to get this address, as in PIC mode, we
61586 // can't accept it.
61588 Subtarget.classifyGlobalReference(GA->getGlobal())))
61589 return;
61590 break;
61591 }
61592 }
61593
61594 if (Result.getNode()) {
61595 Ops.push_back(Result);
61596 return;
61597 }
61598 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61599}
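
// Illustrative usage (hypothetical, not from this file): the immediate
// constraint letters handled above match GCC's x86 ranges, e.g. 'I' accepts
// 0..31 (shift counts) and 'N' accepts 0..255 (in/out port numbers):
//
//   unsigned rol5(unsigned x) {
//     asm("roll %1, %0" : "+r"(x) : "I"(5));
//     return x;
//   }
//
//   void out80(unsigned char v) {
//     asm volatile("outb %0, %1" : : "a"(v), "N"(0x80));
//   }
//
// A constant outside the accepted range is simply not added to Ops, and the
// caller then diagnoses it as an invalid operand for the constraint.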
61600
61601/// Check if \p RC is a general purpose register class.
61602/// I.e., GR* or one of their variant.
61603static bool isGRClass(const TargetRegisterClass &RC) {
61604 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61605 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61606 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61607 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61608 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61609}
61610
61611/// Check if \p RC is a vector register class.
61612/// I.e., FR* / VR* or one of their variant.
61613static bool isFRClass(const TargetRegisterClass &RC) {
61614 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61615 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61616 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61617 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61618 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61619 RC.hasSuperClassEq(&X86::VR512RegClass);
61620}
61621
61622/// Check if \p RC is a mask register class.
61623/// I.e., VK* or one of their variant.
61624static bool isVKClass(const TargetRegisterClass &RC) {
61625 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61626 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61627 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61628 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61629 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61630 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61631 RC.hasSuperClassEq(&X86::VK64RegClass);
61632}
61633
61634static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61635 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61636}
61637
61638std::pair<unsigned, const TargetRegisterClass *>
61640 StringRef Constraint,
61641 MVT VT) const {
61642 // First, see if this is a constraint that directly corresponds to an LLVM
61643 // register class.
61644 if (Constraint.size() == 1) {
61645 // GCC Constraint Letters
61646 switch (Constraint[0]) {
61647 default: break;
61648 // 'A' means [ER]AX + [ER]DX.
61649 case 'A':
61650 if (Subtarget.is64Bit())
61651 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61652 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61653 "Expecting 64, 32 or 16 bit subtarget");
61654 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61655
61656 // TODO: Slight differences here in allocation order and leaving
61657 // RIP in the class. Do they matter any more here than they do
61658 // in the normal allocation?
61659 case 'k':
61660 if (Subtarget.hasAVX512()) {
61661 if (VT == MVT::v1i1 || VT == MVT::i1)
61662 return std::make_pair(0U, &X86::VK1RegClass);
61663 if (VT == MVT::v8i1 || VT == MVT::i8)
61664 return std::make_pair(0U, &X86::VK8RegClass);
61665 if (VT == MVT::v16i1 || VT == MVT::i16)
61666 return std::make_pair(0U, &X86::VK16RegClass);
61667 }
61668 if (Subtarget.hasBWI()) {
61669 if (VT == MVT::v32i1 || VT == MVT::i32)
61670 return std::make_pair(0U, &X86::VK32RegClass);
61671 if (VT == MVT::v64i1 || VT == MVT::i64)
61672 return std::make_pair(0U, &X86::VK64RegClass);
61673 }
61674 break;
61675 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61676 if (Subtarget.is64Bit()) {
61677 if (VT == MVT::i8 || VT == MVT::i1)
61678 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61679 ? &X86::GR8RegClass
61680 : &X86::GR8_NOREX2RegClass);
61681 if (VT == MVT::i16)
61682 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61683 ? &X86::GR16RegClass
61684 : &X86::GR16_NOREX2RegClass);
61685 if (VT == MVT::i32 || VT == MVT::f32)
61686 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61687 ? &X86::GR32RegClass
61688 : &X86::GR32_NOREX2RegClass);
61689 if (VT != MVT::f80 && !VT.isVector())
61690 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61691 ? &X86::GR64RegClass
61692 : &X86::GR64_NOREX2RegClass);
61693 break;
61694 }
61695 [[fallthrough]];
61696 // 32-bit fallthrough
61697 case 'Q': // Q_REGS
61698 if (VT == MVT::i8 || VT == MVT::i1)
61699 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61700 if (VT == MVT::i16)
61701 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61702 if (VT == MVT::i32 || VT == MVT::f32 ||
61703 (!VT.isVector() && !Subtarget.is64Bit()))
61704 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61705 if (VT != MVT::f80 && !VT.isVector())
61706 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61707 break;
61708 case 'r': // GENERAL_REGS
61709 case 'l': // INDEX_REGS
61710 if (VT == MVT::i8 || VT == MVT::i1)
61711 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61712 ? &X86::GR8RegClass
61713 : &X86::GR8_NOREX2RegClass);
61714 if (VT == MVT::i16)
61715 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61716 ? &X86::GR16RegClass
61717 : &X86::GR16_NOREX2RegClass);
61718 if (VT == MVT::i32 || VT == MVT::f32 ||
61719 (!VT.isVector() && !Subtarget.is64Bit()))
61720 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61721 ? &X86::GR32RegClass
61722 : &X86::GR32_NOREX2RegClass);
61723 if (VT != MVT::f80 && !VT.isVector())
61724 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61725 ? &X86::GR64RegClass
61726 : &X86::GR64_NOREX2RegClass);
61727 break;
61728 case 'R': // LEGACY_REGS
61729 if (VT == MVT::i8 || VT == MVT::i1)
61730 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61731 if (VT == MVT::i16)
61732 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61733 if (VT == MVT::i32 || VT == MVT::f32 ||
61734 (!VT.isVector() && !Subtarget.is64Bit()))
61735 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61736 if (VT != MVT::f80 && !VT.isVector())
61737 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61738 break;
61739 case 'f': // FP Stack registers.
61740 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61741 // value to the correct fpstack register class.
61742 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61743 return std::make_pair(0U, &X86::RFP32RegClass);
61744 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61745 return std::make_pair(0U, &X86::RFP64RegClass);
61746 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61747 return std::make_pair(0U, &X86::RFP80RegClass);
61748 break;
61749 case 'y': // MMX_REGS if MMX allowed.
61750 if (!Subtarget.hasMMX()) break;
61751 return std::make_pair(0U, &X86::VR64RegClass);
61752 case 'v':
61753 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61754 if (!Subtarget.hasSSE1()) break;
61755 bool VConstraint = (Constraint[0] == 'v');
61756
61757 switch (VT.SimpleTy) {
61758 default: break;
61759 // Scalar SSE types.
61760 case MVT::f16:
61761 if (VConstraint && Subtarget.hasFP16())
61762 return std::make_pair(0U, &X86::FR16XRegClass);
61763 break;
61764 case MVT::f32:
61765 case MVT::i32:
61766 if (VConstraint && Subtarget.hasVLX())
61767 return std::make_pair(0U, &X86::FR32XRegClass);
61768 return std::make_pair(0U, &X86::FR32RegClass);
61769 case MVT::f64:
61770 case MVT::i64:
61771 if (VConstraint && Subtarget.hasVLX())
61772 return std::make_pair(0U, &X86::FR64XRegClass);
61773 return std::make_pair(0U, &X86::FR64RegClass);
61774 case MVT::i128:
61775 if (Subtarget.is64Bit()) {
61776 if (VConstraint && Subtarget.hasVLX())
61777 return std::make_pair(0U, &X86::VR128XRegClass);
61778 return std::make_pair(0U, &X86::VR128RegClass);
61779 }
61780 break;
61781 // Vector types and fp128.
61782 case MVT::v8f16:
61783 if (!Subtarget.hasFP16())
61784 break;
61785 if (VConstraint)
61786 return std::make_pair(0U, &X86::VR128XRegClass);
61787 return std::make_pair(0U, &X86::VR128RegClass);
61788 case MVT::v8bf16:
61789 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61790 break;
61791 if (VConstraint)
61792 return std::make_pair(0U, &X86::VR128XRegClass);
61793 return std::make_pair(0U, &X86::VR128RegClass);
61794 case MVT::f128:
61795 if (!Subtarget.is64Bit())
61796 break;
61797 [[fallthrough]];
61798 case MVT::v16i8:
61799 case MVT::v8i16:
61800 case MVT::v4i32:
61801 case MVT::v2i64:
61802 case MVT::v4f32:
61803 case MVT::v2f64:
61804 if (VConstraint && Subtarget.hasVLX())
61805 return std::make_pair(0U, &X86::VR128XRegClass);
61806 return std::make_pair(0U, &X86::VR128RegClass);
61807 // AVX types.
61808 case MVT::v16f16:
61809 if (!Subtarget.hasFP16())
61810 break;
61811 if (VConstraint)
61812 return std::make_pair(0U, &X86::VR256XRegClass);
61813 return std::make_pair(0U, &X86::VR256RegClass);
61814 case MVT::v16bf16:
61815 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61816 break;
61817 if (VConstraint)
61818 return std::make_pair(0U, &X86::VR256XRegClass);
61819 return std::make_pair(0U, &X86::VR256RegClass);
61820 case MVT::v32i8:
61821 case MVT::v16i16:
61822 case MVT::v8i32:
61823 case MVT::v4i64:
61824 case MVT::v8f32:
61825 case MVT::v4f64:
61826 if (VConstraint && Subtarget.hasVLX())
61827 return std::make_pair(0U, &X86::VR256XRegClass);
61828 if (Subtarget.hasAVX())
61829 return std::make_pair(0U, &X86::VR256RegClass);
61830 break;
61831 case MVT::v32f16:
61832 if (!Subtarget.hasFP16())
61833 break;
61834 if (VConstraint)
61835 return std::make_pair(0U, &X86::VR512RegClass);
61836 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61837 case MVT::v32bf16:
61838 if (!Subtarget.hasBF16())
61839 break;
61840 if (VConstraint)
61841 return std::make_pair(0U, &X86::VR512RegClass);
61842 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61843 case MVT::v64i8:
61844 case MVT::v32i16:
61845 case MVT::v8f64:
61846 case MVT::v16f32:
61847 case MVT::v16i32:
61848 case MVT::v8i64:
61849 if (!Subtarget.hasAVX512()) break;
61850 if (VConstraint)
61851 return std::make_pair(0U, &X86::VR512RegClass);
61852 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61853 }
61854 break;
61855 }
61856 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61857 switch (Constraint[1]) {
61858 default:
61859 break;
61860 case 'i':
61861 case 't':
61862 case '2':
61863 return getRegForInlineAsmConstraint(TRI, "x", VT);
61864 case 'm':
61865 if (!Subtarget.hasMMX()) break;
61866 return std::make_pair(0U, &X86::VR64RegClass);
61867 case 'z':
61868 if (!Subtarget.hasSSE1()) break;
61869 switch (VT.SimpleTy) {
61870 default: break;
61871 // Scalar SSE types.
61872 case MVT::f16:
61873 if (!Subtarget.hasFP16())
61874 break;
61875 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61876 case MVT::f32:
61877 case MVT::i32:
61878 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61879 case MVT::f64:
61880 case MVT::i64:
61881 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61882 case MVT::v8f16:
61883 if (!Subtarget.hasFP16())
61884 break;
61885 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61886 case MVT::v8bf16:
61887 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61888 break;
61889 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61890 case MVT::f128:
61891 case MVT::v16i8:
61892 case MVT::v8i16:
61893 case MVT::v4i32:
61894 case MVT::v2i64:
61895 case MVT::v4f32:
61896 case MVT::v2f64:
61897 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61898 // AVX types.
61899 case MVT::v16f16:
61900 if (!Subtarget.hasFP16())
61901 break;
61902 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61903 case MVT::v16bf16:
61904 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61905 break;
61906 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61907 case MVT::v32i8:
61908 case MVT::v16i16:
61909 case MVT::v8i32:
61910 case MVT::v4i64:
61911 case MVT::v8f32:
61912 case MVT::v4f64:
61913 if (Subtarget.hasAVX())
61914 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61915 break;
61916 case MVT::v32f16:
61917 if (!Subtarget.hasFP16())
61918 break;
61919 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61920 case MVT::v32bf16:
61921 if (!Subtarget.hasBF16())
61922 break;
61923 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61924 case MVT::v64i8:
61925 case MVT::v32i16:
61926 case MVT::v8f64:
61927 case MVT::v16f32:
61928 case MVT::v16i32:
61929 case MVT::v8i64:
61930 if (Subtarget.hasAVX512())
61931 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61932 break;
61933 }
61934 break;
61935 case 'k':
61936 // This register class doesn't allocate k0 for masked vector operations.
61937 if (Subtarget.hasAVX512()) {
61938 if (VT == MVT::v1i1 || VT == MVT::i1)
61939 return std::make_pair(0U, &X86::VK1WMRegClass);
61940 if (VT == MVT::v8i1 || VT == MVT::i8)
61941 return std::make_pair(0U, &X86::VK8WMRegClass);
61942 if (VT == MVT::v16i1 || VT == MVT::i16)
61943 return std::make_pair(0U, &X86::VK16WMRegClass);
61944 }
61945 if (Subtarget.hasBWI()) {
61946 if (VT == MVT::v32i1 || VT == MVT::i32)
61947 return std::make_pair(0U, &X86::VK32WMRegClass);
61948 if (VT == MVT::v64i1 || VT == MVT::i64)
61949 return std::make_pair(0U, &X86::VK64WMRegClass);
61950 }
61951 break;
61952 }
61953 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61954 switch (Constraint[1]) {
61955 default:
61956 break;
61957 case 'r':
61958 if (VT == MVT::i8 || VT == MVT::i1)
61959 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61960 if (VT == MVT::i16)
61961 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61962 if (VT == MVT::i32 || VT == MVT::f32)
61963 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61964 if (VT != MVT::f80 && !VT.isVector())
61965 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61966 break;
61967 case 'R':
61968 if (VT == MVT::i8 || VT == MVT::i1)
61969 return std::make_pair(0U, &X86::GR8RegClass);
61970 if (VT == MVT::i16)
61971 return std::make_pair(0U, &X86::GR16RegClass);
61972 if (VT == MVT::i32 || VT == MVT::f32)
61973 return std::make_pair(0U, &X86::GR32RegClass);
61974 if (VT != MVT::f80 && !VT.isVector())
61975 return std::make_pair(0U, &X86::GR64RegClass);
61976 break;
61977 }
61978 }
61979
61980 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61981 return std::make_pair(0U, &X86::GR32RegClass);
61982
61983 // Use the default implementation in TargetLowering to convert the register
61984 // constraint into a member of a register class.
61985 std::pair<Register, const TargetRegisterClass*> Res;
61987
61988 // Not found as a standard register?
61989 if (!Res.second) {
61990 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61991 // to/from f80.
61992 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61993 // Map st(0) through st(7) onto FP0 through FP7.
61994 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61995 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61996 Constraint[3] == '(' &&
61997 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61998 Constraint[5] == ')' && Constraint[6] == '}') {
61999 // st(7) is not allocatable and thus not a member of RFP80. Return a
62000 // singleton class in cases where we have a reference to it.
62001 if (Constraint[4] == '7')
62002 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62003 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62004 &X86::RFP80RegClass);
62005 }
62006
62007 // GCC allows "st(0)" to be called just plain "st".
62008 if (StringRef("{st}").equals_insensitive(Constraint))
62009 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62010 }
62011
62012 // flags -> EFLAGS
62013 if (StringRef("{flags}").equals_insensitive(Constraint))
62014 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62015
62016 // dirflag -> DF
62017 // Only allow for clobber.
62018 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62019 VT == MVT::Other)
62020 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62021
62022 // fpsr -> FPSW
62023 // Only allow for clobber.
62024 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62025 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62026
62027 return Res;
62028 }
62029
62030 // Make sure it isn't a register that requires 64-bit mode.
62031 if (!Subtarget.is64Bit() &&
62032 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62033 TRI->getEncodingValue(Res.first) >= 8) {
62034 // Register requires REX prefix, but we're in 32-bit mode.
62035 return std::make_pair(0, nullptr);
62036 }
62037
62038 // Make sure it isn't a register that requires AVX512.
62039 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62040 TRI->getEncodingValue(Res.first) & 0x10) {
62041 // Register requires EVEX prefix.
62042 return std::make_pair(0, nullptr);
62043 }
62044
62045 // Otherwise, check to see if this is a register class of the wrong value
62046 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
62047 // turn into {ax},{dx}.
62048 // MVT::Other is used to specify clobber names.
62049 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62050 return Res; // Correct type already, nothing to do.
62051
62052 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
62053 // return "eax". This should even work for things like getting 64-bit integer
62054 // registers when given an f64 type.
62055 const TargetRegisterClass *Class = Res.second;
62056 // The generic code will match the first register class that contains the
62057 // given register. Thus, based on the ordering of the tablegened file,
62058 // the "plain" GR classes might not come first.
62059 // Therefore, use a helper method.
62060 if (isGRClass(*Class)) {
62061 unsigned Size = VT.getSizeInBits();
62062 if (Size == 1) Size = 8;
62063 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62064 return std::make_pair(0, nullptr);
62065 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62066 if (DestReg.isValid()) {
62067 bool is64Bit = Subtarget.is64Bit();
62068 const TargetRegisterClass *RC =
62069 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62070 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62071 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62072 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62073 if (Size == 64 && !is64Bit) {
62074 // Model GCC's behavior here and select a fixed pair of 32-bit
62075 // registers.
62076 switch (DestReg) {
62077 case X86::RAX:
62078 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62079 case X86::RDX:
62080 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62081 case X86::RCX:
62082 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62083 case X86::RBX:
62084 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62085 case X86::RSI:
62086 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62087 case X86::RDI:
62088 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62089 case X86::RBP:
62090 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62091 default:
62092 return std::make_pair(0, nullptr);
62093 }
62094 }
62095 if (RC && RC->contains(DestReg))
62096 return std::make_pair(DestReg, RC);
62097 return Res;
62098 }
62099 // No register found/type mismatch.
62100 return std::make_pair(0, nullptr);
62101 } else if (isFRClass(*Class)) {
62102 // Handle references to XMM physical registers that got mapped into the
62103 // wrong class. This can happen with constraints like {xmm0} where the
62104 // target independent register mapper will just pick the first match it can
62105 // find, ignoring the required type.
62106
62107 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62108 if (VT == MVT::f16)
62109 Res.second = &X86::FR16XRegClass;
62110 else if (VT == MVT::f32 || VT == MVT::i32)
62111 Res.second = &X86::FR32XRegClass;
62112 else if (VT == MVT::f64 || VT == MVT::i64)
62113 Res.second = &X86::FR64XRegClass;
62114 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62115 Res.second = &X86::VR128XRegClass;
62116 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62117 Res.second = &X86::VR256XRegClass;
62118 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62119 Res.second = &X86::VR512RegClass;
62120 else {
62121 // Type mismatch and not a clobber: return an error.
62122 Res.first = 0;
62123 Res.second = nullptr;
62124 }
62125 } else if (isVKClass(*Class)) {
62126 if (VT == MVT::v1i1 || VT == MVT::i1)
62127 Res.second = &X86::VK1RegClass;
62128 else if (VT == MVT::v8i1 || VT == MVT::i8)
62129 Res.second = &X86::VK8RegClass;
62130 else if (VT == MVT::v16i1 || VT == MVT::i16)
62131 Res.second = &X86::VK16RegClass;
62132 else if (VT == MVT::v32i1 || VT == MVT::i32)
62133 Res.second = &X86::VK32RegClass;
62134 else if (VT == MVT::v64i1 || VT == MVT::i64)
62135 Res.second = &X86::VK64RegClass;
62136 else {
62137 // Type mismatch and not a clobber: return an error.
62138 Res.first = 0;
62139 Res.second = nullptr;
62140 }
62141 }
62142
62143 return Res;
62144}
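
// Illustrative usage (hypothetical, not from this file): the register-class
// constraints resolved above can be exercised directly from C, e.g. the
// AVX-512 mask-register class selected for 'k':
//
//   unsigned short load_mask(unsigned x) {   // assumes AVX512F
//     unsigned short m;
//     asm("kmovw %1, %0" : "=k"(m) : "r"(x));
//     return m;
//   }
//
// Here MVT::i16 with the 'k' constraint picks VK16RegClass, so %0 is
// rendered as one of %k0..%k7.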
62145
62146bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62147 // Integer division on x86 is expensive. However, when aggressively optimizing
62148 // for code size, we prefer to use a div instruction, as it is usually smaller
62149 // than the alternative sequence.
62150 // The exception to this is vector division. Since x86 doesn't have vector
62151 // integer division, leaving the division as-is is a loss even in terms of
62152 // size, because it will have to be scalarized, while the alternative code
62153 // sequence can be performed in vector form.
62154 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62155 return OptSize && !VT.isVector();
62156}
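
// Rough illustration (hypothetical): for
//
//   unsigned div10(unsigned x) { return x / 10; }
//
// a function marked minsize keeps the actual `div` instruction because its
// encoding is smaller, while the default pipeline expands the division into
// a multiply by a magic constant plus shifts, which is faster but larger.
// Vector divisions are excluded from the minsize exception, as explained
// above.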
62157
62158void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62159 if (!Subtarget.is64Bit())
62160 return;
62161
62162 // Update IsSplitCSR in X86MachineFunctionInfo.
62164 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62165 AFI->setIsSplitCSR(true);
62166}
62167
62168void X86TargetLowering::insertCopiesSplitCSR(
62169 MachineBasicBlock *Entry,
62170 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62171 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62172 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62173 if (!IStart)
62174 return;
62175
62176 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62177 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62178 MachineBasicBlock::iterator MBBI = Entry->begin();
62179 for (const MCPhysReg *I = IStart; *I; ++I) {
62180 const TargetRegisterClass *RC = nullptr;
62181 if (X86::GR64RegClass.contains(*I))
62182 RC = &X86::GR64RegClass;
62183 else
62184 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62185
62186 Register NewVR = MRI->createVirtualRegister(RC);
62187 // Create copy from CSR to a virtual register.
62188 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62189 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62190 // nounwind. If we want to generalize this later, we may need to emit
62191 // CFI pseudo-instructions.
62192 assert(
62193 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62194 "Function should be nounwind in insertCopiesSplitCSR!");
62195 Entry->addLiveIn(*I);
62196 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62197 .addReg(*I);
62198
62199 // Insert the copy-back instructions right before the terminator.
62200 for (auto *Exit : Exits)
62201 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62202 TII->get(TargetOpcode::COPY), *I)
62203 .addReg(NewVR);
62204 }
62205}
62206
62208 return Subtarget.is64Bit();
62209}
62210
62214 const TargetInstrInfo *TII) const {
62215 assert(MBBI->isCall() && MBBI->getCFIType() &&
62216 "Invalid call instruction for a KCFI check");
62217
62218 MachineFunction &MF = *MBB.getParent();
62219 // If the call target is a memory operand, unfold it and use R11 for the
62220 // call, so KCFI_CHECK won't have to recompute the address.
62221 switch (MBBI->getOpcode()) {
62222 case X86::CALL64m:
62223 case X86::CALL64m_NT:
62224 case X86::TAILJMPm64:
62225 case X86::TAILJMPm64_REX: {
62228 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62229 /*UnfoldStore=*/false, NewMIs))
62230 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62231 for (auto *NewMI : NewMIs)
62232 MBBI = MBB.insert(OrigCall, NewMI);
62233 assert(MBBI->isCall() &&
62234 "Unexpected instruction after memory operand unfolding");
62235 if (OrigCall->shouldUpdateAdditionalCallInfo())
62236 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62237 MBBI->setCFIType(MF, OrigCall->getCFIType());
62238 OrigCall->eraseFromParent();
62239 break;
62240 }
62241 default:
62242 break;
62243 }
62244
62245 MachineOperand &Target = MBBI->getOperand(0);
62246 Register TargetReg;
62247 switch (MBBI->getOpcode()) {
62248 case X86::CALL64r:
62249 case X86::CALL64r_ImpCall:
62250 case X86::CALL64r_NT:
62251 case X86::TAILJMPr64:
62252 case X86::TAILJMPr64_REX:
62253 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62254 Target.setIsRenamable(false);
62255 TargetReg = Target.getReg();
62256 break;
62257 case X86::CALL64pcrel32:
62258 case X86::TAILJMPd64:
62259 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62260 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62261 // 64-bit indirect thunk calls.
62262 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62263 "Unexpected register for an indirect thunk call");
62264 TargetReg = X86::R11;
62265 break;
62266 default:
62267 llvm_unreachable("Unexpected CFI call opcode");
62268 break;
62269 }
62270
62271 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62272 .addReg(TargetReg)
62273 .addImm(MBBI->getCFIType())
62274 .getInstr();
62275}
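
// Context sketch (not from this file): under clang's -fsanitize=kcfi an
// indirect call such as
//
//   void (*fp)(int) = ...;
//   fp(42);
//
// is preceded by a KCFI_CHECK comparing a type hash stored just before the
// callee's entry point against the hash expected for `void(int)`. The code
// above only ensures the call target lives in a register (R11 for the
// unfolded memory forms) so the check and the call agree on the address.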
62276
62277/// Returns true if stack probing through a function call is requested.
62281
62282/// Returns true if stack probing through inline assembly is requested.
62284
62285 // No inline stack probe for Windows, they have their own mechanism.
62286 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62287 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62288 return false;
62289
62290 // If the function specifically requests inline stack probes, emit them.
62291 if (MF.getFunction().hasFnAttribute("probe-stack"))
62292 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62293 "inline-asm";
62294
62295 return false;
62296}
62297
62298/// Returns the name of the symbol used to emit stack probes or the empty
62299/// string if not applicable.
62302 // Inline stack probes disable the stack probe call.
62303 if (hasInlineStackProbe(MF))
62304 return "";
62305
62306 // If the function specifically requests stack probes, emit them.
62307 if (MF.getFunction().hasFnAttribute("probe-stack"))
62308 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62309
62310 // Generally, if we aren't on Windows, the platform ABI does not include
62311 // support for stack probes, so don't emit them.
62312 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62313 Subtarget.isTargetMachO() ||
62314 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62315 return "";
62316
62317 // We need a stack probe to conform to the Windows ABI. Choose the right
62318 // symbol.
62319 if (Subtarget.is64Bit())
62320 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62321 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62322}
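
// Illustrative IR (hypothetical): the "probe-stack" function attribute
// drives both hasInlineStackProbe() and the symbol returned here, e.g.
//
//   define void @f() "probe-stack"="inline-asm"    { ... }  ; expand probes inline
//   define void @g() "probe-stack"="__my_probe_fn" { ... }  ; call __my_probe_fn
//
// With no attribute, Windows/UEFI targets fall back to the ___chkstk_ms /
// __chkstk / _alloca / _chkstk symbols chosen above.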
62323
62324unsigned
62326 // The default stack probe size is 4096 if the function has no stackprobesize
62327 // attribute.
62328 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62329 4096);
62330}
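
// Illustrative IR (hypothetical): the probe interval can be overridden per
// function; otherwise 4096 bytes is assumed:
//
//   define void @big_frame() "stack-probe-size"="8192" { ... }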
62331
62333 if (ML && ML->isInnermost() &&
62334 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62337}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.

static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert an i1 subvector into an i1 vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of an ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128 bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256 bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
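A scalar model of why the in-register sign extension makes PACKSS behave as a plain truncation (an illustrative sketch assuming two's-complement wrap on narrowing casts, not the DAG code itself):

#include <algorithm>
#include <cassert>
#include <cstdint>

// PACKSSDW saturates i32 -> i16. If the input is first sign-extended
// in-register from its low 16 bits, every value already lies in
// [-32768, 32767], so the saturation is a no-op and the pack returns
// exactly the truncated value.
static int16_t packss_lane(int32_t v) {
  return (int16_t)std::clamp(v, -32768, 32767);
}

int main() {
  for (int32_t v : {0, 1, -1, 70000, -70000, 0x12345678}) {
    int32_t sext_inreg = (int32_t)(int16_t)v; // sign_extend_inreg from i16
    assert(packss_lane(sext_inreg) == (int16_t)v);
  }
}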
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACKSS.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
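A typical source-level pattern that ends up as a single LOCK-prefixed read-modify-write on x86 (illustrative; the exact instruction chosen depends on the operation and on whether the result is used):

#include <atomic>

// With the result unused, an atomic increment can be lowered to a
// LOCK-prefixed add on the memory operand rather than a cmpxchg loop.
void bump(std::atomic<long> &counter) {
  counter.fetch_add(1, std::memory_order_seq_cst);
}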
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
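A quick scalar check of the bit identity this fold relies on (a sketch; operand order of the final ANDN is left to the real combine): for every x, x ^ -x is the complement of BLSMSK(x) = x ^ (x - 1), so AND(Y, XOR(X, NEG(X))) computes Y & ~BLSMSK(X), which the BMI ANDN/BLSMSK pair can produce without a separate NOT.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 2u, 4u, 0x80000000u, 0xdeadbeefu}) {
    uint32_t blsmsk = x ^ (x - 1); // bits up to and including the lowest set bit
    assert((x ^ (0u - x)) == (uint32_t)~blsmsk);
  }
}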
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
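A scalar model of the PSADBW-against-zero idiom commonly used for horizontal byte sums (a sketch of the underlying idea, not the actual lowering): the sum of absolute differences against an all-zero vector is simply the sum of the bytes in each 8-byte group.

#include <cstdint>

// |b - 0| == b, so PSADBW with a zero operand adds up the eight bytes of each
// group and deposits the result in the low 16 bits of the 64-bit lane.
uint64_t byte_sum_of_group(const uint8_t bytes[8]) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += bytes[i];
  return sum;
}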
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
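A sketch of the 2-bits-per-lane immediate encoding used by PSHUFD/SHUFPS-style 4-lane shuffles (assumed layout: mask element i occupies bits [2*i+1:2*i] of the imm8; the helper above is the real implementation, the function below is illustrative only):

#include <cassert>
#include <cstdint>

static uint8_t encodeV4ShuffleImm(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 4 && "expects a resolved 4-lane mask");
    Imm |= (uint8_t)Mask[i] << (2 * i);
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  assert(encodeV4ShuffleImm(Identity) == 0xE4); // binary 11 10 01 00
  assert(encodeV4ShuffleImm(Reverse) == 0x1B);  // binary 00 01 10 11
}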
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
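The scalar shape of the signed-saturation-then-truncate pattern being detected (illustrative only; the detector matches the equivalent smax/smin/truncate nodes):

#include <algorithm>
#include <cstdint>

// smin(smax(x, INT16_MIN), INT16_MAX) followed by truncation to i16.
int16_t trunc_ssat_i32_to_i16(int32_t x) {
  return (int16_t)std::clamp(x, (int32_t)INT16_MIN, (int32_t)INT16_MAX);
}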
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit a truncating store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle that inserts the low element of the specified vector into a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
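To see why that promotion is sound, a small scalar check (an illustration in plain C++, not the DAG combine itself): when the narrow add cannot wrap (nsw/nuw), extending after the add gives the same value as adding the extended operands.

#include <cassert>
#include <cstdint>

int main() {
  // nsw case: 100 + 5 fits in i8, so sext-after-add == add-of-sext.
  int8_t X = 100, C = 5;
  int32_t SextAfterAdd = static_cast<int32_t>(static_cast<int8_t>(X + C));
  int32_t AddOfSext    = static_cast<int32_t>(X) + static_cast<int32_t>(C);
  assert(SextAfterAdd == AddOfSext);

  // nuw case with zero extension: 200 + 40 fits in u8.
  uint8_t UX = 200, UC = 40;
  uint32_t ZextAfterAdd = static_cast<uint32_t>(static_cast<uint8_t>(UX + UC));
  uint32_t AddOfZext    = static_cast<uint32_t>(UX) + static_cast<uint32_t>(UC);
  assert(ZextAfterAdd == AddOfZext);
  return 0;
}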
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
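For reference, the same rounding rule (nearest, ties away from zero) is what std::round implements in standard C++; this is only a behavioral illustration, not part of the lowering code.

#include <cassert>
#include <cmath>

int main() {
  // Ties round away from zero, matching the ISD::FROUND definition above.
  assert(std::round(0.5)  == 1.0);
  assert(std::round(-0.5) == -1.0);
  assert(std::round(2.5)  == 3.0);
  assert(std::round(-2.5) == -3.0);
  assert(std::round(2.4)  == 2.0);   // non-ties round to nearest as usual
  return 0;
}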
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
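The scalar shape of that pattern can be sketched in plain C++ (truncUSat8 is a hypothetical helper; the real routine matches DAG nodes rather than computing values): clamp to the destination's maximum, then truncate.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Truncation with unsigned saturation: clamp to the destination's max value,
// then truncate. For i32 -> i8 the clamp bound is 255.
static uint8_t truncUSat8(uint32_t X) {
  return static_cast<uint8_t>(std::min<uint32_t>(X, 255u));
}

int main() {
  assert(truncUSat8(42)   == 42);
  assert(truncUSat8(255)  == 255);
  assert(truncUSat8(1000) == 255);  // saturates instead of wrapping
  return 0;
}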
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
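The fold rests on a simple boolean identity; a scalar illustration (plain C++, not the DAG code):

#include <cassert>

int main() {
  for (int X = -2; X <= 2; ++X)
    for (int Y = -2; Y <= 2; ++Y) {
      // xor(setcc(x == y), 1) equals setcc with the inverted condition.
      int Lhs = static_cast<int>(X == Y) ^ 1;
      int Rhs = static_cast<int>(X != Y);
      assert(Lhs == Rhs);
    }
  return 0;
}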
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
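A generic sketch of that split-and-concatenate strategy over a plain std::vector (splitUnarySketch is a hypothetical helper; the real routine works on SDValues and EVTs):

#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

// Apply a unary op to each half of the input separately, then concatenate.
static std::vector<uint32_t>
splitUnarySketch(const std::vector<uint32_t> &In,
                 const std::function<std::vector<uint32_t>(std::vector<uint32_t>)> &Op) {
  size_t Half = In.size() / 2;
  std::vector<uint32_t> Lo(In.begin(), In.begin() + Half);
  std::vector<uint32_t> Hi(In.begin() + Half, In.end());
  std::vector<uint32_t> Result = Op(Lo);
  std::vector<uint32_t> HiRes = Op(Hi);
  Result.insert(Result.end(), HiRes.begin(), HiRes.end());
  return Result;
}

int main() {
  auto Negate = [](std::vector<uint32_t> V) {
    for (uint32_t &E : V) E = 0u - E;   // element-wise unary op
    return V;
  };
  std::vector<uint32_t> In = {1, 2, 3, 4};
  // Same result as the full-width op, computed as two half-width ops.
  assert(splitUnarySketch(In, Negate) == Negate(In));
  return 0;
}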
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
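A standalone sketch of that check (plain C++; isAddSubOrSubAddMaskSketch is a hypothetical name, and like the in-tree version it also reports whether operand 0 supplies the even lanes):

#include <cassert>
#include <vector>

// For a mask of N elements over two N-element inputs, require that element i
// reads lane i of one of the inputs, and that each input sticks to one parity.
static bool isAddSubOrSubAddMaskSketch(const std::vector<int> &Mask, bool &Op0Even) {
  int Op0Parity = -1, Op1Parity = -1;
  int N = static_cast<int>(Mask.size());
  for (int i = 0; i < N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                    // undef lane: no constraint
    if (M != i && M != i + N)
      return false;                // must read lane i of one of the two inputs
    int &Parity = (M < N) ? Op0Parity : Op1Parity;
    if (Parity == -1)
      Parity = i % 2;
    else if (Parity != i % 2)
      return false;                // each input must stay on one parity
  }
  if (Op0Parity != -1 && Op1Parity != -1 && Op0Parity == Op1Parity)
    return false;                  // the two inputs must alternate
  Op0Even = (Op0Parity != 1);      // default to "even" if operand 0 is unused
  return true;
}

int main() {
  bool Op0Even = false;
  // {0, 5, 2, 7}: even lanes from V1, odd lanes from V2 -> alternating.
  assert(isAddSubOrSubAddMaskSketch({0, 5, 2, 7}, Op0Even) && Op0Even);
  // {4, 1, 6, 3}: even lanes from V2, odd lanes from V1 -> also alternating.
  assert(isAddSubOrSubAddMaskSketch({4, 1, 6, 3}, Op0Even) && !Op0Even);
  // {0, 1, 2, 3}: everything from V1 -> not an add/sub interleave.
  assert(!isAddSubOrSubAddMaskSketch({0, 1, 2, 3}, Op0Even));
  return 0;
}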
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
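The per-element effect can be sketched in scalar C++ (illustration only): masking to the low byte first guarantees the value is in range, so the unsigned-saturating pack degenerates to a plain truncation.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint16_t X = 0xABCD;
  // Step 1: in-register zero extension, i.e. clear the bits above the low byte.
  uint16_t Masked = X & 0x00FF;                              // 0x00CD
  // Step 2: PACKUS-style unsigned saturation to 8 bits. After the AND the
  // value already fits, so this is just a truncate.
  uint8_t Packed = static_cast<uint8_t>(std::min<uint16_t>(Masked, 0xFF));
  assert(Packed == 0xCD);
  return 0;
}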
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from a PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
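The rewrite is a pure Boolean identity (De Morgan applied to the inner term); an exhaustive one-bit check as an illustration:

#include <cassert>

int main() {
  for (int X = 0; X <= 1; ++X)
    for (int Y = 0; Y <= 1; ++Y)
      for (int Z = 0; Z <= 1; ++Z) {
        int Before = X & (Y | (~Z & 1));          // and X, (or Y, ~Z)
        int After  = X & (~((~Y & 1) & Z) & 1);   // and X, ~(and ~Y, Z)
        assert(Before == After);
      }
  return 0;
}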
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
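The underlying algebraic fact is that an add distributes over a select of constants; a scalar illustration in plain C++ (not the DAG combine itself):

#include <cassert>

int main() {
  const int C1 = 7, C2 = -3;
  for (int Cond = 0; Cond <= 1; ++Cond)
    for (int X = -2; X <= 2; ++X) {
      int Before = (Cond ? C1 : C2) + X;       // add (cmov C1, C2), X
      int After  = Cond ? (C1 + X) : (C2 + X); // cmov (C1 + X), (C2 + X)
      assert(Before == After);
    }
  return 0;
}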
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
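A small usage sketch of a few of the APInt bit-manipulation helpers listed here; the calls are the documented ones, while the specific widths and values are only illustrative (building it requires the LLVM headers).

#include "llvm/ADT/APInt.h"
#include <cassert>

using llvm::APInt;

int main() {
  APInt V = APInt::getLowBitsSet(32, 8);      // 0x000000FF
  assert(V.getZExtValue() == 0xFFu);

  V.setBits(16, 24);                          // also set bits [16, 24)
  assert(V.getZExtValue() == 0x00FF00FFu);

  APInt Middle = V.extractBits(16, 8);        // bits [8, 24) as a 16-bit value
  assert(Middle.getZExtValue() == 0xFF00u);

  assert(V.countr_zero() == 0);               // lowest bit is set
  assert(V.popcount() == 16);                 // sixteen bits set in total
  return 0;
}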
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or an Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
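These MachineInstrBuilder helpers are normally chained off BuildMI; the sketch below is illustrative only, with the opcode, registers and immediate chosen as placeholders rather than taken from this file (MBB, MBBI, DL and TII are assumed to be in scope).
// Sketch: emit "DstReg = ADD32ri SrcReg, 8" at the insertion point MBBI.
BuildMI(*MBB, MBBI, DL, TII->get(X86::ADD32ri), DstReg)
    .addReg(SrcReg)
    .addImm(8);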
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
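A hedged sketch of allocating a MachineMemOperand with the getMachineMemOperand overload listed earlier; the stack slot FI, the 32-bit LLT and the alignment are assumptions for illustration.
// Sketch: describe a simple 32-bit store to stack slot FI.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getFixedStack(MF, FI),
    MachineMemOperand::MOStore, LLT::scalar(32), Align(4));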
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
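When a combine matches one of these masked memory nodes, the accessors above are typically queried along the lines of this sketch; N is an assumed SDNode pointer and the early-exit conditions are illustrative, not a rule from this file.
// Sketch: only handle a plain, unindexed, non-truncating,
// non-compressing masked store.
if (auto *MS = dyn_cast<MaskedStoreSDNode>(N)) {
  if (MS->isTruncatingStore() || MS->isCompressingStore() ||
      MS->getAddressingMode() != ISD::UNINDEXED)
    return SDValue();
  SDValue Mask = MS->getMask();
  SDValue StoredVal = MS->getValue();
  SDValue Ptr = MS->getBasePtr();
  // ... rewrite using Mask, StoredVal and Ptr ...
}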
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
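A hedged example of how the SDValue/SDNode accessors above are combined when matching a pattern; Op is an assumed SDValue and the pattern itself is illustrative.
// Sketch: match a single-use i32 "(add X, X)".
if (Op.getOpcode() == ISD::ADD && Op.getValueType() == MVT::i32 &&
    Op.getOperand(0) == Op.getOperand(1) && Op.getNode()->hasOneUse()) {
  SDValue X = Op.getOperand(0);
  // ... fold to a shift-by-one of X, for example ...
}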
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
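A hedged sketch of the node-building style these SelectionDAG helpers support; the bitcast/OR sequence is purely illustrative and not a transform from this file (DAG, DL and V are assumed to be in scope).
// Sketch: view a v4f32 value as integers, OR in a splat of 1, and
// bitcast the result back.
SDValue AsInt = DAG.getBitcast(MVT::v4i32, V);
SDValue SplatOne = DAG.getSplatBuildVector(
    MVT::v4i32, DL, DAG.getConstant(1, DL, MVT::i32));
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i32, AsInt, SplatOne);
SDValue Result = DAG.getBitcast(MVT::v4f32, Or);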
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
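A small hedged example of the StringRef predicates and the StringSwitch pattern above, in the style used for name parsing; the helper function and the mappings are invented.
// Sketch: map a register-name-like string to an illustrative code.
static unsigned classifyName(StringRef Name) {
  if (Name.empty() || Name.ends_with("h"))
    return 0;
  if (Name.equals_insensitive("flags"))
    return 3;
  return StringSwitch<unsigned>(Name)
      .Case("eax", 1)
      .Case("ebx", 2)
      .Default(0);
}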
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
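These configuration hooks are the kind of calls a target makes from its TargetLowering constructor; the sketch below uses illustrative register classes, opcodes and actions rather than the actual X86 configuration.
// Constructor-time sketch; the choices are examples, not X86's real setup.
addRegisterClass(MVT::i32, &X86::GR32RegClass);
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i1, Promote);
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
computeRegisterProperties(Subtarget.getRegisterInfo());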
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
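A hedged sketch of the SimplifyDemandedBits pattern used by many DAG combines; N, Op, DAG, DCI and TLI are assumed from an enclosing combine function, and the 8-bit demanded mask is an arbitrary example.
// Sketch: try to simplify Op given that only its low 8 bits matter.
KnownBits Known;
APInt DemandedBits = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(), 8);
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                      !DCI.isBeforeLegalizeOps());
if (TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO)) {
  DCI.CommitTargetLoweringOpt(TLO);
  return SDValue(N, 0);
}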
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
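A hedged example of the IR Type queries listed above; the helper and its notion of a "narrow scalar" are invented for illustration.
// Sketch: classify a type as a narrow (<= 32-bit) scalar.
static bool isNarrowScalar(Type *Ty) {
  if (Ty->isVectorTy())
    return false;
  return Ty->isHalfTy() || Ty->isFloatTy() ||
         (Ty->isIntegerTy() && Ty->getScalarSizeInBits() <= 32);
}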
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
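A hedged sketch of how these X86Subtarget feature predicates gate lowering decisions; the particular width-selection policy below is illustrative, not a rule taken from this file.
// Sketch: pick a preferred vector width from available feature sets.
static unsigned pickVectorWidth(const X86Subtarget &Subtarget) {
  if (Subtarget.useAVX512Regs() && Subtarget.getPreferVectorWidth() >= 512)
    return 512;
  if (Subtarget.hasAVX2() || Subtarget.hasAVX())
    return 256;
  return Subtarget.hasSSE2() ? 128 : 64;
}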
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
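A minimal sketch (not code from this file) of how the two condition-code helpers above are typically composed: given the condition code and operand type of a setcc, compute the code that tests !(X op Y) with the operands presented in (Y, X) order. The helper name and include paths are assumptions.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"

// Illustrative helper, not part of X86ISelLowering.cpp.
static llvm::ISD::CondCode invertAndSwapSetCC(llvm::ISD::CondCode CC,
                                              llvm::EVT OpVT) {
  // First form the condition that tests !(X op Y) ...
  llvm::ISD::CondCode Inv = llvm::ISD::getSetCCInverse(CC, OpVT);
  // ... then the equivalent condition with the operands swapped to (Y, X).
  return llvm::ISD::getSetCCSwappedOperands(Inv);
}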
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
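A minimal sketch of how the IR-level matchers listed above combine (an illustration, not code from this file): recognize (X & Y) + X with the add operands in either order, capturing X and Y. The function name and the specific pattern are illustrative.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool matchMaskedAddOfSameValue(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X = nullptr, *Y = nullptr;
  // m_c_Add tries both operand orders; m_Deferred(X) re-checks the value
  // captured by m_Value(X) inside the AND.
  return match(V, m_c_Add(m_And(m_Value(X), m_Value(Y)), m_Deferred(X)));
}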
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
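A minimal sketch of the SelectionDAG matchers listed above (an illustration, not code from this file): recognize setcc(X, Y, ne) and capture both operands. It assumes DAG-combine style code where the SelectionDAG is in scope, and that the SDValue-capturing m_Value overload from the same SDPatternMatch facility is available.

#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"

static bool matchSetCCNE(llvm::SDNode *N, const llvm::SelectionDAG &DAG,
                         llvm::SDValue &X, llvm::SDValue &Y) {
  using namespace llvm::SDPatternMatch;
  // Matches a SETCC with any LHS/RHS (captured) and the specific SETNE code.
  return sd_match(N, &DAG,
                  m_SetCC(m_Value(X), m_Value(Y),
                          m_SpecificCondCode(llvm::ISD::SETNE)));
}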
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeros out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
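A minimal, standalone sketch of the range helpers indexed here (all_of, count_if, enumerate) applied to a shuffle-mask-style vector; the values and helper name are illustrative.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

static void rangeHelperSketch() {
  llvm::SmallVector<int, 4> Mask = {0, 2, 4, 6};
  assert(llvm::all_of(Mask, [](int M) { return M % 2 == 0; }));    // every element even
  assert(llvm::count_if(Mask, [](int M) { return M >= 4; }) == 2); // two elements >= 4
  for (const auto &En : llvm::enumerate(Mask))
    assert(En.value() == int(En.index()) * 2); // each element paired with its index
}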
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
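A minimal sketch of driving one of the shuffle-mask decoders indexed here; the include path is an assumption (the decoder lives with the X86 MCTargetDesc shuffle-decode helpers). For BLEND, bit i of the immediate selects element i from the second operand, so a 4-element immediate of 0b0101 decodes to {4, 1, 6, 3}, where second-operand lanes are numbered NumElts + i.

#include "llvm/ADT/SmallVector.h"
#include "MCTargetDesc/X86ShuffleDecode.h" // assumed include path

static void decodeBlendSketch() {
  llvm::SmallVector<int, 4> Mask;
  llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Mask now holds {4, 1, 6, 3}.
}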
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
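A minimal, standalone sketch exercising a few of the bit/math helpers indexed here; the constants and helper name are illustrative.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void bitHelperSketch() {
  uint64_t V = 0x40;                                    // 64 == 2^6
  assert(llvm::isPowerOf2_64(V));                       // power of two > 0
  assert(llvm::Log2_64(V) == 6);                        // floor log2
  assert(llvm::countr_zero(V) == 6);                    // trailing zero count
  assert(llvm::isInt<8>(-128) && !llvm::isInt<8>(128)); // signed 8-bit fit check
  assert(llvm::Hi_32(0x1234567800000000ULL) == 0x12345678u);
}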
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
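A minimal sketch of the EH-personality helpers indexed here (the wrapper name is illustrative): classify a function's personality routine and ask whether it is funclet-based, which changes how landing pads are lowered.

#include "llvm/IR/EHPersonalities.h"
#include "llvm/IR/Function.h"

static bool usesFuncletBasedEH(const llvm::Function &F) {
  // Classify the personality routine (if any), then test for funclet-based EH.
  llvm::EHPersonality Pers =
      F.hasPersonalityFn() ? llvm::classifyEHPersonality(F.getPersonalityFn())
                           : llvm::EHPersonality::Unknown;
  return llvm::isFuncletEHPersonality(Pers);
}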
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from an machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- that is, a dereference of an address in a register, with no scale, index or displacement.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
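A small sketch of the alignment arithmetic performed by commonAlignment; the values are illustrative.
#include "llvm/Support/Alignment.h"

static llvm::Align alignExample() {
  llvm::Align Base(16);
  // A 16-byte aligned base plus a byte offset of 4 is only 4-byte aligned.
  return llvm::commonAlignment(Base, /*Offset=*/4); // Align(4)
}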
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
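A one-line sketch of llvm::seq, commonly used to iterate vector element indices in lowering code.
#include "llvm/ADT/Sequence.h"

static int seqExample() {
  int Sum = 0;
  for (int I : llvm::seq<int>(0, 4)) // visits 0, 1, 2, 3; 4 is excluded
    Sum += I;
  return Sum; // 6
}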
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement.
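A hedged sketch of the X86InstrBuilder addressing helpers; the block, opcode description, and registers are assumed to come from the caller, and the function name is illustrative.
#include "X86InstrBuilder.h"                    // in-tree header (assumed path)
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

static void addrExample(llvm::MachineBasicBlock &MBB,
                        llvm::MachineBasicBlock::iterator I,
                        const llvm::DebugLoc &DL, const llvm::MCInstrDesc &MCID,
                        llvm::Register Dst, llvm::Register Base) {
  // Emit "Dst = <opcode> [Base + 8]" by appending the five X86 memory
  // operands (base, scale, index, displacement, segment).
  llvm::addRegOffset(llvm::BuildMI(MBB, I, DL, MCID, Dst), Base,
                     /*isKill=*/false, /*Offset=*/8);
}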
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
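A sketch of two of the bit utilities listed above (llvm::bit_floor and llvm::popcount) from llvm/ADT/bit.h; the values are illustrative.
#include "llvm/ADT/bit.h"
#include <cassert>

static void bitExample() {
  assert(llvm::bit_floor(100u) == 64u); // largest power of two <= 100
  assert(llvm::popcount(0xF0u) == 4);   // four bits are set
}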
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
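A sketch of the splat-index query; the mask is made up, and the declaration is assumed to come from llvm/Analysis/VectorUtils.h.
#include "llvm/Analysis/VectorUtils.h"

static int splatExample() {
  int Mask[] = {2, -1, 2, 2}; // -1 entries are "undef" and are ignored
  // Every defined element selects index 2, so 2 is returned.
  return llvm::getSplatIndex(Mask);
}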
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
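A sketch tying together the float semantics, rounding modes, and opStatus entries above; 1.5 is chosen because it converts exactly between the formats.
#include "llvm/ADT/APFloat.h"

static void apfloatExample() {
  using namespace llvm;
  APFloat F(APFloat::IEEEdouble(), "1.5");
  bool LosesInfo = false;
  APFloat::opStatus S =
      F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
                &LosesInfo);
  // 1.5 is exactly representable as a float, so S == APFloat::opOK and
  // LosesInfo stays false.
  (void)S;
}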
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
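A short sketch exercising several of the EVT helpers above; the LLVMContext is assumed to be supplied by the caller, and the function name is illustrative.
#include "llvm/CodeGen/ValueTypes.h"

static void evtExample(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  EVT I32 = EVT::getIntegerVT(Ctx, 32);
  EVT V4I32 = EVT::getVectorVT(Ctx, I32, 4);
  bool Is128 = V4I32.is128BitVector();                       // true: 4 x 32 bits
  EVT V2I32 = V4I32.getHalfNumVectorElementsVT(Ctx);         // half the elements
  EVT V4F32 = V4I32.changeVectorElementType(EVT(MVT::f32));  // 4 x f32
  (void)Is128; (void)V2I32; (void)V4F32;
}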
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
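A sketch of combining KnownBits for an addition with computeForAddSub; the bit patterns are made up for illustration.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static void knownBitsExample() {
  using namespace llvm;
  KnownBits L = KnownBits::makeConstant(APInt(8, 0x10)); // fully known 0x10
  KnownBits R(8);
  R.Zero.setHighBits(4); // high nibble of R known zero, low nibble unknown
  KnownBits Sum = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
                                              /*NUW=*/false, L, R);
  // The sum is in [0x10, 0x1F], so the top three bits are known zero and
  // bit 4 is known one; the low four bits stay unknown.
  (void)Sum.countMinLeadingZeros(); // 3
}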
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
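A sketch of constructing pointer info for a fixed stack slot; MF and FI are assumed to come from the surrounding lowering code.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachinePointerInfo stackSlotInfo(llvm::MachineFunction &MF,
                                              int FI) {
  // Describe the slot itself, then the word 4 bytes into it.
  llvm::MachinePointerInfo PI =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return PI.getWithOffset(4);
}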
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
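A hedged sketch of the builder-style call setup shown above, as used when lowering an operation to a runtime library call; every parameter (and the function name) is assumed to be provided by the caller for illustration.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"

static llvm::SDValue lowerToLibcall(const llvm::TargetLowering &TLI,
                                    llvm::SelectionDAG &DAG,
                                    const llvm::SDLoc &DL, llvm::SDValue Chain,
                                    llvm::SDValue Callee, llvm::Type *RetTy,
                                    llvm::TargetLowering::ArgListTy &&Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      llvm::CallingConv::C, RetTy, Callee, std::move(Args));
  // LowerCallTo returns {call result, output chain}; return the result value.
  return TLI.LowerCallTo(CLI).first;
}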
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.