1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
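// Editor's note: illustrative only. These cl::opt flags are internal backend
// options; assuming a plain llc invocation (not taken from this file), they
// would be set on the command line like:
//   llc -mtriple=x86_64-- -x86-br-merging-base-cost=3 \
//       -x86-experimental-pref-innermost-loop-alignment=5 input.ll
// The parsed values are then read through the variables declared above.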
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
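// Editor's note: a rough sketch of what the divide bypass above means, using
// assumed IR rather than anything generated here. For a 64-bit udiv on a CPU
// with slow 64-bit divides:
//   %q = udiv i64 %a, %b
// is rewritten as a runtime check of whether both operands fit in 32 bits;
// if so, a 32-bit divide (plus zero-extend) is used, otherwise the original
// 64-bit divide executes on the slow path.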
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
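// Editor's note: illustrative mapping only. On targets where SHLD/SHRD are
// fast, an i32/i64 funnel shift such as (fshl %hi, %lo, %amt) can be selected
// directly to shld/shrd; on isSHLDSlow() targets the Custom action expands it
// to individual shifts plus an OR, unless we are optimizing for size.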
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
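// Editor's note: a small example of the CSE benefit described above, with
// assumed IR. Given
//   %q = sdiv i32 %x, %y
//   %r = srem i32 %x, %y
// both results are taken from one idiv, which leaves the quotient in EAX and
// the remainder in EDX, so only a single divide instruction is emitted.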
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
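// Editor's note: rough illustration of the i16 promotion above. A cttz of an
// i16 value is counted as i32, with a bit ORed in above position 15 so that a
// zero input still yields 16, e.g. (approximate asm):
//   movzwl %di, %eax
//   orl    $0x10000, %eax
//   tzcntl %eax, %eax
// tzcnt itself is encoded as a REP-prefixed bsf, which is why promoting the
// BSF form is worthwhile as well.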
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
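// Editor's note: sketch of the effect described above, with assumed IR. With
// AVX on x86-64,
//   %v = load atomic i128, ptr %p acquire, align 16
// can be lowered to a single 16-byte vmovaps/vmovdqa instead of a cmpxchg16b
// loop, relying on aligned 128-bit vector loads/stores being atomic on all
// AVX-capable CPUs (the assumption stated in the comment above).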
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
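// Editor's note: context, not taken from this file. Under optsize,
// (fpext (load float)) is matched to a single memory-operand cvtss2sd; in the
// general case the load is kept separate (movss + cvtss2sd), which is why the
// f32->f64 extload itself is left Expand here.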
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
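// Editor's note: example of the widening described above, with assumed IR.
//   %v = load <2 x i32>, ptr %p
// becomes a single 8-byte load into an XMM register (e.g. movq/movsd) on
// x86-64, and two 32-bit loads on 32-bit targets; the stores marked Custom
// here are handled symmetrically.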
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
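// Editor's note: illustration of the extending loads marked Legal above,
// using assumed IR:
//   %w = load <4 x i8>, ptr %p
//   %z = zext <4 x i8> %w to <4 x i32>
// folds into a single "pmovzxbd (%rdi), %xmm0" on SSE4.1 targets, with the
// pmovsx* forms covering the sign-extending loads.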
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
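// Editor's note: sketch of the promotion above. An fptosi of <8 x float> to
// <8 x i16> is performed as a v8f32->v8i32 cvttps2dq followed by truncating
// the v8i32 result down to v8i16, since there is no direct f32->i16 vector
// conversion instruction.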
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
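// Editor's note: the attributes mentioned above are IR function attributes,
// e.g. (illustrative):
//   define void @f() "prefer-vector-width"="256" { ... }
// which normally makes useAVX512Regs() return false for that function, so the
// 512-bit block below is skipped even when AVX-512 is available, unless the
// required vector width (per the comment above) forces 512-bit types back on.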
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2421 // Set the operation action Custom to do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64-bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We cannot replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into 32 bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative
2932 // half of the 32-bit address space, so we must not accept negative offsets,
2933 // since they may be just out of range, but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For other non-large code models we assume that the latest small object ends
2938 // 16MB before the end of the 31-bit boundary. We may also accept pretty large
2939 // negative constants, knowing that all objects are in the positive half of the
2940 // address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
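// Illustrative examples (added for clarity, not from the source): with a
// symbolic displacement and an offset that already fits in 32 bits,
//   CodeModel::Large:  any int32 offset is accepted.
//   CodeModel::Kernel: Offset = -8 is rejected, Offset = 4096 is accepted.
//   CodeModel::Small:  Offset = 8 MiB is accepted, Offset = 20 MiB is rejected.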
2943
2944 /// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
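// Illustrative example (added for clarity, not from the source): for a
// floating point 'setolt x, y', the first switch above swaps the operands and
// the second switch returns X86::COND_A, i.e. the compare is emitted as
// "y > x" using the unordered flag layout shown in the table.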
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066 /// The current x86 ISA includes the following FP cmov instructions:
3067 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocations must target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3265 // those uses are extracted directly into a store (so the extract + store
3266 // can be store-folded), or (4) any use is used by a legal full-width
3267 // instruction, then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full-width legal/target bin op, then assume it's legal
3291 // and the load won't be split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then splitting the
3302 // load is only worthwhile if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
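// Illustrative examples (added for clarity, not from the source), assuming the
// vector multiply itself is not cheap on the target:
//   MulC ==  9: MulC - 1 ==  8 is a power of 2 -> decompose as (x << 3) + x
//   MulC ==  7: MulC + 1 ==  8 is a power of 2 -> decompose as (x << 3) - x
//   MulC == -3: 1 - MulC ==  4 is a power of 2 -> decompose as x - (x << 2)
//   MulC == 22: none of MulC+1, MulC-1, 1-MulC, -(MulC+1) is a power of 2,
//               so the multiply is kept.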
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bytes) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // For vectors we don't really get much benefit from swapping constants
3577 // around. Maybe in the future we could check if the DAG already has the
3578 // flipped node.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has an imm64 mask, then the inverse will have
3585 // at least an imm32 mask (or be a zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl by 1, 2 or 3, as those can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
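// Illustrative trace (added for clarity, not from the source): for a scalar
// i32 shl-by-8 on a target without BMI2, MaskBits == 24, which is not a zext
// mask width (8/16/32), so PreferRotate is set and ISD::ROTL is returned
// whenever the rotate transform is allowed.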
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3751 /// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756 /// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767 /// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773 /// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780 /// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786 /// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792 /// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
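// Illustrative example (added for clarity, not from the source): with 4
// elements, { 0, 5, 2, 7 } is a blend (each lane i selects either i or
// i + 4), while { 1, 5, 2, 7 } is not, because lane 0 picks element 1.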
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
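// Illustrative example (added for clarity, not from the source): with
// Mask = { 4, 5, -1, 7 }, Pos = 0, Size = 4 and Low = 4 the mask matches the
// sequence 4, 5, 6, 7 (the undef lane is ignored), so this returns true.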
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815/// sequential range (Low, Low+Size], or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
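// Illustrative example (added for clarity, not from the source): for 4
// elements, { 3, 1, 0, 2 } references every input element and the helper
// above returns true, while { 0, 0, 1, 2 } never references element 3 and
// returns false.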
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, its trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
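// Illustrative worked examples (added for clarity, not from the source),
// assuming the usual sentinel encoding of undef as -1 and zero as -2:
//   { 0, 1, 6, 7 }   -> widened to { 0, 3 }
//   { -1, 5, 2, 3 }  -> widened to { 2, 1 }   (undef pairs with odd lane 5)
//   { -2, -1, 2, 3 } -> widened to { -2, 1 }  (zero spreads across the pair)
//   { 1, 2, 3, 0 }   -> fails, the pair (1, 2) is not 2-element aligned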
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
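// e.g. <0,1,2,3> scaled to 8 elements becomes <0,1,2,3,4,5,6,7>, while
// <0,1,2,3,4,5,6,7> scaled to 2 elements is widened (twice) down to <0,1>.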
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This differs from scaleShuffleElements, which keeps the total vector size the same.
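// e.g. growing the v4 mask <0, 5, 2, 7> from 128 to 256 bits (Scale = 2) gives
// <0, 9, 2, 11, -1, -1, -1, -1>: elements of the second operand now start at 8.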
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963  assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3987  SmallVector<SDValue, 32> Ops;
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018  SmallVector<SDValue, 32> Ops;
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071             TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If we allow commuting they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123  // we want. Since ElemsPerChunk is a power of 2 we just clear the low bits.
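  // e.g. extracting 128 bits from a v8i32 source gives ElemsPerChunk = 4, so an
  // IdxVal of 5 is rounded down to 4 (the start of the second 128-bit chunk).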
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148  assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166  // Inserting UNDEF just returns Result unchanged.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176  // we want. Since ElemsPerChunk is a power of 2 we just clear the low bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
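/// e.g. v4i1 widens to v8i1 when DQI is available and to v16i1 otherwise;
/// v8i1 is kept as-is only when DQI is available.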
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319    SmallVector<SDValue> SrcOps;
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359  SmallVector<SDValue> Ops;
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371  SmallVector<SDValue, 4> SubOps;
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383  // If this is a splat value (with no undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398  // Extract the Lo/Hi vectors for each operand.
4399  SmallVector<SDValue, 2> LoOps(NumOps, SDValue());
4400  SmallVector<SDValue, 2> HiOps(NumOps, SDValue());
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break a unary integer operation into 2 half-sized ops and then
4418/// concatenate the results back together.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half-sized ops and then
4434/// concatenate the results back together.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG &G, SDLoc DL, ArrayRef<SDValue> Ops)
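// e.g. with VT = v64i8 on an AVX2-only target the operands are split into two
// 256-bit halves, Builder is applied to each half, and the results are
// concatenated back together.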
4452template <typename F>
4453SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true,
4456 bool AllowAVX512 = true) {
4457 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4458 unsigned NumSubs = 1;
4459 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4460 (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
4461 if (VT.getSizeInBits() > 512) {
4462 NumSubs = VT.getSizeInBits() / 512;
4463 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4464 }
4465 } else if (Subtarget.hasAVX2()) {
4466 if (VT.getSizeInBits() > 256) {
4467 NumSubs = VT.getSizeInBits() / 256;
4468 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4469 }
4470 } else {
4471 if (VT.getSizeInBits() > 128) {
4472 NumSubs = VT.getSizeInBits() / 128;
4473 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4474 }
4475 }
4476
4477 if (NumSubs == 1)
4478 return Builder(DAG, DL, Ops);
4479
4480  SmallVector<SDValue, 4> Subs;
4481 for (unsigned i = 0; i != NumSubs; ++i) {
4482    SmallVector<SDValue, 2> SubOps;
4483 for (SDValue Op : Ops) {
4484 EVT OpVT = Op.getValueType();
4485 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4486 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4487 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4488 }
4489 Subs.push_back(Builder(DAG, DL, SubOps));
4490 }
4491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4492}
4493
4494// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4495// targets.
4496static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4497                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4498 const X86Subtarget &Subtarget) {
4499 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4500 MVT SVT = VT.getScalarType();
4501
4502 // If we have a 32/64 splatted constant, splat it to DstTy to
4503 // encourage a foldable broadcast'd operand.
4504 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4505 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4506 // AVX512 broadcasts 32/64-bit operands.
4507 // TODO: Support float once getAVX512Node is used by fp-ops.
4508 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4510 return SDValue();
4511 // If we're not widening, don't bother if we're not bitcasting.
4512 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4513 return SDValue();
4514    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4515 APInt SplatValue, SplatUndef;
4516 unsigned SplatBitSize;
4517 bool HasAnyUndefs;
4518 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4519 HasAnyUndefs, OpEltSizeInBits) &&
4520 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4521 return DAG.getConstant(SplatValue, DL, DstVT);
4522 }
4523 return SDValue();
4524 };
4525
4526 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4527
4528 MVT DstVT = VT;
4529 if (Widen)
4530 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4531
4532 // Canonicalize src operands.
4533 SmallVector<SDValue> SrcOps(Ops);
4534 for (SDValue &Op : SrcOps) {
4535 MVT OpVT = Op.getSimpleValueType();
4536 // Just pass through scalar operands.
4537 if (!OpVT.isVector())
4538 continue;
4539 assert(OpVT == VT && "Vector type mismatch");
4540
4541 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4542 Op = BroadcastOp;
4543 continue;
4544 }
4545
4546 // Just widen the subvector by inserting into an undef wide vector.
4547 if (Widen)
4548 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4549 }
4550
4551 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4552
4553 // Perform the 512-bit op then extract the bottom subvector.
4554 if (Widen)
4555 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4556 return Res;
4557}
4558
4559/// Insert an i1-subvector into an i1-vector.
4560static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4561 const X86Subtarget &Subtarget) {
4562
4563 SDLoc dl(Op);
4564 SDValue Vec = Op.getOperand(0);
4565 SDValue SubVec = Op.getOperand(1);
4566 SDValue Idx = Op.getOperand(2);
4567 unsigned IdxVal = Op.getConstantOperandVal(2);
4568
4569 // Inserting undef is a nop. We can just return the original vector.
4570 if (SubVec.isUndef())
4571 return Vec;
4572
4573 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4574 return Op;
4575
4576 MVT OpVT = Op.getSimpleValueType();
4577 unsigned NumElems = OpVT.getVectorNumElements();
4578 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4579
4580 // Extend to natively supported kshift.
4581 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4582
4583 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4584 // if necessary.
4585 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4586 // May need to promote to a legal type.
4587 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4588 DAG.getConstant(0, dl, WideOpVT),
4589 SubVec, Idx);
4590 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4591 }
4592
4593 MVT SubVecVT = SubVec.getSimpleValueType();
4594 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4595 assert(IdxVal + SubVecNumElems <= NumElems &&
4596 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4597 "Unexpected index value in INSERT_SUBVECTOR");
4598
4599 SDValue Undef = DAG.getUNDEF(WideOpVT);
4600
4601 if (IdxVal == 0) {
4602 // Zero lower bits of the Vec
4603 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4605 ZeroIdx);
4606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4607 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4608 // Merge them together, SubVec should be zero extended.
4609 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4610 DAG.getConstant(0, dl, WideOpVT),
4611 SubVec, ZeroIdx);
4612 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4614 }
4615
4616 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4617 Undef, SubVec, ZeroIdx);
4618
4619 if (Vec.isUndef()) {
4620 assert(IdxVal != 0 && "Unexpected index");
4621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4624 }
4625
4626  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4627 assert(IdxVal != 0 && "Unexpected index");
4628 // If upper elements of Vec are known undef, then just shift into place.
4629 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4630 [](SDValue V) { return V.isUndef(); })) {
4631 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4633 } else {
4634 NumElems = WideOpVT.getVectorNumElements();
4635 unsigned ShiftLeft = NumElems - SubVecNumElems;
4636 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4639 if (ShiftRight != 0)
4640 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4642 }
4643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4644 }
4645
4646  // Simple case when we put the subvector in the upper part.
4647 if (IdxVal + SubVecNumElems == NumElems) {
4648 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4649 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4650 if (SubVecNumElems * 2 == NumElems) {
4651 // Special case, use legal zero extending insert_subvector. This allows
4652 // isel to optimize when bits are known zero.
4653 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4654 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4655 DAG.getConstant(0, dl, WideOpVT),
4656 Vec, ZeroIdx);
4657 } else {
4658 // Otherwise use explicit shifts to zero the bits.
4659 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4660 Undef, Vec, ZeroIdx);
4661 NumElems = WideOpVT.getVectorNumElements();
4662 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4663 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4664 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4665 }
4666 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4668 }
4669
4670 // Inserting into the middle is more complicated.
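  // e.g. inserting a v2i1 subvector into a v8i1 vector at index 3: bits [3,5)
  // of Vec are cleared (via a constant mask or a pair of shifts), the subvector
  // is shifted so its bits land at positions 3 and 4, and the pieces are ORed.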
4671
4672 NumElems = WideOpVT.getVectorNumElements();
4673
4674 // Widen the vector if needed.
4675 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4676
4677 unsigned ShiftLeft = NumElems - SubVecNumElems;
4678 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4679
4680  // Common case: clear the insertion region with an AND mask, then OR in SubVec.
4681 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4682 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4683 Mask0.flipAllBits();
4684 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4685 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4686 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4687 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4688 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4689 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4690 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4691 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4692
4693 // Reduce to original width if needed.
4694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4695 }
4696
4697 // Clear the upper bits of the subvector and move it to its insert position.
4698 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4699 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4700 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4701 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4702
4703 // Isolate the bits below the insertion point.
4704 unsigned LowShift = NumElems - IdxVal;
4705 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4706 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4707 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4708 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4709
4710 // Isolate the bits after the last inserted bit.
4711 unsigned HighShift = IdxVal + SubVecNumElems;
4712 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4713 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4714 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4715 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4716
4717 // Now OR all 3 pieces together.
4718 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4719 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4720
4721 // Reduce to original width if needed.
4722 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4723}
4724
4725static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4726 const SDLoc &dl) {
4727 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4728 EVT SubVT = V1.getValueType();
4729 EVT SubSVT = SubVT.getScalarType();
4730 unsigned SubNumElts = SubVT.getVectorNumElements();
4731 unsigned SubVectorWidth = SubVT.getSizeInBits();
4732 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4733 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4734 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4735}
4736
4737/// Returns a vector of specified type with all bits set.
4738/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4739/// Then bitcast to their original type, ensuring they get CSE'd.
4740static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4741 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4742 "Expected a 128/256/512-bit vector type");
4743 unsigned NumElts = VT.getSizeInBits() / 32;
4744 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4745 return DAG.getBitcast(VT, Vec);
4746}
4747
4748static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4749 SDValue In, SelectionDAG &DAG) {
4750 EVT InVT = In.getValueType();
4751 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4752
4753 // Canonicalize Opcode to general extension version.
4754 switch (Opcode) {
4755 case ISD::ANY_EXTEND:
4756  case ISD::ANY_EXTEND_VECTOR_INREG:
4757 Opcode = ISD::ANY_EXTEND;
4758 break;
4759 case ISD::SIGN_EXTEND:
4760  case ISD::SIGN_EXTEND_VECTOR_INREG:
4761 Opcode = ISD::SIGN_EXTEND;
4762 break;
4763 case ISD::ZERO_EXTEND:
4764  case ISD::ZERO_EXTEND_VECTOR_INREG:
4765 Opcode = ISD::ZERO_EXTEND;
4766 break;
4767 default:
4768 llvm_unreachable("Unknown extension opcode");
4769 }
4770
4771 // For 256-bit vectors, we only need the lower (128-bit) input half.
4772 // For 512-bit vectors, we only need the lower input half or quarter.
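  // e.g. zero-extending to v16i32 from a (bitcast) 512-bit v64i8 input only
  // needs its low 128-bit v16i8 quarter.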
4773 if (InVT.getSizeInBits() > 128) {
4774 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4775 "Expected VTs to be the same size!");
4776 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4777 In = extractSubVector(In, 0, DAG, DL,
4778 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4779 InVT = In.getValueType();
4780 }
4781
4782 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4783 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4784
4785 return DAG.getNode(Opcode, DL, VT, In);
4786}
4787
4788// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4789static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4790 SDValue Mask, SelectionDAG &DAG) {
4791 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4792 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4793 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4794}
4795
4796static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4797 bool Lo, bool Unary) {
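  // e.g. for v8i16: Lo/binary -> <0, 8, 1, 9, 2, 10, 3, 11>
  //                 Hi/binary -> <4, 12, 5, 13, 6, 14, 7, 15>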
4798 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4799 "Illegal vector type to unpack");
4800 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4801 int NumElts = VT.getVectorNumElements();
4802 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4803 for (int i = 0; i < NumElts; ++i) {
4804 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4805 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4806 Pos += (Unary ? 0 : NumElts * (i % 2));
4807 Pos += (Lo ? 0 : NumEltsInLane / 2);
4808 Mask.push_back(Pos);
4809 }
4810}
4811
4812/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4813/// imposed by AVX and specific to the unary pattern. Example:
4814/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4815/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4816static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4817 bool Lo) {
4818 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4819 int NumElts = VT.getVectorNumElements();
4820 for (int i = 0; i < NumElts; ++i) {
4821 int Pos = i / 2;
4822 Pos += (Lo ? 0 : NumElts / 2);
4823 Mask.push_back(Pos);
4824 }
4825}
4826
4827// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4828static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4829 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4830  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4831      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4832 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4833 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4834 int M = Mask[I];
4835 if (M < 0)
4836 continue;
4837 SDValue V = (M < NumElts) ? V1 : V2;
4838 if (V.isUndef())
4839 continue;
4840 Ops[I] = V.getOperand(M % NumElts);
4841 }
4842 return DAG.getBuildVector(VT, dl, Ops);
4843 }
4844
4845 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4846}
4847
4848/// Returns a vector_shuffle node for an unpackl operation.
4849static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4850 SDValue V1, SDValue V2) {
4851  SmallVector<int, 8> Mask;
4852 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4853 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4854}
4855
4856/// Returns a vector_shuffle node for an unpackh operation.
4857static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4858 SDValue V1, SDValue V2) {
4859  SmallVector<int, 8> Mask;
4860 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4861 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4862}
4863
4864/// Returns a node that packs the LHS + RHS nodes together at half width.
4865/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4866/// TODO: Add subvector splitting if/when we have a need for it.
4867static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4868 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4869 bool PackHiHalf = false) {
4870 MVT OpVT = LHS.getSimpleValueType();
4871 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4872 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4873 assert(OpVT == RHS.getSimpleValueType() &&
4874 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4875 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4876 "Unexpected PACK operand types");
4877 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4878 "Unexpected PACK result type");
4879
4880 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4881 if (EltSizeInBits == 32) {
4882 SmallVector<int> PackMask;
4883 int Offset = PackHiHalf ? 1 : 0;
4884 int NumElts = VT.getVectorNumElements();
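    // e.g. for a v8i32 result (two v4i64 inputs), the lo-half mask is
    // <0, 2, 8, 10, 4, 6, 12, 14> and the hi-half mask is
    // <1, 3, 9, 11, 5, 7, 13, 15>.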
4885 for (int I = 0; I != NumElts; I += 4) {
4886 PackMask.push_back(I + Offset);
4887 PackMask.push_back(I + Offset + 2);
4888 PackMask.push_back(I + Offset + NumElts);
4889 PackMask.push_back(I + Offset + NumElts + 2);
4890 }
4891 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4892 DAG.getBitcast(VT, RHS), PackMask);
4893 }
4894
4895 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4896 if (!PackHiHalf) {
4897 if (UsePackUS &&
4898 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4899 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4901
4902 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4903 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4904 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4905 }
4906
4907 // Fallback to sign/zero extending the requested half and pack.
4908 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4909 if (UsePackUS) {
4910 if (PackHiHalf) {
4911 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4912 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4913 } else {
4914 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4915 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4916 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4917    }
4918 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4919  }
4920
4921 if (!PackHiHalf) {
4922 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4924 }
4925 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4927 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4928}
4929
4930/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4931/// This produces a shuffle where the low element of V2 is swizzled into the
4932/// zero/undef vector, landing at element Idx.
4933/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4934static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4935 bool IsZero,
4936 const X86Subtarget &Subtarget,
4937 SelectionDAG &DAG) {
4938 MVT VT = V2.getSimpleValueType();
4939 SDValue V1 = IsZero
4940 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4941 int NumElems = VT.getVectorNumElements();
4942 SmallVector<int, 16> MaskVec(NumElems);
4943 for (int i = 0; i != NumElems; ++i)
4944 // If this is the insertion idx, put the low elt of V2 here.
4945 MaskVec[i] = (i == Idx) ? NumElems : i;
4946 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4947}
4948
4950 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4951 Ptr.getOpcode() == X86ISD::WrapperRIP)
4952 Ptr = Ptr.getOperand(0);
4954}
4955
4956// TODO: Add support for non-zero offsets.
4959 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4960 return nullptr;
4961 return CNode->getConstVal();
4962}
4963
4964static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4965 if (!Load || !ISD::isNormalLoad(Load))
4966 return nullptr;
4967 return getTargetConstantFromBasePtr(Load->getBasePtr());
4968}
4969
4970static const Constant *getTargetConstantFromNode(SDValue Op) {
4971  Op = peekThroughBitcasts(Op);
4972  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4973}
4974
4975const Constant *
4976X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4977 assert(LD && "Unexpected null LoadSDNode");
4978 return getTargetConstantFromNode(LD);
4979}
4980
4982  // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X)
4983 SDValue Cond = N->getOperand(0);
4984 SDValue RHS = N->getOperand(2);
4985 EVT CondVT = Cond.getValueType();
4986 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4987 CondVT.getVectorElementType() == MVT::i1 &&
4988 ISD::isBuildVectorAllZeros(RHS.getNode());
4989}
4990
4991// Extract raw constant bits from constant pools.
4992static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4993 APInt &UndefElts,
4994 SmallVectorImpl<APInt> &EltBits,
4995 bool AllowWholeUndefs = true,
4996 bool AllowPartialUndefs = false) {
4997 assert(EltBits.empty() && "Expected an empty EltBits vector");
4998
5000
5001 EVT VT = Op.getValueType();
5002 unsigned SizeInBits = VT.getSizeInBits();
5003 unsigned NumElts = SizeInBits / EltSizeInBits;
5004
5005 // Can't split constant.
5006 if ((SizeInBits % EltSizeInBits) != 0)
5007 return false;
5008
5009 // Bitcast a source array of element bits to the target size.
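  // e.g. 2 x i64 source constants can be repacked into 4 x i32 target elements
  // (or 4 x i32 into 2 x i64); a target element is marked undef only if all of
  // its source bits were undef.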
5010 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5011 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5012 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5013 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5014 "Constant bit sizes don't match");
5015
5016 // Don't split if we don't allow undef bits.
5017 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5018 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5019 return false;
5020
5021 // If we're already the right size, don't bother bitcasting.
5022 if (NumSrcElts == NumElts) {
5023 UndefElts = UndefSrcElts;
5024 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5025 return true;
5026 }
5027
5028 // Extract all the undef/constant element data and pack into single bitsets.
5029 APInt UndefBits(SizeInBits, 0);
5030 APInt MaskBits(SizeInBits, 0);
5031
5032 for (unsigned i = 0; i != NumSrcElts; ++i) {
5033 unsigned BitOffset = i * SrcEltSizeInBits;
5034 if (UndefSrcElts[i])
5035 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5036 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5037 }
5038
5039 // Split the undef/constant single bitset data into the target elements.
5040 UndefElts = APInt(NumElts, 0);
5041 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5042
5043 for (unsigned i = 0; i != NumElts; ++i) {
5044 unsigned BitOffset = i * EltSizeInBits;
5045 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5046
5047 // Only treat an element as UNDEF if all bits are UNDEF.
5048 if (UndefEltBits.isAllOnes()) {
5049 if (!AllowWholeUndefs)
5050 return false;
5051 UndefElts.setBit(i);
5052 continue;
5053 }
5054
5055 // If only some bits are UNDEF then treat them as zero (or bail if not
5056 // supported).
5057 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5058 return false;
5059
5060 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5061 }
5062 return true;
5063 };
5064
5065 // Collect constant bits and insert into mask/undef bit masks.
5066 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5067 unsigned UndefBitIndex) {
5068 if (!Cst)
5069 return false;
5070 if (isa<UndefValue>(Cst)) {
5071 Undefs.setBit(UndefBitIndex);
5072 return true;
5073 }
5074 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5075 Mask = CInt->getValue();
5076 return true;
5077 }
5078 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5079 Mask = CFP->getValueAPF().bitcastToAPInt();
5080 return true;
5081 }
5082 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5083 Type *Ty = CDS->getType();
5084 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5085 Type *EltTy = CDS->getElementType();
5086 bool IsInteger = EltTy->isIntegerTy();
5087 bool IsFP =
5088 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5089 if (!IsInteger && !IsFP)
5090 return false;
5091 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5092 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5093 if (IsInteger)
5094 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5095 else
5096 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5097 I * EltBits);
5098 return true;
5099 }
5100 return false;
5101 };
5102
5103 // Handle UNDEFs.
5104 if (Op.isUndef()) {
5105 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5106 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5107 return CastBitData(UndefSrcElts, SrcEltBits);
5108 }
5109
5110 // Extract scalar constant bits.
5111 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5112 APInt UndefSrcElts = APInt::getZero(1);
5113 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5114 return CastBitData(UndefSrcElts, SrcEltBits);
5115 }
5116 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5117 APInt UndefSrcElts = APInt::getZero(1);
5118 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5119 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122
5123 // Extract constant bits from build vector.
5124 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5125 BitVector Undefs;
5126 SmallVector<APInt> SrcEltBits;
5127 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5128 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5129 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5130 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5131 if (Undefs[I])
5132 UndefSrcElts.setBit(I);
5133 return CastBitData(UndefSrcElts, SrcEltBits);
5134 }
5135 }
5136
5137 // Extract constant bits from constant pool vector.
5138 if (auto *Cst = getTargetConstantFromNode(Op)) {
5139 Type *CstTy = Cst->getType();
5140 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5141 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5142 return false;
5143
5144 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5145 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5146 if ((SizeInBits % SrcEltSizeInBits) != 0)
5147 return false;
5148
5149 APInt UndefSrcElts(NumSrcElts, 0);
5150 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5151 for (unsigned i = 0; i != NumSrcElts; ++i)
5152 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5153 UndefSrcElts, i))
5154 return false;
5155
5156 return CastBitData(UndefSrcElts, SrcEltBits);
5157 }
5158
5159 // Extract constant bits from a broadcasted constant pool scalar.
5160 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5161 EltSizeInBits <= VT.getScalarSizeInBits()) {
5162 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5163 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5164 return false;
5165
5166 SDValue Ptr = MemIntr->getBasePtr();
5167    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5168 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5169 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5170
5171 APInt UndefSrcElts(NumSrcElts, 0);
5172 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5173 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5174 if (UndefSrcElts[0])
5175 UndefSrcElts.setBits(0, NumSrcElts);
5176 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5177 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5178 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5179 return CastBitData(UndefSrcElts, SrcEltBits);
5180 }
5181 }
5182 }
5183
5184 // Extract constant bits from a subvector broadcast.
5185 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5186 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5187 SDValue Ptr = MemIntr->getBasePtr();
5188 // The source constant may be larger than the subvector broadcast,
5189 // ensure we extract the correct subvector constants.
5190 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5191 Type *CstTy = Cst->getType();
5192 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5193 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5194 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5195 (SizeInBits % SubVecSizeInBits) != 0)
5196 return false;
5197 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5198 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5199 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5200 APInt UndefSubElts(NumSubElts, 0);
5201 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5202 APInt(CstEltSizeInBits, 0));
5203 for (unsigned i = 0; i != NumSubElts; ++i) {
5204 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5205 UndefSubElts, i))
5206 return false;
5207 for (unsigned j = 1; j != NumSubVecs; ++j)
5208 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5209 }
5210 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5211 UndefSubElts);
5212 return CastBitData(UndefSubElts, SubEltBits);
5213 }
5214 }
5215
5216 // Extract a rematerialized scalar constant insertion.
5217 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5218 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5219 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5220 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5222
5223 APInt UndefSrcElts(NumSrcElts, 0);
5224 SmallVector<APInt, 64> SrcEltBits;
5225 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5226 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5227 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5228 return CastBitData(UndefSrcElts, SrcEltBits);
5229 }
5230
5231  // Insert constant bits from base and subvector sources.
5232 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5233    // If we bitcast to larger elements we might lose track of undefs - don't
5234    // allow any to be safe.
5235 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5236 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5237
5238 APInt UndefSrcElts, UndefSubElts;
5239 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5240 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5241 UndefSubElts, EltSubBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs) &&
5244 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5245 UndefSrcElts, EltSrcBits,
5246 AllowWholeUndefs && AllowUndefs,
5247 AllowPartialUndefs && AllowUndefs)) {
5248 unsigned BaseIdx = Op.getConstantOperandVal(2);
5249 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5250 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5251 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5252 return CastBitData(UndefSrcElts, EltSrcBits);
5253 }
5254 }
5255
5256 // Extract constant bits from a subvector's source.
5257 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5258 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5259 EltBits, AllowWholeUndefs,
5260 AllowPartialUndefs)) {
5261 EVT SrcVT = Op.getOperand(0).getValueType();
5262 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5263 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5264 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5265 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5266 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5268 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5269
5270 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5271 if ((BaseIdx + NumSubElts) != NumSrcElts)
5272 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5273 if (BaseIdx != 0)
5274 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5275 return true;
5276 }
5277
5278 // Extract constant bits from shuffle node sources.
5279 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5280 // TODO - support shuffle through bitcasts.
5281 if (EltSizeInBits != VT.getScalarSizeInBits())
5282 return false;
5283
5284 ArrayRef<int> Mask = SVN->getMask();
5285 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5286 llvm::any_of(Mask, [](int M) { return M < 0; }))
5287 return false;
5288
5289 APInt UndefElts0, UndefElts1;
5290 SmallVector<APInt, 32> EltBits0, EltBits1;
5291 if (isAnyInRange(Mask, 0, NumElts) &&
5292 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5293 UndefElts0, EltBits0, AllowWholeUndefs,
5294 AllowPartialUndefs))
5295 return false;
5296 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5297 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5298 UndefElts1, EltBits1, AllowWholeUndefs,
5299 AllowPartialUndefs))
5300 return false;
5301
5302 UndefElts = APInt::getZero(NumElts);
5303 for (int i = 0; i != (int)NumElts; ++i) {
5304 int M = Mask[i];
5305 if (M < 0) {
5306 UndefElts.setBit(i);
5307 EltBits.push_back(APInt::getZero(EltSizeInBits));
5308 } else if (M < (int)NumElts) {
5309 if (UndefElts0[M])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits0[M]);
5312 } else {
5313 if (UndefElts1[M - NumElts])
5314 UndefElts.setBit(i);
5315 EltBits.push_back(EltBits1[M - NumElts]);
5316 }
5317 }
5318 return true;
5319 }
5320
5321 return false;
5322}
5323
5324namespace llvm {
5325namespace X86 {
5326bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5327 APInt UndefElts;
5328 SmallVector<APInt, 16> EltBits;
5329  if (getTargetConstantBitsFromNode(
5330 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5331 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5332 int SplatIndex = -1;
5333 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5334 if (UndefElts[i])
5335 continue;
5336 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5337 SplatIndex = -1;
5338 break;
5339 }
5340 SplatIndex = i;
5341 }
5342 if (0 <= SplatIndex) {
5343 SplatVal = EltBits[SplatIndex];
5344 return true;
5345 }
5346 }
5347
5348 return false;
5349}
5350
5351int getRoundingModeX86(unsigned RM) {
5352 switch (static_cast<::llvm::RoundingMode>(RM)) {
5353 // clang-format off
5354  case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5355  case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5356  case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5357  case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5358 default:
5359 return X86::rmInvalid; // Invalid rounding mode
5360 }
5361}
5362
5363} // namespace X86
5364} // namespace llvm
5365
5366static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5367 unsigned MaskEltSizeInBits,
5368                                        SmallVectorImpl<uint64_t> &RawMask,
5369 APInt &UndefElts) {
5370 // Extract the raw target constant bits.
5371 SmallVector<APInt, 64> EltBits;
5372 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5373 EltBits, /* AllowWholeUndefs */ true,
5374 /* AllowPartialUndefs */ false))
5375 return false;
5376
5377 // Insert the extracted elements into the mask.
5378 for (const APInt &Elt : EltBits)
5379 RawMask.push_back(Elt.getZExtValue());
5380
5381 return true;
5382}
5383
5384static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5385 bool AllowUndefs) {
5386 APInt UndefElts;
5387 SmallVector<APInt, 64> EltBits;
5388 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5389 /*AllowWholeUndefs*/ AllowUndefs,
5390 /*AllowPartialUndefs*/ false))
5391 return false;
5392
5393 bool IsPow2OrUndef = true;
5394 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5395 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5396 return IsPow2OrUndef;
5397}
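// Worked example (illustrative): with EltSizeInBits = 32 and AllowUndefs = true,
// <i32 1, i32 undef, i32 16, i32 2> is accepted (1, 2 and 16 are powers of two
// and the undef lane is permitted), while <i32 1, i32 3, i32 16, i32 2> is
// rejected because 3 is not a power of two.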
5398
5399// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5400 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5401 // TODO: don't always ignore oneuse constraints.
5402 V = peekThroughBitcasts(V);
5403 EVT VT = V.getValueType();
5404
5405 // Match not(xor X, -1) -> X.
5406 if (V.getOpcode() == ISD::XOR &&
5407 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5408 isAllOnesConstant(V.getOperand(1))))
5409 return V.getOperand(0);
5410
5411 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5412 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5413 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5414 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5415 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5417 V.getOperand(1));
5418 }
5419 }
5420
5421 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5422 if (V.getOpcode() == X86ISD::PCMPGT &&
5423 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5424 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5425 V.getOperand(0).hasOneUse()) {
5426 APInt UndefElts;
5427 SmallVector<APInt> EltBits;
5428 if (getTargetConstantBitsFromNode(V.getOperand(0),
5429 V.getScalarValueSizeInBits(), UndefElts,
5430 EltBits) &&
5431 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5432 // Don't fold min_signed_value -> (min_signed_value - 1)
5433 bool MinSigned = false;
5434 for (APInt &Elt : EltBits) {
5435 MinSigned |= Elt.isMinSignedValue();
5436 Elt -= 1;
5437 }
5438 if (!MinSigned) {
5439 SDLoc DL(V);
5440 MVT VT = V.getSimpleValueType();
5441 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5442 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5443 }
5444 }
5445 }
5446
5447 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5448 SmallVector<SDValue, 2> CatOps;
5449 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5450 for (SDValue &CatOp : CatOps) {
5451 SDValue NotCat = IsNOT(CatOp, DAG);
5452 if (!NotCat)
5453 return SDValue();
5454 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5455 }
5456 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5457 }
5458
5459 // Match not(or(not(X),not(Y))) -> and(X, Y).
5460 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5461 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5462 // TODO: Handle cases with single NOT operand -> ANDNP
5463 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5464 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5465 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5466 DAG.getBitcast(VT, Op1));
5467 }
5468
5469 return SDValue();
5470}
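// Worked examples (illustrative): IsNOT(xor(X, all-ones)) returns X directly,
// and IsNOT(pcmpgt(splat(5), X)) returns pcmpgt(X, splat(4)), using the
// identity not(C > X) == (X >= C) == (X > C - 1), which holds as long as no
// element of C is the minimum signed value (the MinSigned check above).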
5471
5472/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5473/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5474/// Note: This ignores saturation, so inputs must be checked first.
5475 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5476 bool Unary, unsigned NumStages = 1) {
5477 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5478 unsigned NumElts = VT.getVectorNumElements();
5479 unsigned NumLanes = VT.getSizeInBits() / 128;
5480 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5481 unsigned Offset = Unary ? 0 : NumElts;
5482 unsigned Repetitions = 1u << (NumStages - 1);
5483 unsigned Increment = 1u << NumStages;
5484 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5485
5486 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5487 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5488 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5489 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5490 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5491 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5492 }
5493 }
5494}
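// Worked example (illustrative): for VT = v16i8 (the result type of a
// PACKSSWB/PACKUSWB of two v8i16 inputs), Unary = false and NumStages = 1,
// the loops above produce the byte shuffle mask
//   {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30},
// i.e. the low byte of each i16 element of the first input followed by the low
// byte of each i16 element of the second input. For 256-bit and 512-bit types
// the same pattern repeats independently within each 128-bit lane.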
5495
5496// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5497static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5498 APInt &DemandedLHS, APInt &DemandedRHS) {
5499 int NumLanes = VT.getSizeInBits() / 128;
5500 int NumElts = DemandedElts.getBitWidth();
5501 int NumInnerElts = NumElts / 2;
5502 int NumEltsPerLane = NumElts / NumLanes;
5503 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5504
5505 DemandedLHS = APInt::getZero(NumInnerElts);
5506 DemandedRHS = APInt::getZero(NumInnerElts);
5507
5508 // Map DemandedElts to the packed operands.
5509 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5510 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5511 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5512 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5513 if (DemandedElts[OuterIdx])
5514 DemandedLHS.setBit(InnerIdx);
5515 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5516 DemandedRHS.setBit(InnerIdx);
5517 }
5518 }
5519}
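// Worked example (illustrative): for a 128-bit PACK producing v16i8 from two
// v8i16 operands, demanding result bytes 3 and 11 sets bit 3 of DemandedLHS
// (byte 3 comes from element 3 of the first operand) and bit 3 of DemandedRHS
// (byte 11 comes from element 3 of the second operand).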
5520
5521// Split the demanded elts of a HADD/HSUB node between its operands.
5522static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5523 APInt &DemandedLHS, APInt &DemandedRHS) {
5525 DemandedLHS, DemandedRHS);
5526 DemandedLHS |= DemandedLHS << 1;
5527 DemandedRHS |= DemandedRHS << 1;
5528}
5529
5530/// Calculates the shuffle mask corresponding to the target-specific opcode.
5531/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5532/// operands in \p Ops, and returns true.
5533/// Sets \p IsUnary to true if only one source is used. Note that this will set
5534/// IsUnary for shuffles which use a single input multiple times, and in those
5535/// cases it will adjust the mask to only have indices within that single input.
5536/// It is an error to call this with non-empty Mask/Ops vectors.
5537static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5538 SmallVectorImpl<SDValue> &Ops,
5539 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5540 if (!isTargetShuffle(N.getOpcode()))
5541 return false;
5542
5543 MVT VT = N.getSimpleValueType();
5544 unsigned NumElems = VT.getVectorNumElements();
5545 unsigned MaskEltSize = VT.getScalarSizeInBits();
5546 SmallVector<uint64_t, 32> RawMask;
5547 APInt RawUndefs;
5548 uint64_t ImmN;
5549
5550 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5551 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5552
5553 IsUnary = false;
5554 bool IsFakeUnary = false;
5555 switch (N.getOpcode()) {
5556 case X86ISD::BLENDI:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeBLENDMask(NumElems, ImmN, Mask);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::SHUFP:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5566 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5567 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5568 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5569 break;
5570 case X86ISD::INSERTPS:
5571 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5572 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5573 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5574 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5575 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5576 break;
5577 case X86ISD::EXTRQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5580 isa<ConstantSDNode>(N.getOperand(2))) {
5581 int BitLen = N.getConstantOperandVal(1);
5582 int BitIdx = N.getConstantOperandVal(2);
5583 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5584 IsUnary = true;
5585 }
5586 break;
5587 case X86ISD::INSERTQI:
5588 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5590 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5591 isa<ConstantSDNode>(N.getOperand(3))) {
5592 int BitLen = N.getConstantOperandVal(2);
5593 int BitIdx = N.getConstantOperandVal(3);
5594 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5595 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5596 }
5597 break;
5598 case X86ISD::UNPCKH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5602 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5603 break;
5604 case X86ISD::UNPCKL:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::MOVHLPS:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVHLPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVLHPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVLHPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::VALIGN:
5623 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5624 "Only 32-bit and 64-bit elements are supported!");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodeVALIGNMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::PALIGNR:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5637 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5638 DecodePALIGNRMask(NumElems, ImmN, Mask);
5639 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5640 Ops.push_back(N.getOperand(1));
5641 Ops.push_back(N.getOperand(0));
5642 break;
5643 case X86ISD::VSHLDQ:
5644 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSLLDQMask(NumElems, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::VSRLDQ:
5651 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5652 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5653 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5654 DecodePSRLDQMask(NumElems, ImmN, Mask);
5655 IsUnary = true;
5656 break;
5657 case X86ISD::PSHUFD:
5658 case X86ISD::VPERMILPI:
5659 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5660 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5661 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5662 IsUnary = true;
5663 break;
5664 case X86ISD::PSHUFHW:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFLW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::VZEXT_MOVL:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 DecodeZeroMoveLowMask(NumElems, Mask);
5679 IsUnary = true;
5680 break;
5681 case X86ISD::VBROADCAST:
5682 // We only decode broadcasts of same-sized vectors; peeking through to
5683 // extracted subvectors is likely to cause hasOneUse issues with
5684 // SimplifyDemandedBits etc.
5685 if (N.getOperand(0).getValueType() == VT) {
5686 DecodeVectorBroadcast(NumElems, Mask);
5687 IsUnary = true;
5688 break;
5689 }
5690 return false;
5691 case X86ISD::VPERMILPV: {
5692 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5693 IsUnary = true;
5694 SDValue MaskNode = N.getOperand(1);
5695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5696 RawUndefs)) {
5697 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5698 break;
5699 }
5700 return false;
5701 }
5702 case X86ISD::PSHUFB: {
5703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5704 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5705 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = true;
5707 SDValue MaskNode = N.getOperand(1);
5708 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5709 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5710 break;
5711 }
5712 return false;
5713 }
5714 case X86ISD::VPERMI:
5715 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERMMask(NumElems, ImmN, Mask);
5718 IsUnary = true;
5719 break;
5720 case X86ISD::MOVSS:
5721 case X86ISD::MOVSD:
5722 case X86ISD::MOVSH:
5723 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5724 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5725 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5726 break;
5727 case X86ISD::VPERM2X128:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5730 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5732 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5733 break;
5734 case X86ISD::SHUF128:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5737 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5738 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5739 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5740 break;
5741 case X86ISD::MOVSLDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVSLDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::MOVSHDUP:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 DecodeMOVSHDUPMask(NumElems, Mask);
5749 IsUnary = true;
5750 break;
5751 case X86ISD::MOVDDUP:
5752 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5753 DecodeMOVDDUPMask(NumElems, Mask);
5754 IsUnary = true;
5755 break;
5756 case X86ISD::VPERMIL2: {
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5759 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5760 SDValue MaskNode = N.getOperand(2);
5761 SDValue CtrlNode = N.getOperand(3);
5762 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5763 unsigned CtrlImm = CtrlOp->getZExtValue();
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5765 RawUndefs)) {
5766 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5767 Mask);
5768 break;
5769 }
5770 }
5771 return false;
5772 }
5773 case X86ISD::VPPERM: {
5774 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5777 SDValue MaskNode = N.getOperand(2);
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5779 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5780 break;
5781 }
5782 return false;
5783 }
5784 case X86ISD::VPERMV: {
5785 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5786 IsUnary = true;
5787 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5788 Ops.push_back(N.getOperand(1));
5789 SDValue MaskNode = N.getOperand(0);
5790 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5791 RawUndefs)) {
5792 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5793 break;
5794 }
5795 return false;
5796 }
5797 case X86ISD::VPERMV3: {
5798 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5800 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5801 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5802 Ops.push_back(N.getOperand(0));
5803 Ops.push_back(N.getOperand(2));
5804 SDValue MaskNode = N.getOperand(1);
5805 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5806 RawUndefs)) {
5807 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5808 break;
5809 }
5810 return false;
5811 }
5812 default:
5813 llvm_unreachable("unknown target shuffle node");
5814 }
5815
5816 // Empty mask indicates the decode failed.
5817 if (Mask.empty())
5818 return false;
5819
5820 // Check if we're getting a shuffle mask with zero'd elements.
5821 if (!AllowSentinelZero && isAnyZero(Mask))
5822 return false;
5823
5824 // If we have a fake unary shuffle, the shuffle mask is spread across two
5825 // inputs that are actually the same node. Re-map the mask to always point
5826 // into the first input.
5827 if (IsFakeUnary)
5828 for (int &M : Mask)
5829 if (M >= (int)Mask.size())
5830 M -= Mask.size();
5831
5832 // If we didn't already add operands in the opcode-specific code, default to
5833 // adding 1 or 2 operands starting at 0.
5834 if (Ops.empty()) {
5835 Ops.push_back(N.getOperand(0));
5836 if (!IsUnary || IsFakeUnary)
5837 Ops.push_back(N.getOperand(1));
5838 }
5839
5840 return true;
5841}
5842
5843 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5844static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5845 SmallVectorImpl<SDValue> &Ops,
5846 SmallVectorImpl<int> &Mask) {
5847 bool IsUnary;
5848 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5849}
5850
5851/// Compute whether each element of a shuffle is zeroable.
5852///
5853/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5854/// Either it is an undef element in the shuffle mask, the element of the input
5855/// referenced is undef, or the element of the input referenced is known to be
5856/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5857/// as many lanes with this technique as possible to simplify the remaining
5858/// shuffle.
5859 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5860 SDValue V1, SDValue V2,
5861 APInt &KnownUndef, APInt &KnownZero) {
5862 int Size = Mask.size();
5863 KnownUndef = KnownZero = APInt::getZero(Size);
5864
5865 V1 = peekThroughBitcasts(V1);
5866 V2 = peekThroughBitcasts(V2);
5867
5868 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5869 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5870
5871 int VectorSizeInBits = V1.getValueSizeInBits();
5872 int ScalarSizeInBits = VectorSizeInBits / Size;
5873 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5874
5875 for (int i = 0; i < Size; ++i) {
5876 int M = Mask[i];
5877 // Handle the easy cases.
5878 if (M < 0) {
5879 KnownUndef.setBit(i);
5880 continue;
5881 }
5882 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5883 KnownZero.setBit(i);
5884 continue;
5885 }
5886
5887 // Determine shuffle input and normalize the mask.
5888 SDValue V = M < Size ? V1 : V2;
5889 M %= Size;
5890
5891 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5892 if (V.getOpcode() != ISD::BUILD_VECTOR)
5893 continue;
5894
5895 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
5896 // portion of the (larger) source element must be UNDEF/ZERO.
5897 if ((Size % V.getNumOperands()) == 0) {
5898 int Scale = Size / V->getNumOperands();
5899 SDValue Op = V.getOperand(M / Scale);
5900 if (Op.isUndef())
5901 KnownUndef.setBit(i);
5902 if (X86::isZeroNode(Op))
5903 KnownZero.setBit(i);
5904 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5905 APInt Val = Cst->getAPIntValue();
5906 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5907 if (Val == 0)
5908 KnownZero.setBit(i);
5909 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5910 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5911 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5912 if (Val == 0)
5913 KnownZero.setBit(i);
5914 }
5915 continue;
5916 }
5917
5918 // If the BUILD_VECTOR has more elements than the mask, then all the
5919 // (smaller) source elements must be UNDEF or ZERO.
5920 if ((V.getNumOperands() % Size) == 0) {
5921 int Scale = V->getNumOperands() / Size;
5922 bool AllUndef = true;
5923 bool AllZero = true;
5924 for (int j = 0; j < Scale; ++j) {
5925 SDValue Op = V.getOperand((M * Scale) + j);
5926 AllUndef &= Op.isUndef();
5927 AllZero &= X86::isZeroNode(Op);
5928 }
5929 if (AllUndef)
5930 KnownUndef.setBit(i);
5931 if (AllZero)
5932 KnownZero.setBit(i);
5933 continue;
5934 }
5935 }
5936}
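// Worked example (illustrative): for a shuffle of V1 = build_vector <a,b,c,d>
// and V2 = zeroinitializer with Mask = <0, 4, -1, 6>, elements 1 and 3
// reference the all-zeros V2 and set KnownZero, element 2 is undef in the mask
// and sets KnownUndef, and element 0 references the unknown scalar 'a' and sets
// neither, giving KnownUndef = 0b0100 and KnownZero = 0b1010.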
5937
5938/// Decode a target shuffle mask and inputs and see if any values are
5939/// known to be undef or zero from their inputs.
5940/// Returns true if the target shuffle mask was decoded.
5941/// FIXME: Merge this with computeZeroableShuffleElements?
5942 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5943 SmallVectorImpl<SDValue> &Ops,
5944 APInt &KnownUndef, APInt &KnownZero) {
5945 bool IsUnary;
5946 if (!isTargetShuffle(N.getOpcode()))
5947 return false;
5948
5949 MVT VT = N.getSimpleValueType();
5950 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5951 return false;
5952
5953 int Size = Mask.size();
5954 SDValue V1 = Ops[0];
5955 SDValue V2 = IsUnary ? V1 : Ops[1];
5956 KnownUndef = KnownZero = APInt::getZero(Size);
5957
5958 V1 = peekThroughBitcasts(V1);
5959 V2 = peekThroughBitcasts(V2);
5960
5961 assert((VT.getSizeInBits() % Size) == 0 &&
5962 "Illegal split of shuffle value type");
5963 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5964
5965 // Extract known constant input data.
5966 APInt UndefSrcElts[2];
5967 SmallVector<APInt, 32> SrcEltBits[2];
5968 bool IsSrcConstant[2] = {
5969 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5970 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false),
5972 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5973 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5974 /*AllowPartialUndefs*/ false)};
5975
5976 for (int i = 0; i < Size; ++i) {
5977 int M = Mask[i];
5978
5979 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5980 if (M < 0) {
5981 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5982 if (SM_SentinelUndef == M)
5983 KnownUndef.setBit(i);
5984 if (SM_SentinelZero == M)
5985 KnownZero.setBit(i);
5986 continue;
5987 }
5988
5989 // Determine shuffle input and normalize the mask.
5990 unsigned SrcIdx = M / Size;
5991 SDValue V = M < Size ? V1 : V2;
5992 M %= Size;
5993
5994 // We are referencing an UNDEF input.
5995 if (V.isUndef()) {
5996 KnownUndef.setBit(i);
5997 continue;
5998 }
5999
6000 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6001 // TODO: We currently only set UNDEF for integer types - floats use the same
6002 // registers as vectors and many of the scalar folded loads rely on the
6003 // SCALAR_TO_VECTOR pattern.
6004 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6005 (Size % V.getValueType().getVectorNumElements()) == 0) {
6006 int Scale = Size / V.getValueType().getVectorNumElements();
6007 int Idx = M / Scale;
6008 if (Idx != 0 && !VT.isFloatingPoint())
6009 KnownUndef.setBit(i);
6010 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6011 KnownZero.setBit(i);
6012 continue;
6013 }
6014
6015 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6016 // base vectors.
6017 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6018 SDValue Vec = V.getOperand(0);
6019 int NumVecElts = Vec.getValueType().getVectorNumElements();
6020 if (Vec.isUndef() && Size == NumVecElts) {
6021 int Idx = V.getConstantOperandVal(2);
6022 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6023 if (M < Idx || (Idx + NumSubElts) <= M)
6024 KnownUndef.setBit(i);
6025 }
6026 continue;
6027 }
6028
6029 // Attempt to extract from the source's constant bits.
6030 if (IsSrcConstant[SrcIdx]) {
6031 if (UndefSrcElts[SrcIdx][M])
6032 KnownUndef.setBit(i);
6033 else if (SrcEltBits[SrcIdx][M] == 0)
6034 KnownZero.setBit(i);
6035 }
6036 }
6037
6038 assert(VT.getVectorNumElements() == (unsigned)Size &&
6039 "Different mask size from vector size!");
6040 return true;
6041}
6042
6043// Replace target shuffle mask elements with known undef/zero sentinels.
6044 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6045 const APInt &KnownUndef,
6046 const APInt &KnownZero,
6047 bool ResolveKnownZeros = true) {
6048 unsigned NumElts = Mask.size();
6049 assert(KnownUndef.getBitWidth() == NumElts &&
6050 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6051
6052 for (unsigned i = 0; i != NumElts; ++i) {
6053 if (KnownUndef[i])
6054 Mask[i] = SM_SentinelUndef;
6055 else if (ResolveKnownZeros && KnownZero[i])
6056 Mask[i] = SM_SentinelZero;
6057 }
6058}
6059
6060// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6061 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6062 APInt &KnownUndef,
6063 APInt &KnownZero) {
6064 unsigned NumElts = Mask.size();
6065 KnownUndef = KnownZero = APInt::getZero(NumElts);
6066
6067 for (unsigned i = 0; i != NumElts; ++i) {
6068 int M = Mask[i];
6069 if (SM_SentinelUndef == M)
6070 KnownUndef.setBit(i);
6071 if (SM_SentinelZero == M)
6072 KnownZero.setBit(i);
6073 }
6074}
6075
6076// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6077 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6078 SDValue Cond, bool IsBLENDV = false) {
6079 EVT CondVT = Cond.getValueType();
6080 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6081 unsigned NumElts = CondVT.getVectorNumElements();
6082
6083 APInt UndefElts;
6084 SmallVector<APInt, 32> EltBits;
6085 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6086 /*AllowWholeUndefs*/ true,
6087 /*AllowPartialUndefs*/ false))
6088 return false;
6089
6090 Mask.resize(NumElts, SM_SentinelUndef);
6091
6092 for (int i = 0; i != (int)NumElts; ++i) {
6093 Mask[i] = i;
6094 // Arbitrarily choose from the 2nd operand if the select condition element
6095 // is undef.
6096 // TODO: Can we do better by matching patterns such as even/odd?
6097 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6098 (IsBLENDV && EltBits[i].isNonNegative()))
6099 Mask[i] += NumElts;
6100 }
6101
6102 return true;
6103}
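// Worked example (illustrative): for a v4i32 vselect whose condition constant
// is <-1, 0, -1, 0> (IsBLENDV = false), elements 0 and 2 select from the first
// value operand and elements 1 and 3 from the second, giving Mask = {0, 5, 2, 7}.
// For BLENDV only the sign bit of each condition element matters, so any
// non-negative condition element selects from the second value operand.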
6104
6105// Forward declaration (for getFauxShuffleMask recursive check).
6106static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6107 SmallVectorImpl<SDValue> &Inputs,
6108 SmallVectorImpl<int> &Mask,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts);
6111
6112// Attempt to decode ops that could be represented as a shuffle mask.
6113 // The decoded shuffle mask may contain a different number of elements than
6114 // the destination value type.
6115// TODO: Merge into getTargetShuffleInputs()
6116static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6117 SmallVectorImpl<int> &Mask,
6118 SmallVectorImpl<SDValue> &Ops,
6119 const SelectionDAG &DAG, unsigned Depth,
6120 bool ResolveKnownElts) {
6121 Mask.clear();
6122 Ops.clear();
6123
6124 MVT VT = N.getSimpleValueType();
6125 unsigned NumElts = VT.getVectorNumElements();
6126 unsigned NumSizeInBits = VT.getSizeInBits();
6127 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6128 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6129 return false;
6130 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6131 unsigned NumSizeInBytes = NumSizeInBits / 8;
6132 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6133
6134 unsigned Opcode = N.getOpcode();
6135 switch (Opcode) {
6136 case ISD::VECTOR_SHUFFLE: {
6137 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6138 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6139 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6140 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6141 Ops.push_back(N.getOperand(0));
6142 Ops.push_back(N.getOperand(1));
6143 return true;
6144 }
6145 return false;
6146 }
6147 case ISD::AND:
6148 case X86ISD::ANDNP: {
6149 // Attempt to decode as a per-byte mask.
6150 APInt UndefElts;
6151 SmallVector<APInt, 32> EltBits;
6152 SDValue N0 = N.getOperand(0);
6153 SDValue N1 = N.getOperand(1);
6154 bool IsAndN = (X86ISD::ANDNP == Opcode);
6155 uint64_t ZeroMask = IsAndN ? 255 : 0;
6156 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6157 /*AllowWholeUndefs*/ false,
6158 /*AllowPartialUndefs*/ false))
6159 return false;
6160 // We can't assume an undef src element gives an undef dst - the other src
6161 // might be zero.
6162 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6163 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6164 const APInt &ByteBits = EltBits[i];
6165 if (ByteBits != 0 && ByteBits != 255)
6166 return false;
6167 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6168 }
6169 Ops.push_back(IsAndN ? N1 : N0);
6170 return true;
6171 }
6172 case ISD::OR: {
6173 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6174 // is a valid shuffle index.
6175 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6176 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6177 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6178 return false;
6179
6180 SmallVector<int, 64> SrcMask0, SrcMask1;
6181 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6182 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6183 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6184 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6185 Depth + 1, true) ||
6186 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6187 Depth + 1, true))
6188 return false;
6189
6190 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6191 SmallVector<int, 64> Mask0, Mask1;
6192 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6193 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6194 for (int i = 0; i != (int)MaskSize; ++i) {
6195 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6196 // loops converting between OR and BLEND shuffles due to
6197 // canWidenShuffleElements merging away undef elements, meaning we
6198 // fail to recognise the OR as the undef element isn't known zero.
6199 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6200 Mask.push_back(SM_SentinelZero);
6201 else if (Mask1[i] == SM_SentinelZero)
6202 Mask.push_back(i);
6203 else if (Mask0[i] == SM_SentinelZero)
6204 Mask.push_back(i + MaskSize);
6205 else
6206 return false;
6207 }
6208 Ops.push_back(N.getOperand(0));
6209 Ops.push_back(N.getOperand(1));
6210 return true;
6211 }
6212 case ISD::CONCAT_VECTORS: {
6213 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6214 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6215 if (NumBitsPerElt == 64) {
6216 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6217 for (unsigned M = 0; M != NumSubElts; ++M)
6218 Mask.push_back((I * NumElts) + M);
6219 Ops.push_back(N.getOperand(I));
6220 }
6221 return true;
6222 }
6223 return false;
6224 }
6225 case ISD::INSERT_SUBVECTOR: {
6226 SDValue Src = N.getOperand(0);
6227 SDValue Sub = N.getOperand(1);
6228 EVT SubVT = Sub.getValueType();
6229 unsigned NumSubElts = SubVT.getVectorNumElements();
6230 uint64_t InsertIdx = N.getConstantOperandVal(2);
6231 // Subvector isn't demanded - just return the base vector.
6232 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.end(), 0);
6235 Ops.push_back(Src);
6236 return true;
6237 }
6238 // Handle CONCAT(SUB0, SUB1).
6239 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6240 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6241 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6242 Src.getOperand(0).isUndef() &&
6243 Src.getOperand(1).getValueType() == SubVT &&
6244 Src.getConstantOperandVal(2) == 0 &&
6245 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6246 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6247 Mask.resize(NumElts);
6248 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6249 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6250 Ops.push_back(Src.getOperand(1));
6251 Ops.push_back(Sub);
6252 return true;
6253 }
6254 if (!N->isOnlyUserOf(Sub.getNode()))
6255 return false;
6256
6257 SmallVector<int, 64> SubMask;
6258 SmallVector<SDValue, 2> SubInputs;
6259 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6260 EVT SubSrcVT = SubSrc.getValueType();
6261 if (!SubSrcVT.isVector())
6262 return false;
6263
6264 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6265 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6266 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6267 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6268 SDValue SubSrcSrc = SubSrc.getOperand(0);
6269 unsigned NumSubSrcSrcElts =
6270 SubSrcSrc.getValueType().getVectorNumElements();
6271 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6272 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6273 "Subvector valuetype mismatch");
6274 InsertIdx *= (MaxElts / NumElts);
6275 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6276 NumSubElts *= (MaxElts / NumElts);
6277 bool SrcIsUndef = Src.isUndef();
6278 for (int i = 0; i != (int)MaxElts; ++i)
6279 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6280 for (int i = 0; i != (int)NumSubElts; ++i)
6281 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6282 if (!SrcIsUndef)
6283 Ops.push_back(Src);
6284 Ops.push_back(SubSrcSrc);
6285 return true;
6286 }
6287
6288 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6289 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6290 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6291 Depth + 1, ResolveKnownElts))
6292 return false;
6293
6294 // Subvector shuffle inputs must not be larger than the subvector.
6295 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6296 return SubVT.getFixedSizeInBits() <
6297 SubInput.getValueSizeInBits().getFixedValue();
6298 }))
6299 return false;
6300
6301 if (SubMask.size() != NumSubElts) {
6302 assert(((SubMask.size() % NumSubElts) == 0 ||
6303 (NumSubElts % SubMask.size()) == 0) &&
6304 "Illegal submask scale");
6305 if ((NumSubElts % SubMask.size()) == 0) {
6306 int Scale = NumSubElts / SubMask.size();
6307 SmallVector<int, 64> ScaledSubMask;
6308 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6309 SubMask = ScaledSubMask;
6310 } else {
6311 int Scale = SubMask.size() / NumSubElts;
6312 NumSubElts = SubMask.size();
6313 NumElts *= Scale;
6314 InsertIdx *= Scale;
6315 }
6316 }
6317 Ops.push_back(Src);
6318 Ops.append(SubInputs.begin(), SubInputs.end());
6319 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6320 Mask.append(NumElts, SM_SentinelZero);
6321 else
6322 for (int i = 0; i != (int)NumElts; ++i)
6323 Mask.push_back(i);
6324 for (int i = 0; i != (int)NumSubElts; ++i) {
6325 int M = SubMask[i];
6326 if (0 <= M) {
6327 int InputIdx = M / NumSubElts;
6328 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6329 }
6330 Mask[i + InsertIdx] = M;
6331 }
6332 return true;
6333 }
6334 case X86ISD::PINSRB:
6335 case X86ISD::PINSRW:
6336 case ISD::SCALAR_TO_VECTOR:
6337 case ISD::INSERT_VECTOR_ELT: {
6338 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6339 // vector, for matching src/dst vector types.
6340 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6341
6342 unsigned DstIdx = 0;
6343 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6344 // Check we have an in-range constant insertion index.
6345 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6346 N.getConstantOperandAPInt(2).uge(NumElts))
6347 return false;
6348 DstIdx = N.getConstantOperandVal(2);
6349
6350 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6351 if (X86::isZeroNode(Scl)) {
6352 Ops.push_back(N.getOperand(0));
6353 for (unsigned i = 0; i != NumElts; ++i)
6354 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6355 return true;
6356 }
6357 }
6358
6359 // Peek through trunc/aext/zext/bitcast.
6360 // TODO: aext shouldn't require SM_SentinelZero padding.
6361 // TODO: handle shift of scalars.
6362 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6363 while (Scl.getOpcode() == ISD::TRUNCATE ||
6364 Scl.getOpcode() == ISD::ANY_EXTEND ||
6365 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6366 (Scl.getOpcode() == ISD::BITCAST &&
6369 Scl = Scl.getOperand(0);
6370 MinBitsPerElt =
6371 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6372 }
6373 if ((MinBitsPerElt % 8) != 0)
6374 return false;
6375
6376 // Attempt to find the source vector the scalar was extracted from.
6377 SDValue SrcExtract;
6378 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6379 Scl.getOpcode() == X86ISD::PEXTRW ||
6380 Scl.getOpcode() == X86ISD::PEXTRB) &&
6381 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6382 SrcExtract = Scl;
6383 }
6384 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6385 return false;
6386
6387 SDValue SrcVec = SrcExtract.getOperand(0);
6388 EVT SrcVT = SrcVec.getValueType();
6389 if (!SrcVT.getScalarType().isByteSized())
6390 return false;
6391 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6392 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6393 unsigned DstByte = DstIdx * NumBytesPerElt;
6394 MinBitsPerElt =
6395 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6396
6397 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6398 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6399 Ops.push_back(SrcVec);
6400 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6401 } else {
6402 Ops.push_back(SrcVec);
6403 Ops.push_back(N.getOperand(0));
6404 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6405 Mask.push_back(NumSizeInBytes + i);
6406 }
6407
6408 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6409 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6410 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6411 Mask[DstByte + i] = SrcByte + i;
6412 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6413 Mask[DstByte + i] = SM_SentinelZero;
6414 return true;
6415 }
6416 case X86ISD::PACKSS:
6417 case X86ISD::PACKUS: {
6418 SDValue N0 = N.getOperand(0);
6419 SDValue N1 = N.getOperand(1);
6420 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6422 "Unexpected input value type");
6423
6424 APInt EltsLHS, EltsRHS;
6425 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6426
6427 // If we know input saturation won't happen (or we don't care for particular
6428 // lanes), we can treat this as a truncation shuffle.
6429 bool Offset0 = false, Offset1 = false;
6430 if (Opcode == X86ISD::PACKSS) {
6431 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6432 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6433 (!(N1.isUndef() || EltsRHS.isZero()) &&
6434 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6435 return false;
6436 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6437 // PACKSS then it was likely being used for sign-extension for a
6438 // truncation, so just peek through and adjust the mask accordingly.
6439 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6440 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6441 Offset0 = true;
6442 N0 = N0.getOperand(0);
6443 }
6444 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6445 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6446 Offset1 = true;
6447 N1 = N1.getOperand(0);
6448 }
6449 } else {
6450 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6451 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6452 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6453 (!(N1.isUndef() || EltsRHS.isZero()) &&
6454 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6455 return false;
6456 }
6457
6458 bool IsUnary = (N0 == N1);
6459
6460 Ops.push_back(N0);
6461 if (!IsUnary)
6462 Ops.push_back(N1);
6463
6464 createPackShuffleMask(VT, Mask, IsUnary);
6465
6466 if (Offset0 || Offset1) {
6467 for (int &M : Mask)
6468 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6469 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6470 ++M;
6471 }
6472 return true;
6473 }
6474 case ISD::VSELECT:
6475 case X86ISD::BLENDV: {
6476 SDValue Cond = N.getOperand(0);
6477 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6478 Ops.push_back(N.getOperand(1));
6479 Ops.push_back(N.getOperand(2));
6480 return true;
6481 }
6482 return false;
6483 }
6484 case X86ISD::VTRUNC: {
6485 SDValue Src = N.getOperand(0);
6486 EVT SrcVT = Src.getValueType();
6487 if (SrcVT.getSizeInBits() != NumSizeInBits)
6488 return false;
6489 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6490 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6491 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6492 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6493 for (unsigned i = 0; i != NumSrcElts; ++i)
6494 Mask.push_back(i * Scale);
6495 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6496 Ops.push_back(Src);
6497 return true;
6498 }
6499 case ISD::SHL:
6500 case ISD::SRL: {
6501 APInt UndefElts;
6502 SmallVector<APInt, 32> EltBits;
6503 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6504 UndefElts, EltBits,
6505 /*AllowWholeUndefs*/ true,
6506 /*AllowPartialUndefs*/ false))
6507 return false;
6508
6509 // We can only decode 'whole byte' bit shifts as shuffles.
6510 for (unsigned I = 0; I != NumElts; ++I)
6511 if (DemandedElts[I] && !UndefElts[I] &&
6512 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6513 return false;
6514
6515 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6516 Ops.push_back(N.getOperand(0));
6517
6518 for (unsigned I = 0; I != NumElts; ++I) {
6519 if (!DemandedElts[I] || UndefElts[I])
6520 continue;
6521 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6522 unsigned Lo = I * NumBytesPerElt;
6523 unsigned Hi = Lo + NumBytesPerElt;
6524 // Clear mask to all zeros and insert the shifted byte indices.
6525 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6526 if (ISD::SHL == Opcode)
6527 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6528 else
6529 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6530 Lo + ByteShift);
6531 }
6532 return true;
6533 }
6534 case X86ISD::VSHLI:
6535 case X86ISD::VSRLI: {
6536 uint64_t ShiftVal = N.getConstantOperandVal(1);
6537 // Out of range bit shifts are guaranteed to be zero.
6538 if (NumBitsPerElt <= ShiftVal) {
6539 Mask.append(NumElts, SM_SentinelZero);
6540 return true;
6541 }
6542
6543 // We can only decode 'whole byte' bit shifts as shuffles.
6544 if ((ShiftVal % 8) != 0)
6545 break;
6546
6547 uint64_t ByteShift = ShiftVal / 8;
6548 Ops.push_back(N.getOperand(0));
6549
6550 // Clear mask to all zeros and insert the shifted byte indices.
6551 Mask.append(NumSizeInBytes, SM_SentinelZero);
6552
6553 if (X86ISD::VSHLI == Opcode) {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j] = i + j - ByteShift;
6557 } else {
6558 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6559 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6560 Mask[i + j - ByteShift] = i + j;
6561 }
6562 return true;
6563 }
6564 case X86ISD::VROTLI:
6565 case X86ISD::VROTRI: {
6566 // We can only decode 'whole byte' bit rotates as shuffles.
6567 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6568 if ((RotateVal % 8) != 0)
6569 return false;
6570 Ops.push_back(N.getOperand(0));
6571 int Offset = RotateVal / 8;
6572 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6573 for (int i = 0; i != (int)NumElts; ++i) {
6574 int BaseIdx = i * NumBytesPerElt;
6575 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6576 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6577 }
6578 }
6579 return true;
6580 }
6581 case X86ISD::VBROADCAST: {
6582 SDValue Src = N.getOperand(0);
6583 if (!Src.getSimpleValueType().isVector()) {
6584 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6585 !isNullConstant(Src.getOperand(1)) ||
6586 Src.getOperand(0).getValueType().getScalarType() !=
6587 VT.getScalarType())
6588 return false;
6589 Src = Src.getOperand(0);
6590 }
6591 Ops.push_back(Src);
6592 Mask.append(NumElts, 0);
6593 return true;
6594 }
6595 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6596 SDValue Src = N.getOperand(0);
6597 EVT SrcVT = Src.getValueType();
6598 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6599
6600 // Extended source must be a simple vector.
6601 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6602 (NumBitsPerSrcElt % 8) != 0)
6603 return false;
6604
6605 // We can only handle all-signbits extensions.
6606 APInt DemandedSrcElts =
6607 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6608 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6609 return false;
6610
6611 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6612 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6613 for (unsigned I = 0; I != NumElts; ++I)
6614 Mask.append(Scale, I);
6615 Ops.push_back(Src);
6616 return true;
6617 }
6618 case ISD::ZERO_EXTEND:
6619 case ISD::ANY_EXTEND:
6620 case ISD::ZERO_EXTEND_VECTOR_INREG:
6621 case ISD::ANY_EXTEND_VECTOR_INREG: {
6622 SDValue Src = N.getOperand(0);
6623 EVT SrcVT = Src.getValueType();
6624
6625 // Extended source must be a simple vector.
6626 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6627 (SrcVT.getScalarSizeInBits() % 8) != 0)
6628 return false;
6629
6630 bool IsAnyExtend =
6631 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6632 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6633 IsAnyExtend, Mask);
6634 Ops.push_back(Src);
6635 return true;
6636 }
6637 }
6638
6639 return false;
6640}
6641
6642/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6643 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6644 SmallVectorImpl<int> &Mask) {
6645 int MaskWidth = Mask.size();
6646 SmallVector<SDValue, 16> UsedInputs;
6647 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6648 int lo = UsedInputs.size() * MaskWidth;
6649 int hi = lo + MaskWidth;
6650
6651 // Strip UNDEF input usage.
6652 if (Inputs[i].isUndef())
6653 for (int &M : Mask)
6654 if ((lo <= M) && (M < hi))
6655 M = SM_SentinelUndef;
6656
6657 // Check for unused inputs.
6658 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6659 for (int &M : Mask)
6660 if (lo <= M)
6661 M -= MaskWidth;
6662 continue;
6663 }
6664
6665 // Check for repeated inputs.
6666 bool IsRepeat = false;
6667 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6668 if (UsedInputs[j] != Inputs[i])
6669 continue;
6670 for (int &M : Mask)
6671 if (lo <= M)
6672 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6673 IsRepeat = true;
6674 break;
6675 }
6676 if (IsRepeat)
6677 continue;
6678
6679 UsedInputs.push_back(Inputs[i]);
6680 }
6681 Inputs = UsedInputs;
6682}
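// Worked example (illustrative) of the resolution above: with
// Inputs = {A, B, A} and Mask = {0, 5, 9, 3}, the third input is recognised as
// a repeat of the first, so index 9 (element 1 of the duplicate A) is remapped
// to 1, giving Mask = {0, 5, 1, 3} and Inputs = {A, B}.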
6683
6684/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6685/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6686/// Returns true if the target shuffle mask was decoded.
6687static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6688 SmallVectorImpl<SDValue> &Inputs,
6689 SmallVectorImpl<int> &Mask,
6690 APInt &KnownUndef, APInt &KnownZero,
6691 const SelectionDAG &DAG, unsigned Depth,
6692 bool ResolveKnownElts) {
6693 if (Depth >= SelectionDAG::MaxRecursionDepth)
6694 return false; // Limit search depth.
6695
6696 EVT VT = Op.getValueType();
6697 if (!VT.isSimple() || !VT.isVector())
6698 return false;
6699
6700 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6701 if (ResolveKnownElts)
6702 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6703 return true;
6704 }
6705 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6706 ResolveKnownElts)) {
6707 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6708 return true;
6709 }
6710 return false;
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6714 SmallVectorImpl<SDValue> &Inputs,
6715 SmallVectorImpl<int> &Mask,
6716 const SelectionDAG &DAG, unsigned Depth,
6717 bool ResolveKnownElts) {
6718 APInt KnownUndef, KnownZero;
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6720 KnownZero, DAG, Depth, ResolveKnownElts);
6721}
6722
6723 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6724 SmallVectorImpl<int> &Mask,
6725 const SelectionDAG &DAG, unsigned Depth = 0,
6726 bool ResolveKnownElts = true) {
6727 EVT VT = Op.getValueType();
6728 if (!VT.isSimple() || !VT.isVector())
6729 return false;
6730
6731 unsigned NumElts = Op.getValueType().getVectorNumElements();
6732 APInt DemandedElts = APInt::getAllOnes(NumElts);
6733 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6734 ResolveKnownElts);
6735}
6736
6737// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6738static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6739 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6740 SelectionDAG &DAG) {
6741 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6742 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6743 "Unknown broadcast load type");
6744
6745 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6746 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6747 return SDValue();
6748
6749 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6750 TypeSize::getFixed(Offset), DL);
6751 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6752 SDValue Ops[] = {Mem->getChain(), Ptr};
6753 SDValue BcstLd = DAG.getMemIntrinsicNode(
6754 Opcode, DL, Tys, Ops, MemVT,
6755 DAG.getMachineFunction().getMachineMemOperand(
6756 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6757 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6758 return BcstLd;
6759}
6760
6761/// Returns the scalar element that will make up the i'th
6762/// element of the result of the vector shuffle.
6763static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6764 SelectionDAG &DAG, unsigned Depth) {
6765 if (Depth >= SelectionDAG::MaxRecursionDepth)
6766 return SDValue(); // Limit search depth.
6767
6768 EVT VT = Op.getValueType();
6769 unsigned Opcode = Op.getOpcode();
6770 unsigned NumElems = VT.getVectorNumElements();
6771
6772 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6773 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6774 int Elt = SV->getMaskElt(Index);
6775
6776 if (Elt < 0)
6777 return DAG.getUNDEF(VT.getVectorElementType());
6778
6779 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6780 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6781 }
6782
6783 // Recurse into target specific vector shuffles to find scalars.
6784 if (isTargetShuffle(Opcode)) {
6785 MVT ShufVT = VT.getSimpleVT();
6786 MVT ShufSVT = ShufVT.getVectorElementType();
6787 int NumElems = (int)ShufVT.getVectorNumElements();
6788 SmallVector<int, 16> ShuffleMask;
6789 SmallVector<SDValue, 16> ShuffleOps;
6790 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6791 return SDValue();
6792
6793 int Elt = ShuffleMask[Index];
6794 if (Elt == SM_SentinelZero)
6795 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6796 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6797 if (Elt == SM_SentinelUndef)
6798 return DAG.getUNDEF(ShufSVT);
6799
6800 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6801 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6802 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6803 }
6804
6805 // Recurse into insert_subvector base/sub vector to find scalars.
6806 if (Opcode == ISD::INSERT_SUBVECTOR) {
6807 SDValue Vec = Op.getOperand(0);
6808 SDValue Sub = Op.getOperand(1);
6809 uint64_t SubIdx = Op.getConstantOperandVal(2);
6810 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6811
6812 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6813 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6814 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6815 }
6816
6817 // Recurse into concat_vectors sub vector to find scalars.
6818 if (Opcode == ISD::CONCAT_VECTORS) {
6819 EVT SubVT = Op.getOperand(0).getValueType();
6820 unsigned NumSubElts = SubVT.getVectorNumElements();
6821 uint64_t SubIdx = Index / NumSubElts;
6822 uint64_t SubElt = Index % NumSubElts;
6823 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6824 }
6825
6826 // Recurse into extract_subvector src vector to find scalars.
6827 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6828 SDValue Src = Op.getOperand(0);
6829 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6830 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6831 }
6832
6833 // We only peek through bitcasts of the same vector width.
6834 if (Opcode == ISD::BITCAST) {
6835 SDValue Src = Op.getOperand(0);
6836 EVT SrcVT = Src.getValueType();
6837 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6838 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6839 return SDValue();
6840 }
6841
6842 // Actual nodes that may contain scalar elements
6843
6844 // For insert_vector_elt - either return the index matching scalar or recurse
6845 // into the base vector.
6846 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6847 isa<ConstantSDNode>(Op.getOperand(2))) {
6848 if (Op.getConstantOperandAPInt(2) == Index)
6849 return Op.getOperand(1);
6850 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6851 }
6852
6853 if (Opcode == ISD::SCALAR_TO_VECTOR)
6854 return (Index == 0) ? Op.getOperand(0)
6855 : DAG.getUNDEF(VT.getVectorElementType());
6856
6857 if (Opcode == ISD::BUILD_VECTOR)
6858 return Op.getOperand(Index);
6859
6860 return SDValue();
6861}
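// Worked example (illustrative): for Op = vector_shuffle<4,1,2,7>(A, B) and
// Index = 0, the mask element is 4, so the search recurses into B looking for
// its element 0; if B is a BUILD_VECTOR, that operand is returned directly.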
6862
6863// Use PINSRB/PINSRW/PINSRD to create a build vector.
6864 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6865 const APInt &NonZeroMask,
6866 unsigned NumNonZero, unsigned NumZero,
6867 SelectionDAG &DAG,
6868 const X86Subtarget &Subtarget) {
6869 MVT VT = Op.getSimpleValueType();
6870 unsigned NumElts = VT.getVectorNumElements();
6871 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6872 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6873 "Illegal vector insertion");
6874
6875 SDValue V;
6876 bool First = true;
6877
6878 for (unsigned i = 0; i < NumElts; ++i) {
6879 bool IsNonZero = NonZeroMask[i];
6880 if (!IsNonZero)
6881 continue;
6882
6883 // If the build vector contains zeros or our first insertion is not the
6884 // first index, then insert into a zero vector to break any register
6885 // dependency; else use SCALAR_TO_VECTOR.
6886 if (First) {
6887 First = false;
6888 if (NumZero || 0 != i)
6889 V = getZeroVector(VT, Subtarget, DAG, DL);
6890 else {
6891 assert(0 == i && "Expected insertion into zero-index");
6892 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6893 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6894 V = DAG.getBitcast(VT, V);
6895 continue;
6896 }
6897 }
6898 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6899 DAG.getVectorIdxConstant(i, DL));
6900 }
6901
6902 return V;
6903}
6904
6905/// Custom lower build_vector of v16i8.
6906 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6907 const APInt &NonZeroMask,
6908 unsigned NumNonZero, unsigned NumZero,
6909 SelectionDAG &DAG,
6910 const X86Subtarget &Subtarget) {
6911 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 return SDValue();
6913
6914 // SSE4.1 - use PINSRB to insert each byte directly.
6915 if (Subtarget.hasSSE41())
6916 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6917 DAG, Subtarget);
6918
6919 SDValue V;
6920
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6923 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6924 !NonZeroMask.extractBits(2, 2).isZero()) {
6925 for (unsigned I = 0; I != 4; ++I) {
6926 if (!NonZeroMask[I])
6927 continue;
6928 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6929 if (I != 0)
6930 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6931 DAG.getConstant(I * 8, DL, MVT::i8));
6932 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6933 }
6934 assert(V && "Failed to fold v16i8 vector to zero");
6935 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6936 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6937 V = DAG.getBitcast(MVT::v8i16, V);
6938 }
6939 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6940 bool ThisIsNonZero = NonZeroMask[i];
6941 bool NextIsNonZero = NonZeroMask[i + 1];
6942 if (!ThisIsNonZero && !NextIsNonZero)
6943 continue;
6944
6945 SDValue Elt;
6946 if (ThisIsNonZero) {
6947 if (NumZero || NextIsNonZero)
6948 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6949 else
6950 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6951 }
6952
6953 if (NextIsNonZero) {
6954 SDValue NextElt = Op.getOperand(i + 1);
6955 if (i == 0 && NumZero)
6956 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6957 else
6958 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6959 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6960 DAG.getConstant(8, DL, MVT::i8));
6961 if (ThisIsNonZero)
6962 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6963 else
6964 Elt = NextElt;
6965 }
6966
6967 // If our first insertion is not the first index or zeros are needed, then
6968 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6969 // elements undefined).
6970 if (!V) {
6971 if (i != 0 || NumZero)
6972 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6973 else {
6974 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6975 V = DAG.getBitcast(MVT::v8i16, V);
6976 continue;
6977 }
6978 }
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6981 DAG.getVectorIdxConstant(i / 2, DL));
6982 }
6983
6984 return DAG.getBitcast(MVT::v16i8, V);
6985}
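// Example (illustrative sketch of the pre-SSE4.1 path above, hypothetical
// bytes b[i], b[i+1]): two adjacent non-zero bytes are merged into one 16-bit
// insertion, roughly
//   Elt = zext(b[i]) | (zext(b[i+1]) << 8)
//   V   = insert_vector_elt V, trunc(Elt to i16), i/2   ; PINSRW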
6986
6987/// Custom lower build_vector of v8i16.
6988static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6989                                     const APInt &NonZeroMask,
6990 unsigned NumNonZero, unsigned NumZero,
6991 SelectionDAG &DAG,
6992 const X86Subtarget &Subtarget) {
6993 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6994 return SDValue();
6995
6996  // Use PINSRW to insert each 16-bit element directly.
6997 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6998 Subtarget);
6999}
7000
7001/// Custom lower build_vector of v4i32 or v4f32.
7002static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7003                                     SelectionDAG &DAG,
7004 const X86Subtarget &Subtarget) {
7005 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7006 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7007 // Because we're creating a less complicated build vector here, we may enable
7008 // further folding of the MOVDDUP via shuffle transforms.
7009 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7010 Op.getOperand(0) == Op.getOperand(2) &&
7011 Op.getOperand(1) == Op.getOperand(3) &&
7012 Op.getOperand(0) != Op.getOperand(1)) {
7013 MVT VT = Op.getSimpleValueType();
7014 MVT EltVT = VT.getVectorElementType();
7015 // Create a new build vector with the first 2 elements followed by undef
7016 // padding, bitcast to v2f64, duplicate, and bitcast back.
7017 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7018 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7019 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7020 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7021 return DAG.getBitcast(VT, Dup);
7022 }
7023
7024 // Find all zeroable elements.
7025 std::bitset<4> Zeroable, Undefs;
7026 for (int i = 0; i < 4; ++i) {
7027 SDValue Elt = Op.getOperand(i);
7028 Undefs[i] = Elt.isUndef();
7029 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7030 }
7031 assert(Zeroable.size() - Zeroable.count() > 1 &&
7032 "We expect at least two non-zero elements!");
7033
7034 // We only know how to deal with build_vector nodes where elements are either
7035 // zeroable or extract_vector_elt with constant index.
7036 SDValue FirstNonZero;
7037 unsigned FirstNonZeroIdx;
7038 for (unsigned i = 0; i < 4; ++i) {
7039 if (Zeroable[i])
7040 continue;
7041 SDValue Elt = Op.getOperand(i);
7042 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7043        !isa<ConstantSDNode>(Elt.getOperand(1)))
7044      return SDValue();
7045 // Make sure that this node is extracting from a 128-bit vector.
7046 MVT VT = Elt.getOperand(0).getSimpleValueType();
7047 if (!VT.is128BitVector())
7048 return SDValue();
7049 if (!FirstNonZero.getNode()) {
7050 FirstNonZero = Elt;
7051 FirstNonZeroIdx = i;
7052 }
7053 }
7054
7055 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7056 SDValue V1 = FirstNonZero.getOperand(0);
7057 MVT VT = V1.getSimpleValueType();
7058
7059 // See if this build_vector can be lowered as a blend with zero.
7060 SDValue Elt;
7061 unsigned EltMaskIdx, EltIdx;
7062 int Mask[4];
7063 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7064 if (Zeroable[EltIdx]) {
7065 // The zero vector will be on the right hand side.
7066 Mask[EltIdx] = EltIdx+4;
7067 continue;
7068 }
7069
7070 Elt = Op->getOperand(EltIdx);
7071 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7072 EltMaskIdx = Elt.getConstantOperandVal(1);
7073 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7074 break;
7075 Mask[EltIdx] = EltIdx;
7076 }
7077
7078 if (EltIdx == 4) {
7079 // Let the shuffle legalizer deal with blend operations.
7080 SDValue VZeroOrUndef = (Zeroable == Undefs)
7081 ? DAG.getUNDEF(VT)
7082 : getZeroVector(VT, Subtarget, DAG, DL);
7083 if (V1.getSimpleValueType() != VT)
7084 V1 = DAG.getBitcast(VT, V1);
7085 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7086 }
7087
7088  // See if we can lower this build_vector to an INSERTPS.
7089 if (!Subtarget.hasSSE41())
7090 return SDValue();
7091
7092 SDValue V2 = Elt.getOperand(0);
7093 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7094 V1 = SDValue();
7095
7096 bool CanFold = true;
7097 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7098 if (Zeroable[i])
7099 continue;
7100
7101 SDValue Current = Op->getOperand(i);
7102 SDValue SrcVector = Current->getOperand(0);
7103 if (!V1.getNode())
7104 V1 = SrcVector;
7105 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7106 }
7107
7108 if (!CanFold)
7109 return SDValue();
7110
7111 assert(V1.getNode() && "Expected at least two non-zero elements!");
7112 if (V1.getSimpleValueType() != MVT::v4f32)
7113 V1 = DAG.getBitcast(MVT::v4f32, V1);
7114 if (V2.getSimpleValueType() != MVT::v4f32)
7115 V2 = DAG.getBitcast(MVT::v4f32, V2);
7116
7117 // Ok, we can emit an INSERTPS instruction.
7118 unsigned ZMask = Zeroable.to_ulong();
7119
7120 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7121 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7122 SDValue Result =
7123 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7124 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7125 return DAG.getBitcast(VT, Result);
7126}
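// Note on the INSERTPS immediate built above: bits [7:6] select the source
// element (EltMaskIdx), bits [5:4] select the destination element (EltIdx),
// and bits [3:0] are the zero mask. For example, EltMaskIdx = 2, EltIdx = 1
// and Zeroable = {3} give 0b10'01'1000 == 0x98.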
7127
7128/// Return a vector logical shift node.
7129static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7130 SelectionDAG &DAG, const TargetLowering &TLI,
7131 const SDLoc &dl) {
7132 assert(VT.is128BitVector() && "Unknown type for VShift");
7133 MVT ShVT = MVT::v16i8;
7134 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7135 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7136 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7137 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7138 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7139}
7140
7141static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7142                                      SelectionDAG &DAG) {
7143
7144 // Check if the scalar load can be widened into a vector load. And if
7145 // the address is "base + cst" see if the cst can be "absorbed" into
7146 // the shuffle mask.
7147  if (auto *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7148    SDValue Ptr = LD->getBasePtr();
7149 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7150 return SDValue();
7151 EVT PVT = LD->getValueType(0);
7152 if (PVT != MVT::i32 && PVT != MVT::f32)
7153 return SDValue();
7154
7155 int FI = -1;
7156 int64_t Offset = 0;
7157    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7158      FI = FINode->getIndex();
7159 Offset = 0;
7160 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7161 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7162 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7163 Offset = Ptr.getConstantOperandVal(1);
7164 Ptr = Ptr.getOperand(0);
7165 } else {
7166 return SDValue();
7167 }
7168
7169    // FIXME: 256-bit vector instructions don't require strict alignment;
7170    // improve this code to support them better.
7171 Align RequiredAlign(VT.getSizeInBits() / 8);
7172 SDValue Chain = LD->getChain();
7173 // Make sure the stack object alignment is at least 16 or 32.
7174    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7175    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7176 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7177 if (MFI.isFixedObjectIndex(FI)) {
7178 // Can't change the alignment. FIXME: It's possible to compute
7179 // the exact stack offset and reference FI + adjust offset instead.
7180 // If someone *really* cares about this. That's the way to implement it.
7181 return SDValue();
7182 } else {
7183 MFI.setObjectAlignment(FI, RequiredAlign);
7184 }
7185 }
7186
7187    // (Offset % 16 or 32) must be a multiple of 4. The address is then
7188    // Ptr + (Offset & ~15).
7189 if (Offset < 0)
7190 return SDValue();
7191 if ((Offset % RequiredAlign.value()) & 3)
7192 return SDValue();
7193 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7194 if (StartOffset) {
7195 SDLoc DL(Ptr);
7196 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7197 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7198 }
7199
7200 int EltNo = (Offset - StartOffset) >> 2;
7201 unsigned NumElems = VT.getVectorNumElements();
7202
7203 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7204 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7205 LD->getPointerInfo().getWithOffset(StartOffset));
7206
7207 SmallVector<int, 8> Mask(NumElems, EltNo);
7208
7209 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7210 }
7211
7212 return SDValue();
7213}
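// Example (illustrative, hypothetical stack slot): an f32 load from a 16-byte
// aligned stack object at "base + 8" is widened above into a v4f32 load from
// "base" followed by a splat shuffle of element (8 - 0) >> 2 == 2, i.e. the
// shuffle mask <2,2,2,2>.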
7214
7215// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7216static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7217 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7218 auto *BaseLd = cast<LoadSDNode>(Elt);
7219 if (!BaseLd->isSimple())
7220 return false;
7221 Ld = BaseLd;
7222 ByteOffset = 0;
7223 return true;
7224 }
7225
7226 switch (Elt.getOpcode()) {
7227 case ISD::BITCAST:
7228 case ISD::TRUNCATE:
7229  case ISD::SCALAR_TO_VECTOR:
7230    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7231 case ISD::SRL:
7232 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7233 uint64_t Amt = AmtC->getZExtValue();
7234 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7235 ByteOffset += Amt / 8;
7236 return true;
7237 }
7238 }
7239 break;
7240  case ISD::EXTRACT_VECTOR_ELT:
7241    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7242 SDValue Src = Elt.getOperand(0);
7243 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7244 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7245 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7246 findEltLoadSrc(Src, Ld, ByteOffset)) {
7247 uint64_t Idx = IdxC->getZExtValue();
7248 ByteOffset += Idx * (SrcSizeInBits / 8);
7249 return true;
7250 }
7251 }
7252 break;
7253 }
7254
7255 return false;
7256}
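// Example (illustrative, hypothetical pointer %p): for
//   Elt = (srl (i64 load %p), 32)
// the recursion above returns the load with ByteOffset == 4, i.e. the element
// corresponds to the upper 32 bits of the 64-bit loaded value.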
7257
7258/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7259/// elements can be replaced by a single large load which has the same value as
7260/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7261///
7262/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7263static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7264                                        const SDLoc &DL, SelectionDAG &DAG,
7265 const X86Subtarget &Subtarget,
7266 bool IsAfterLegalize) {
7267 if ((VT.getScalarSizeInBits() % 8) != 0)
7268 return SDValue();
7269
7270 unsigned NumElems = Elts.size();
7271
7272 int LastLoadedElt = -1;
7273 APInt LoadMask = APInt::getZero(NumElems);
7274 APInt ZeroMask = APInt::getZero(NumElems);
7275 APInt UndefMask = APInt::getZero(NumElems);
7276
7277 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7278 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7279
7280 // For each element in the initializer, see if we've found a load, zero or an
7281 // undef.
7282 for (unsigned i = 0; i < NumElems; ++i) {
7283 SDValue Elt = peekThroughBitcasts(Elts[i]);
7284 if (!Elt.getNode())
7285 return SDValue();
7286 if (Elt.isUndef()) {
7287 UndefMask.setBit(i);
7288 continue;
7289 }
7290    if (X86::isZeroNode(Elt)) {
7291      ZeroMask.setBit(i);
7292 continue;
7293 }
7294
7295 // Each loaded element must be the correct fractional portion of the
7296 // requested vector load.
7297 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7298 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7299 return SDValue();
7300
7301 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7302 return SDValue();
7303 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7304 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7305 return SDValue();
7306
7307 LoadMask.setBit(i);
7308 LastLoadedElt = i;
7309 }
7310 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7311 NumElems &&
7312 "Incomplete element masks");
7313
7314 // Handle Special Cases - all undef or undef/zero.
7315 if (UndefMask.popcount() == NumElems)
7316 return DAG.getUNDEF(VT);
7317 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7318 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7319 : DAG.getConstantFP(0.0, DL, VT);
7320
7321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7322 int FirstLoadedElt = LoadMask.countr_zero();
7323 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7324 EVT EltBaseVT = EltBase.getValueType();
7325 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7326 "Register/Memory size mismatch");
7327 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7328 assert(LDBase && "Did not find base load for merging consecutive loads");
7329 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7330 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7331 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7332 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7333 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7334
7335 // TODO: Support offsetting the base load.
7336 if (ByteOffsets[FirstLoadedElt] != 0)
7337 return SDValue();
7338
7339 // Check to see if the element's load is consecutive to the base load
7340 // or offset from a previous (already checked) load.
7341 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7342 LoadSDNode *Ld = Loads[EltIdx];
7343 int64_t ByteOffset = ByteOffsets[EltIdx];
7344 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7345 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7346 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7347 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7348 }
7349 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7350 EltIdx - FirstLoadedElt);
7351 };
7352
7353 // Consecutive loads can contain UNDEFS but not ZERO elements.
7354  // Consecutive loads with UNDEF and ZERO elements require an
7355  // additional shuffle stage to clear the ZERO elements.
7356 bool IsConsecutiveLoad = true;
7357 bool IsConsecutiveLoadWithZeros = true;
7358 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7359 if (LoadMask[i]) {
7360 if (!CheckConsecutiveLoad(LDBase, i)) {
7361 IsConsecutiveLoad = false;
7362 IsConsecutiveLoadWithZeros = false;
7363 break;
7364 }
7365 } else if (ZeroMask[i]) {
7366 IsConsecutiveLoad = false;
7367 }
7368 }
7369
7370 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7371 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7372 assert(LDBase->isSimple() &&
7373 "Cannot merge volatile or atomic loads.");
7374 SDValue NewLd =
7375 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7376 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7377 for (auto *LD : Loads)
7378 if (LD)
7379 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7380 return NewLd;
7381 };
7382
7383 // Check if the base load is entirely dereferenceable.
7384 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7385 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7386
7387 // LOAD - all consecutive load/undefs (must start/end with a load or be
7388 // entirely dereferenceable). If we have found an entire vector of loads and
7389 // undefs, then return a large load of the entire vector width starting at the
7390 // base pointer. If the vector contains zeros, then attempt to shuffle those
7391 // elements.
7392 if (FirstLoadedElt == 0 &&
7393 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7394 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7395 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7396 return SDValue();
7397
7398 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7399 // will lower to regular temporal loads and use the cache.
7400 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7401 VT.is256BitVector() && !Subtarget.hasInt256())
7402 return SDValue();
7403
7404 if (NumElems == 1)
7405 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7406
7407 if (!ZeroMask)
7408 return CreateLoad(VT, LDBase);
7409
7410 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7411 // vector and a zero vector to clear out the zero elements.
7412 if (!IsAfterLegalize && VT.isVector()) {
7413 unsigned NumMaskElts = VT.getVectorNumElements();
7414 if ((NumMaskElts % NumElems) == 0) {
7415 unsigned Scale = NumMaskElts / NumElems;
7416 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7417 for (unsigned i = 0; i < NumElems; ++i) {
7418 if (UndefMask[i])
7419 continue;
7420 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7421 for (unsigned j = 0; j != Scale; ++j)
7422 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7423 }
7424 SDValue V = CreateLoad(VT, LDBase);
7425 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7426 : DAG.getConstantFP(0.0, DL, VT);
7427 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7428 }
7429 }
7430 }
7431
7432 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7433 if (VT.is256BitVector() || VT.is512BitVector()) {
7434 unsigned HalfNumElems = NumElems / 2;
7435 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7436 EVT HalfVT =
7437 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7438 SDValue HalfLD =
7439 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7440 DAG, Subtarget, IsAfterLegalize);
7441 if (HalfLD)
7442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7443 HalfLD, DAG.getVectorIdxConstant(0, DL));
7444 }
7445 }
7446
7447 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7448 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7449 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7450 LoadSizeInBits == 64) &&
7451 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7452 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7453 : MVT::getIntegerVT(LoadSizeInBits);
7454 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7455 // Allow v4f32 on SSE1 only targets.
7456 // FIXME: Add more isel patterns so we can just use VT directly.
7457 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7458 VecVT = MVT::v4f32;
7459 if (TLI.isTypeLegal(VecVT)) {
7460 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7461 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7462 SDValue ResNode = DAG.getMemIntrinsicNode(
7463 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7464          LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7465      for (auto *LD : Loads)
7466 if (LD)
7467 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7468 return DAG.getBitcast(VT, ResNode);
7469 }
7470 }
7471
7472 // BROADCAST - match the smallest possible repetition pattern, load that
7473 // scalar/subvector element and then broadcast to the entire vector.
7474 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7475 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7476 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7477 unsigned RepeatSize = SubElems * BaseSizeInBits;
7478 unsigned ScalarSize = std::min(RepeatSize, 64u);
7479 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7480 continue;
7481
7482 // Don't attempt a 1:N subvector broadcast - it should be caught by
7483      // combineConcatVectorOps; otherwise it will cause infinite loops.
7484 if (RepeatSize > ScalarSize && SubElems == 1)
7485 continue;
7486
7487 bool Match = true;
7488 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7489 for (unsigned i = 0; i != NumElems && Match; ++i) {
7490 if (!LoadMask[i])
7491 continue;
7492 SDValue Elt = peekThroughBitcasts(Elts[i]);
7493 if (RepeatedLoads[i % SubElems].isUndef())
7494 RepeatedLoads[i % SubElems] = Elt;
7495 else
7496 Match &= (RepeatedLoads[i % SubElems] == Elt);
7497 }
7498
7499 // We must have loads at both ends of the repetition.
7500 Match &= !RepeatedLoads.front().isUndef();
7501 Match &= !RepeatedLoads.back().isUndef();
7502 if (!Match)
7503 continue;
7504
7505 EVT RepeatVT =
7506 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7507 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7508 : EVT::getFloatingPointVT(ScalarSize);
7509 if (RepeatSize > ScalarSize)
7510 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7511 RepeatSize / ScalarSize);
7512 EVT BroadcastVT =
7513 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7514 VT.getSizeInBits() / ScalarSize);
7515 if (TLI.isTypeLegal(BroadcastVT)) {
7516 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7517 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7518 SDValue Broadcast = RepeatLoad;
7519 if (RepeatSize > ScalarSize) {
7520 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7521 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7522 } else {
7523 if (!Subtarget.hasAVX2() &&
7524              !X86::mayFoldLoadIntoBroadcastFromMem(
7525                  RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7526 Subtarget,
7527 /*AssumeSingleUse=*/true))
7528 return SDValue();
7529 Broadcast =
7530 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7531 }
7532 return DAG.getBitcast(VT, Broadcast);
7533 }
7534 }
7535 }
7536 }
7537
7538 return SDValue();
7539}
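// Example (illustrative, hypothetical array a): the initializer
// <load a[0], load a[1], load a[0], load a[1]> matches the repetition loop
// above with SubElems == 2, so the two-element pattern is typically loaded
// once and then broadcast (or concatenated) up to the full vector width.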
7540
7541// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7542// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7543// are consecutive, non-overlapping, and in the right order.
7544static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7545                                         SelectionDAG &DAG,
7546 const X86Subtarget &Subtarget,
7547 bool IsAfterLegalize) {
7548  SmallVector<SDValue, 64> Elts;
7549  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7550 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7551 Elts.push_back(Elt);
7552 continue;
7553 }
7554 return SDValue();
7555 }
7556 assert(Elts.size() == VT.getVectorNumElements());
7557 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7558 IsAfterLegalize);
7559}
7560
7561static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7562                                   const APInt &Undefs, LLVMContext &C) {
7563 unsigned ScalarSize = VT.getScalarSizeInBits();
7564 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7565
7566 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7567 if (VT.isFloatingPoint()) {
7568 if (ScalarSize == 16)
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7570 if (ScalarSize == 32)
7571 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7572 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7573 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7574 }
7575 return Constant::getIntegerValue(Ty, Val);
7576 };
7577
7578 SmallVector<Constant *, 32> ConstantVec;
7579 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7580 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7581 : getConstantScalar(Bits[I]));
7582
7583 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7584}
7585
7586static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7587 unsigned SplatBitSize, LLVMContext &C) {
7588 unsigned ScalarSize = VT.getScalarSizeInBits();
7589
7590 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7591 if (VT.isFloatingPoint()) {
7592 if (ScalarSize == 16)
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7594 if (ScalarSize == 32)
7595 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7596 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7597 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7598 }
7599 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7600 };
7601
7602 if (ScalarSize == SplatBitSize)
7603 return getConstantScalar(SplatValue);
7604
7605 unsigned NumElm = SplatBitSize / ScalarSize;
7606 SmallVector<Constant *, 32> ConstantVec;
7607 for (unsigned I = 0; I != NumElm; ++I) {
7608 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7609 ConstantVec.push_back(getConstantScalar(Val));
7610 }
7611 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7612}
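// Example (illustrative values): with VT = v8i32, SplatValue =
// 0x0000000100000000 and SplatBitSize = 64, the helper above produces the
// two-element constant <i32 0, i32 1>, which is later broadcast to fill the
// vector.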
7613
7614static bool isFoldableUseOfShuffle(SDNode *N) {
7615  for (auto *U : N->users()) {
7616 unsigned Opc = U->getOpcode();
7617 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7618 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7619 return false;
7620 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7621 return false;
7622 if (isTargetShuffle(Opc))
7623 return true;
7624 if (Opc == ISD::BITCAST) // Ignore bitcasts
7625 return isFoldableUseOfShuffle(U);
7626 if (N->hasOneUse()) {
7627 // TODO, there may be some general way to know if a SDNode can
7628 // be folded. We now only know whether an MI is foldable.
7629 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7630 return false;
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637// If the node has a single use by a VSELECT then AVX512 targets may be able to
7638// fold it as a predicated instruction.
7639static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7640 unsigned SizeInBits = V.getValueSizeInBits();
7641 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7642 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7643 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7644 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7645 return true;
7646 }
7647 }
7648 return false;
7649}
7650
7651/// Attempt to use the vbroadcast instruction to generate a splat value
7652/// from a splat BUILD_VECTOR which uses:
7653/// a. A single scalar load, or a constant.
7654/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7655///
7656/// The VBROADCAST node is returned when a pattern is found,
7657/// or SDValue() otherwise.
7658static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7659                                           const SDLoc &dl,
7660 const X86Subtarget &Subtarget,
7661 SelectionDAG &DAG) {
7662 // VBROADCAST requires AVX.
7663 // TODO: Splats could be generated for non-AVX CPUs using SSE
7664 // instructions, but there's less potential gain for only 128-bit vectors.
7665 if (!Subtarget.hasAVX())
7666 return SDValue();
7667
7668 MVT VT = BVOp->getSimpleValueType(0);
7669 unsigned NumElts = VT.getVectorNumElements();
7670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7672 "Unsupported vector type for broadcast.");
7673
7674 // See if the build vector is a repeating sequence of scalars (inc. splat).
7675 SDValue Ld;
7676 BitVector UndefElements;
7677 SmallVector<SDValue, 16> Sequence;
7678 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7679 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7680 if (Sequence.size() == 1)
7681 Ld = Sequence[0];
7682 }
7683
7684 // Attempt to use VBROADCASTM
7685 // From this pattern:
7686 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7687 // b. t1 = (build_vector t0 t0)
7688 //
7689 // Create (VBROADCASTM v2i1 X)
7690 if (!Sequence.empty() && Subtarget.hasCDI()) {
7691 // If not a splat, are the upper sequence values zeroable?
7692 unsigned SeqLen = Sequence.size();
7693 bool UpperZeroOrUndef =
7694 SeqLen == 1 ||
7695 llvm::all_of(ArrayRef(Sequence).drop_front(),
7696 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7697 SDValue Op0 = Sequence[0];
7698 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7699 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7700 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7701 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7702 ? Op0.getOperand(0)
7703 : Op0.getOperand(0).getOperand(0);
7704 MVT MaskVT = BOperand.getSimpleValueType();
7705 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7706 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7707 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7708 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7709 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7710 unsigned Scale = 512 / VT.getSizeInBits();
7711 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7712 }
7713 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7714 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7715 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7716 return DAG.getBitcast(VT, Bcst);
7717 }
7718 }
7719 }
7720
7721 unsigned NumUndefElts = UndefElements.count();
7722 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7723 APInt SplatValue, Undef;
7724 unsigned SplatBitSize;
7725 bool HasUndef;
7726 // Check if this is a repeated constant pattern suitable for broadcasting.
7727 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7728 SplatBitSize > VT.getScalarSizeInBits() &&
7729 SplatBitSize < VT.getSizeInBits()) {
7730 // Avoid replacing with broadcast when it's a use of a shuffle
7731 // instruction to preserve the present custom lowering of shuffles.
7732 if (isFoldableUseOfShuffle(BVOp))
7733 return SDValue();
7734 // replace BUILD_VECTOR with broadcast of the repeated constants.
7735 LLVMContext *Ctx = DAG.getContext();
7736 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7737 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7738 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7739 // Load the constant scalar/subvector and broadcast it.
7740 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7741 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7742 SDValue CP = DAG.getConstantPool(C, PVT);
7743 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7744
7745 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7746 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7747 SDValue Ops[] = {DAG.getEntryNode(), CP};
7748 MachinePointerInfo MPI =
7749            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7750        SDValue Brdcst =
7751            DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7752                                    MPI, Alignment, MachineMemOperand::MOLoad);
7753 return DAG.getBitcast(VT, Brdcst);
7754 }
7755 if (SplatBitSize > 64) {
7756 // Load the vector of constants and broadcast it.
7757 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7758 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7759 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7760 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7761 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7762 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7763 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7764 MachinePointerInfo MPI =
7765            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7766        return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7767                                       Ops, VVT, MPI, Alignment,
7768                                       MachineMemOperand::MOLoad);
7769      }
7770 }
7771
7772 // If we are moving a scalar into a vector (Ld must be set and all elements
7773 // but 1 are undef) and that operation is not obviously supported by
7774 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7775 // That's better than general shuffling and may eliminate a load to GPR and
7776 // move from scalar to vector register.
7777 if (!Ld || NumElts - NumUndefElts != 1)
7778 return SDValue();
7779 unsigned ScalarSize = Ld.getValueSizeInBits();
7780 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7781 return SDValue();
7782 }
7783
7784 bool ConstSplatVal =
7785 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7786 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7787
7788 // TODO: Handle broadcasts of non-constant sequences.
7789
7790 // Make sure that all of the users of a non-constant load are from the
7791 // BUILD_VECTOR node.
7792 // FIXME: Is the use count needed for non-constant, non-load case?
7793 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7794 return SDValue();
7795
7796 unsigned ScalarSize = Ld.getValueSizeInBits();
7797 bool IsGE256 = (VT.getSizeInBits() >= 256);
7798
7799 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7800 // instruction to save 8 or more bytes of constant pool data.
7801 // TODO: If multiple splats are generated to load the same constant,
7802 // it may be detrimental to overall size. There needs to be a way to detect
7803 // that condition to know if this is truly a size win.
7804 bool OptForSize = DAG.shouldOptForSize();
7805
7806 // Handle broadcasting a single constant scalar from the constant pool
7807 // into a vector.
7808 // On Sandybridge (no AVX2), it is still better to load a constant vector
7809 // from the constant pool and not to broadcast it from a scalar.
7810 // But override that restriction when optimizing for size.
7811 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7812 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7813 EVT CVT = Ld.getValueType();
7814 assert(!CVT.isVector() && "Must not broadcast a vector type");
7815
7816 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7817 // For size optimization, also splat v2f64 and v2i64, and for size opt
7818 // with AVX2, also splat i8 and i16.
7819 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7820 if (ScalarSize == 32 ||
7821 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7822 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7823 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7824 const Constant *C = nullptr;
7825      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7826        C = CI->getConstantIntValue();
7827      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7828        C = CF->getConstantFPValue();
7829
7830 assert(C && "Invalid constant type");
7831
7832 SDValue CP =
7833          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7834      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7835
7836 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7837 SDValue Ops[] = {DAG.getEntryNode(), CP};
7838 MachinePointerInfo MPI =
7839          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7840      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7841 MPI, Alignment, MachineMemOperand::MOLoad);
7842 }
7843 }
7844
7845 // Handle AVX2 in-register broadcasts.
7846 if (!IsLoad && Subtarget.hasInt256() &&
7847 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7848 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7849
7850 // The scalar source must be a normal load.
7851 if (!IsLoad)
7852 return SDValue();
7853
7854 // Make sure the non-chain result is only used by this build vector.
7855 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7856 return SDValue();
7857
7858 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7859 (Subtarget.hasVLX() && ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865                                LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870  // The integer check is needed for the 64-bit scalar into 128-bit vector case,
7871  // so that it doesn't match double, since there is no vbroadcastsd xmm.
7872 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7873 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7874 auto *LN = cast<LoadSDNode>(Ld);
7875 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7876 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7877 SDValue BCast =
7878        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7879                                LN->getMemoryVT(), LN->getMemOperand());
7880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7881 return BCast;
7882 }
7883
7884 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7886
7887 // Unsupported broadcast.
7888 return SDValue();
7889}
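// Example (illustrative): on AVX targets a splat build_vector of a single
// scalar f32 load is typically emitted by the code above as a VBROADCAST_LOAD
// of that address, replacing the scalar load plus shuffle with one broadcast
// memory operation (subject to the profitability checks above).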
7890
7891/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7892/// underlying vector and index.
7893///
7894/// Modifies \p ExtractedFromVec to the real vector and returns the real
7895/// index.
7896static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7897 SDValue ExtIdx) {
7898 int Idx = ExtIdx->getAsZExtVal();
7899 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7900 return Idx;
7901
7902 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7903 // lowered this:
7904 // (extract_vector_elt (v8f32 %1), Constant<6>)
7905 // to:
7906 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7907 // (extract_subvector (v8f32 %0), Constant<4>),
7908 // undef)
7909 // Constant<0>)
7910 // In this case the vector is the extract_subvector expression and the index
7911 // is 2, as specified by the shuffle.
7912 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7913 SDValue ShuffleVec = SVOp->getOperand(0);
7914 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7915 assert(ShuffleVecVT.getVectorElementType() ==
7916 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7917
7918 int ShuffleIdx = SVOp->getMaskElt(Idx);
7919 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7920 ExtractedFromVec = ShuffleVec;
7921 return ShuffleIdx;
7922 }
7923 return Idx;
7924}
7925
7926static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7927                                      SelectionDAG &DAG) {
7928 MVT VT = Op.getSimpleValueType();
7929
7930 // Skip if insert_vec_elt is not supported.
7931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7932  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7933    return SDValue();
7934
7935 unsigned NumElems = Op.getNumOperands();
7936 SDValue VecIn1;
7937 SDValue VecIn2;
7938 SmallVector<unsigned, 4> InsertIndices;
7939 SmallVector<int, 8> Mask(NumElems, -1);
7940
7941 for (unsigned i = 0; i != NumElems; ++i) {
7942 unsigned Opc = Op.getOperand(i).getOpcode();
7943
7944 if (Opc == ISD::UNDEF)
7945 continue;
7946
7947    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7948      // Quit if more than 1 element needs inserting.
7949 if (InsertIndices.size() > 1)
7950 return SDValue();
7951
7952 InsertIndices.push_back(i);
7953 continue;
7954 }
7955
7956 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7957 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7958
7959 // Quit if non-constant index.
7960 if (!isa<ConstantSDNode>(ExtIdx))
7961 return SDValue();
7962 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7963
7964 // Quit if extracted from vector of different type.
7965 if (ExtractedFromVec.getValueType() != VT)
7966 return SDValue();
7967
7968 if (!VecIn1.getNode())
7969 VecIn1 = ExtractedFromVec;
7970 else if (VecIn1 != ExtractedFromVec) {
7971 if (!VecIn2.getNode())
7972 VecIn2 = ExtractedFromVec;
7973 else if (VecIn2 != ExtractedFromVec)
7974 // Quit if more than 2 vectors to shuffle
7975 return SDValue();
7976 }
7977
7978 if (ExtractedFromVec == VecIn1)
7979 Mask[i] = Idx;
7980 else if (ExtractedFromVec == VecIn2)
7981 Mask[i] = Idx + NumElems;
7982 }
7983
7984 if (!VecIn1.getNode())
7985 return SDValue();
7986
7987 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7988 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7989
7990 for (unsigned Idx : InsertIndices)
7991 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7992 DAG.getVectorIdxConstant(Idx, DL));
7993
7994 return NV;
7995}
7996
7997// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7998static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7999                                       const X86Subtarget &Subtarget) {
8000 MVT VT = Op.getSimpleValueType();
8001 MVT IVT =
8002 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8003  SmallVector<SDValue, 16> NewOps;
8004  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8005 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8006 Op.getOperand(I)));
8007 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8008 return DAG.getBitcast(VT, Res);
8009}
8010
8011// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8012static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8013                                     SelectionDAG &DAG,
8014 const X86Subtarget &Subtarget) {
8015
8016 MVT VT = Op.getSimpleValueType();
8017 assert((VT.getVectorElementType() == MVT::i1) &&
8018 "Unexpected type in LowerBUILD_VECTORvXi1!");
8019 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8020 ISD::isBuildVectorAllOnes(Op.getNode()))
8021 return Op;
8022
8023 uint64_t Immediate = 0;
8024 SmallVector<unsigned, 16> NonConstIdx;
8025 bool IsSplat = true;
8026 bool HasConstElts = false;
8027 int SplatIdx = -1;
8028 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8029 SDValue In = Op.getOperand(idx);
8030 if (In.isUndef())
8031 continue;
8032 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8033 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8034 HasConstElts = true;
8035 } else {
8036 NonConstIdx.push_back(idx);
8037 }
8038 if (SplatIdx < 0)
8039 SplatIdx = idx;
8040 else if (In != Op.getOperand(SplatIdx))
8041 IsSplat = false;
8042 }
8043
8044  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8045 if (IsSplat) {
8046 // The build_vector allows the scalar element to be larger than the vector
8047 // element type. We need to mask it to use as a condition unless we know
8048 // the upper bits are zero.
8049 // FIXME: Use computeKnownBits instead of checking specific opcode?
8050 SDValue Cond = Op.getOperand(SplatIdx);
8051 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8052 if (Cond.getOpcode() != ISD::SETCC)
8053 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8054 DAG.getConstant(1, dl, MVT::i8));
8055
8056 // Perform the select in the scalar domain so we can use cmov.
8057 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8058 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8059 DAG.getAllOnesConstant(dl, MVT::i32),
8060 DAG.getConstant(0, dl, MVT::i32));
8061 Select = DAG.getBitcast(MVT::v32i1, Select);
8062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8063 } else {
8064 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8065 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8066 DAG.getAllOnesConstant(dl, ImmVT),
8067 DAG.getConstant(0, dl, ImmVT));
8068 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8069 Select = DAG.getBitcast(VecVT, Select);
8070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8071 DAG.getVectorIdxConstant(0, dl));
8072 }
8073 }
8074
8075 // insert elements one by one
8076 SDValue DstVec;
8077 if (HasConstElts) {
8078 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8079 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8080 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8081 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8082 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8083 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8084 } else {
8085 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8086 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8087 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8088 DstVec = DAG.getBitcast(VecVT, Imm);
8089 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8090 DAG.getVectorIdxConstant(0, dl));
8091 }
8092 } else
8093 DstVec = DAG.getUNDEF(VT);
8094
8095 for (unsigned InsertIdx : NonConstIdx) {
8096 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8097 Op.getOperand(InsertIdx),
8098 DAG.getVectorIdxConstant(InsertIdx, dl));
8099 }
8100 return DstVec;
8101}
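// Example (illustrative constant): the constant v8i1 <1,0,1,1,0,0,0,0>
// accumulates Immediate = 0b00001101 in the loop above and is roughly
// materialised as a bitcast of the i8 immediate 13 to v8i1.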
8102
8103LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8104 switch (Opcode) {
8105 case X86ISD::PACKSS:
8106 case X86ISD::PACKUS:
8107 case X86ISD::FHADD:
8108 case X86ISD::FHSUB:
8109 case X86ISD::HADD:
8110 case X86ISD::HSUB:
8111 return true;
8112 }
8113 return false;
8114}
8115
8116/// This is a helper function of LowerToHorizontalOp().
8117/// This function checks that the build_vector \p N in input implements a
8118/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8119/// may not match the layout of an x86 256-bit horizontal instruction.
8120/// In other words, if this returns true, then some extraction/insertion will
8121/// be required to produce a valid horizontal instruction.
8122///
8123/// Parameter \p Opcode defines the kind of horizontal operation to match.
8124/// For example, if \p Opcode is equal to ISD::ADD, then this function
8125/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8126/// is equal to ISD::SUB, then this function checks if this is a horizontal
8127/// arithmetic sub.
8128///
8129/// This function only analyzes elements of \p N whose indices are
8130/// in range [BaseIdx, LastIdx).
8131///
8132/// TODO: This function was originally used to match both real and fake partial
8133/// horizontal operations, but the index-matching logic is incorrect for that.
8134/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8135/// code because it is only used for partial h-op matching now?
8136static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8137 const SDLoc &DL, SelectionDAG &DAG,
8138 unsigned BaseIdx, unsigned LastIdx,
8139 SDValue &V0, SDValue &V1) {
8140 EVT VT = N->getValueType(0);
8141 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8142 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8143 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8144 "Invalid Vector in input!");
8145
8146 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8147 bool CanFold = true;
8148 unsigned ExpectedVExtractIdx = BaseIdx;
8149 unsigned NumElts = LastIdx - BaseIdx;
8150 V0 = DAG.getUNDEF(VT);
8151 V1 = DAG.getUNDEF(VT);
8152
8153 // Check if N implements a horizontal binop.
8154 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8155 SDValue Op = N->getOperand(i + BaseIdx);
8156
8157 // Skip UNDEFs.
8158 if (Op->isUndef()) {
8159 // Update the expected vector extract index.
8160 if (i * 2 == NumElts)
8161 ExpectedVExtractIdx = BaseIdx;
8162 ExpectedVExtractIdx += 2;
8163 continue;
8164 }
8165
8166 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8167
8168 if (!CanFold)
8169 break;
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173
8174 // Try to match the following pattern:
8175 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8176 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8178               Op0.getOperand(0) == Op1.getOperand(0) &&
8179               isa<ConstantSDNode>(Op0.getOperand(1)) &&
8180               isa<ConstantSDNode>(Op1.getOperand(1)));
8181    if (!CanFold)
8182 break;
8183
8184 unsigned I0 = Op0.getConstantOperandVal(1);
8185 unsigned I1 = Op1.getConstantOperandVal(1);
8186
8187 if (i * 2 < NumElts) {
8188 if (V0.isUndef()) {
8189 V0 = Op0.getOperand(0);
8190 if (V0.getValueType() != VT)
8191 return false;
8192 }
8193 } else {
8194 if (V1.isUndef()) {
8195 V1 = Op0.getOperand(0);
8196 if (V1.getValueType() != VT)
8197 return false;
8198 }
8199 if (i * 2 == NumElts)
8200 ExpectedVExtractIdx = BaseIdx;
8201 }
8202
8203 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8204 if (I0 == ExpectedVExtractIdx)
8205 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8206 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8207 // Try to match the following dag sequence:
8208 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8209 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8210 } else
8211 CanFold = false;
8212
8213 ExpectedVExtractIdx += 2;
8214 }
8215
8216 return CanFold;
8217}
8218
8219/// Emit a sequence of two 128-bit horizontal add/sub followed by
8220/// a concat_vector.
8221///
8222/// This is a helper function of LowerToHorizontalOp().
8223/// This function expects two 256-bit vectors called V0 and V1.
8224/// At first, each vector is split into two separate 128-bit vectors.
8225/// Then, the resulting 128-bit vectors are used to implement two
8226/// horizontal binary operations.
8227///
8228/// The kind of horizontal binary operation is defined by \p X86Opcode.
8229///
8230/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8231/// the two new horizontal binop.
8232/// When Mode is set, the first horizontal binop dag node would take as input
8233/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8234/// horizontal binop dag node would take as input the lower 128-bit of V1
8235/// and the upper 128-bit of V1.
8236/// Example:
8237/// HADD V0_LO, V0_HI
8238/// HADD V1_LO, V1_HI
8239///
8240/// Otherwise, the first horizontal binop dag node takes as input the lower
8241/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8242/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8243/// Example:
8244/// HADD V0_LO, V1_LO
8245/// HADD V0_HI, V1_HI
8246///
8247/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8248/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8249/// the upper 128-bits of the result.
8250static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8251 const SDLoc &DL, SelectionDAG &DAG,
8252 unsigned X86Opcode, bool Mode,
8253 bool isUndefLO, bool isUndefHI) {
8254 MVT VT = V0.getSimpleValueType();
8255 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8256 "Invalid nodes in input!");
8257
8258 unsigned NumElts = VT.getVectorNumElements();
8259 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8260 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8261 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8262 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8263 MVT NewVT = V0_LO.getSimpleValueType();
8264
8265 SDValue LO = DAG.getUNDEF(NewVT);
8266 SDValue HI = DAG.getUNDEF(NewVT);
8267
8268 if (Mode) {
8269 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8270 if (!isUndefLO && !V0->isUndef())
8271 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8272 if (!isUndefHI && !V1->isUndef())
8273 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8274 } else {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8278
8279 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8280 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8281 }
8282
8283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8284}
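// Example (illustrative): with Mode unset, X86Opcode == X86ISD::FHADD and
// v8f32 inputs, the helper above roughly emits
//   LO = FHADD V0[0..3], V1[0..3]
//   HI = FHADD V0[4..7], V1[4..7]
//   result = concat_vectors LO, HI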
8285
8286/// Returns true iff \p BV builds a vector with the result equivalent to
8287/// the result of ADDSUB/SUBADD operation.
8288/// If true is returned, the operands of the ADDSUB (Opnd0 +- Opnd1) or
8289/// SUBADD (Opnd0 -+ Opnd1) operation are written to the parameters
8290/// \p Opnd0 and \p Opnd1.
8291static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8292                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
8293 SDValue &Opnd0, SDValue &Opnd1,
8294 unsigned &NumExtracts, bool &IsSubAdd,
8295 bool &HasAllowContract) {
8296 using namespace SDPatternMatch;
8297
8298 MVT VT = BV->getSimpleValueType(0);
8299 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8300 return false;
8301
8302 unsigned NumElts = VT.getVectorNumElements();
8303 SDValue InVec0 = DAG.getUNDEF(VT);
8304 SDValue InVec1 = DAG.getUNDEF(VT);
8305
8306 NumExtracts = 0;
8307 HasAllowContract = NumElts != 0;
8308
8309 // Odd-numbered elements in the input build vector are obtained from
8310 // adding/subtracting two integer/float elements.
8311 // Even-numbered elements in the input build vector are obtained from
8312 // subtracting/adding two integer/float elements.
8313 unsigned Opc[2] = {0, 0};
8314 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8315 SDValue Op = BV->getOperand(i);
8316
8317 // Skip 'undef' values.
8318 unsigned Opcode = Op.getOpcode();
8319 if (Opcode == ISD::UNDEF)
8320 continue;
8321
8322 // Early exit if we found an unexpected opcode.
8323 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8324 return false;
8325
8326 SDValue Op0 = Op.getOperand(0);
8327 SDValue Op1 = Op.getOperand(1);
8328
8329 // Try to match the following pattern:
8330 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8331 // Early exit if we cannot match that sequence.
8332 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8333 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8334 return false;
8335
8336    // We found a valid add/sub node; make sure it's the same opcode as previous
8337 // elements for this parity.
8338 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8339 return false;
8340 Opc[i % 2] = Opcode;
8341
8342 // Update InVec0 and InVec1.
8343 if (InVec0.isUndef())
8344 InVec0 = Op0.getOperand(0);
8345 if (InVec1.isUndef())
8346 InVec1 = Op1.getOperand(0);
8347
8348 // Make sure that operands in input to each add/sub node always
8349    // come from the same pair of vectors.
8350 if (InVec0 != Op0.getOperand(0)) {
8351 if (Opcode == ISD::FSUB)
8352 return false;
8353
8354 // FADD is commutable. Try to commute the operands
8355 // and then test again.
8356 std::swap(Op0, Op1);
8357 if (InVec0 != Op0.getOperand(0))
8358 return false;
8359 }
8360
8361 if (InVec1 != Op1.getOperand(0))
8362 return false;
8363
8364 // Increment the number of extractions done.
8365 ++NumExtracts;
8366 HasAllowContract &= Op->getFlags().hasAllowContract();
8367 }
8368
8369 // Ensure we have found an opcode for both parities and that they are
8370 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8371 // inputs are undef.
8372 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8373 InVec0.isUndef() || InVec1.isUndef())
8374 return false;
8375
8376 IsSubAdd = Opc[0] == ISD::FADD;
8377
8378 Opnd0 = InVec0;
8379 Opnd1 = InVec1;
8380 return true;
8381}
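// Example (illustrative, hypothetical vectors a and b) of the v4f32 pattern
// recognised above:
//   e0 = fsub a[0], b[0]    e1 = fadd a[1], b[1]
//   e2 = fsub a[2], b[2]    e3 = fadd a[3], b[3]
// Here Opc[0] == FSUB and Opc[1] == FADD, so IsSubAdd is false and the node
// can become X86ISD::ADDSUB(a, b) (subtract in even lanes, add in odd lanes).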
8382
8383/// Returns true if it is possible to fold MUL and an idiom that has already been
8384/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8385/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8386/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8387///
8388/// Prior to calling this function it should be known that there is some
8389/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8390/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8391/// before replacement of such SDNode with ADDSUB operation. Thus the number
8392/// of \p Opnd0 uses is expected to be equal to 2.
8393/// For example, this function may be called for the following IR:
8394/// %AB = fmul fast <2 x double> %A, %B
8395/// %Sub = fsub fast <2 x double> %AB, %C
8396/// %Add = fadd fast <2 x double> %AB, %C
8397/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8398/// <2 x i32> <i32 0, i32 3>
8399/// There is a def for %Addsub here, which potentially can be replaced by
8400/// X86ISD::ADDSUB operation:
8401/// %Addsub = X86ISD::ADDSUB %AB, %C
8402/// and such ADDSUB can further be replaced with FMADDSUB:
8403/// %Addsub = FMADDSUB %A, %B, %C.
8404///
8405/// The main reason why this method is called before the replacement of the
8406/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8407/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8408/// FMADDSUB is.
8409static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8410 SelectionDAG &DAG, SDValue &Opnd0,
8411 SDValue &Opnd1, SDValue &Opnd2,
8412 unsigned ExpectedUses,
8413 bool AllowSubAddOrAddSubContract) {
8414 if (Opnd0.getOpcode() != ISD::FMUL ||
8415 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8416 return false;
8417
8418 // FIXME: These checks must match the similar ones in
8419 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8420 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8421 // or MUL + ADDSUB to FMADDSUB.
8422 const TargetOptions &Options = DAG.getTarget().Options;
8423 bool AllowFusion =
8424 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8425 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8426 if (!AllowFusion)
8427 return false;
8428
8429 Opnd2 = Opnd1;
8430 Opnd1 = Opnd0.getOperand(1);
8431 Opnd0 = Opnd0.getOperand(0);
8432
8433 return true;
8434}
8435
8436 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8437 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8438 /// X86ISD::FMSUBADD node accordingly.
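/// For illustration (with arbitrary v4f32 inputs %A and %B), a build_vector
/// such as
///   (build_vector (fsub (extractelt %A, 0), (extractelt %B, 0)),
///                 (fadd (extractelt %A, 1), (extractelt %B, 1)),
///                 (fsub (extractelt %A, 2), (extractelt %B, 2)),
///                 (fadd (extractelt %A, 3), (extractelt %B, 3)))
/// alternates FSUB on even lanes with FADD on odd lanes and can be folded to
///   (X86ISD::ADDSUB %A, %B).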
8439 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8440                                        const SDLoc &DL,
8441 const X86Subtarget &Subtarget,
8442 SelectionDAG &DAG) {
8443 SDValue Opnd0, Opnd1;
8444 unsigned NumExtracts;
8445 bool IsSubAdd;
8446 bool HasAllowContract;
8447 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8448 HasAllowContract))
8449 return SDValue();
8450
8451 MVT VT = BV->getSimpleValueType(0);
8452
8453 // Try to generate X86ISD::FMADDSUB node here.
8454 SDValue Opnd2;
8455 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8456 HasAllowContract)) {
8457 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8458 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8459 }
8460
8461 // We only support ADDSUB.
8462 if (IsSubAdd)
8463 return SDValue();
8464
8465 // There are no known X86 targets with 512-bit ADDSUB instructions!
8466 // Convert to blend(fsub,fadd).
8467 if (VT.is512BitVector()) {
8468 SmallVector<int> Mask;
8469 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8470 Mask.push_back(I);
8471 Mask.push_back(I + E + 1);
8472 }
8473 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8474 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8475 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8476 }
8477
8478 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8479}
8480
8481 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8482                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8483 // Initialize outputs to known values.
8484 MVT VT = BV->getSimpleValueType(0);
8485 HOpcode = ISD::DELETED_NODE;
8486 V0 = DAG.getUNDEF(VT);
8487 V1 = DAG.getUNDEF(VT);
8488
8489 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8490 // half of the result is calculated independently from the 128-bit halves of
8491 // the inputs, so that makes the index-checking logic below more complicated.
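  // e.g. for (X86ISD::HADD v8i32:A, v8i32:B) the expected result is
  //   <a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7>
  // i.e. each 128-bit half of the output only mixes the matching 128-bit
  // halves of A and B.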
8492 unsigned NumElts = VT.getVectorNumElements();
8493 unsigned GenericOpcode = ISD::DELETED_NODE;
8494 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8495 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8496 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8497 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8498 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8499 // Ignore undef elements.
8500 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8501 if (Op.isUndef())
8502 continue;
8503
8504 // If there's an opcode mismatch, we're done.
8505 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8506 return false;
8507
8508 // Initialize horizontal opcode.
8509 if (HOpcode == ISD::DELETED_NODE) {
8510 GenericOpcode = Op.getOpcode();
8511 switch (GenericOpcode) {
8512 // clang-format off
8513 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8514 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8515 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8516 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8517 default: return false;
8518 // clang-format on
8519 }
8520 }
8521
8522 SDValue Op0 = Op.getOperand(0);
8523 SDValue Op1 = Op.getOperand(1);
8524       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8526           Op0.getOperand(0) != Op1.getOperand(0) ||
8527           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8528           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8529 return false;
8530
8531 // The source vector is chosen based on which 64-bit half of the
8532 // destination vector is being calculated.
8533 if (j < NumEltsIn64Bits) {
8534 if (V0.isUndef())
8535 V0 = Op0.getOperand(0);
8536 } else {
8537 if (V1.isUndef())
8538 V1 = Op0.getOperand(0);
8539 }
8540
8541 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8542 if (SourceVec != Op0.getOperand(0))
8543 return false;
8544
8545 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8546 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8547 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8548 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8549 (j % NumEltsIn64Bits) * 2;
8550 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8551 continue;
8552
8553 // If this is not a commutative op, this does not match.
8554 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8555 return false;
8556
8557 // Addition is commutative, so try swapping the extract indexes.
8558 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8559 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8560 continue;
8561
8562 // Extract indexes do not match horizontal requirement.
8563 return false;
8564 }
8565 }
8566 // We matched. Opcode and operands are returned by reference as arguments.
8567 return true;
8568}
8569
8570 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8571                                     const SDLoc &DL, SelectionDAG &DAG,
8572 unsigned HOpcode, SDValue V0, SDValue V1) {
8573 // If either input vector is not the same size as the build vector,
8574 // extract/insert the low bits to the correct size.
8575 // This is free (examples: zmm --> xmm, xmm --> ymm).
8576 MVT VT = BV->getSimpleValueType(0);
8577 unsigned Width = VT.getSizeInBits();
8578 if (V0.getValueSizeInBits() > Width)
8579 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8580 else if (V0.getValueSizeInBits() < Width)
8581 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8582
8583 if (V1.getValueSizeInBits() > Width)
8584 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8585 else if (V1.getValueSizeInBits() < Width)
8586 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8587
8588 unsigned NumElts = VT.getVectorNumElements();
8589 APInt DemandedElts = APInt::getAllOnes(NumElts);
8590 for (unsigned i = 0; i != NumElts; ++i)
8591 if (BV->getOperand(i).isUndef())
8592 DemandedElts.clearBit(i);
8593
8594   // If we don't need the upper xmm, then perform as an xmm hop.
8595 unsigned HalfNumElts = NumElts / 2;
8596 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8597 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8598 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8599 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8600 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8601 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8602 }
8603
8604 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8605}
8606
8607/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
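/// For illustration (with arbitrary v4i32 inputs %X and %Y),
///   (build_vector (add (extractelt %X, 0), (extractelt %X, 1)),
///                 (add (extractelt %X, 2), (extractelt %X, 3)),
///                 (add (extractelt %Y, 0), (extractelt %Y, 1)),
///                 (add (extractelt %Y, 2), (extractelt %Y, 3)))
/// can be lowered to (X86ISD::HADD %X, %Y) when SSSE3 is available.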
8608 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8609                                    const X86Subtarget &Subtarget,
8610 SelectionDAG &DAG) {
8611 // We need at least 2 non-undef elements to make this worthwhile by default.
8612 unsigned NumNonUndefs =
8613 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8614 if (NumNonUndefs < 2)
8615 return SDValue();
8616
8617 // There are 4 sets of horizontal math operations distinguished by type:
8618 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8619 // subtarget feature. Try to match those "native" patterns first.
8620 MVT VT = BV->getSimpleValueType(0);
8621 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8622 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8623 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8624 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8625 unsigned HOpcode;
8626 SDValue V0, V1;
8627 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8628 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8629 }
8630
8631 // Try harder to match 256-bit ops by using extract/concat.
8632 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8633 return SDValue();
8634
8635   // Count the number of UNDEF operands in the input build_vector.
8636 unsigned NumElts = VT.getVectorNumElements();
8637 unsigned Half = NumElts / 2;
8638 unsigned NumUndefsLO = 0;
8639 unsigned NumUndefsHI = 0;
8640 for (unsigned i = 0, e = Half; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsLO++;
8643
8644 for (unsigned i = Half, e = NumElts; i != e; ++i)
8645 if (BV->getOperand(i)->isUndef())
8646 NumUndefsHI++;
8647
8648 SDValue InVec0, InVec1;
8649 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8650 SDValue InVec2, InVec3;
8651 unsigned X86Opcode;
8652 bool CanFold = true;
8653
8654 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8655 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8656 InVec3) &&
8657 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8658 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8659 X86Opcode = X86ISD::HADD;
8660 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8661 InVec1) &&
8662 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8663 InVec3) &&
8664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8666 X86Opcode = X86ISD::HSUB;
8667 else
8668 CanFold = false;
8669
8670 if (CanFold) {
8671 // Do not try to expand this build_vector into a pair of horizontal
8672 // add/sub if we can emit a pair of scalar add/sub.
8673 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8674 return SDValue();
8675
8676 // Convert this build_vector into a pair of horizontal binops followed by
8677 // a concat vector. We must adjust the outputs from the partial horizontal
8678 // matching calls above to account for undefined vector halves.
8679 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8680 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8681 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8682 bool isUndefLO = NumUndefsLO == Half;
8683 bool isUndefHI = NumUndefsHI == Half;
8684 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8685 isUndefHI);
8686 }
8687 }
8688
8689 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8690 VT == MVT::v16i16) {
8691 unsigned X86Opcode;
8692 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HADD;
8695 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::HSUB;
8698 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::FHSUB;
8704 else
8705 return SDValue();
8706
8707 // Don't try to expand this build_vector into a pair of horizontal add/sub
8708 // if we can simply emit a pair of scalar add/sub.
8709 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8710 return SDValue();
8711
8712 // Convert this build_vector into two horizontal add/sub followed by
8713 // a concat vector.
8714 bool isUndefLO = NumUndefsLO == Half;
8715 bool isUndefHI = NumUndefsHI == Half;
8716 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8717 isUndefLO, isUndefHI);
8718 }
8719
8720 return SDValue();
8721}
8722
8723static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG);
8725
8726/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8727/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8728 /// just apply the bit operation to the vectors.
8729 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8730 /// from this, but enough scalar bit operations are created by the later
8731 /// legalization + scalarization stages to need basic support.
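/// For illustration (with arbitrary scalar inputs),
///   (build_vector (and %a, 1), (and %b, 2), (and %c, 4), (and %d, 8))
/// can be rebuilt as
///   (and (build_vector %a, %b, %c, %d), (build_vector 1, 2, 4, 8)).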
8732 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8733                                        const X86Subtarget &Subtarget,
8734 SelectionDAG &DAG) {
8735 MVT VT = Op->getSimpleValueType(0);
8736 unsigned NumElems = VT.getVectorNumElements();
8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8738
8739 // Check that all elements have the same opcode.
8740 // TODO: Should we allow UNDEFS and if so how many?
8741 unsigned Opcode = Op->getOperand(0).getOpcode();
8742 for (unsigned i = 1; i < NumElems; ++i)
8743 if (Opcode != Op->getOperand(i).getOpcode())
8744 return SDValue();
8745
8746 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8747 bool IsShift = false;
8748 switch (Opcode) {
8749 default:
8750 return SDValue();
8751 case ISD::SHL:
8752 case ISD::SRL:
8753 case ISD::SRA:
8754 IsShift = true;
8755 break;
8756 case ISD::AND:
8757 case ISD::XOR:
8758 case ISD::OR:
8759 // Don't do this if the buildvector is a splat - we'd replace one
8760 // constant with an entire vector.
8761 if (Op->getSplatValue())
8762 return SDValue();
8763 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8764 return SDValue();
8765 break;
8766 }
8767
8768 SmallVector<SDValue, 4> LHSElts, RHSElts;
8769 for (SDValue Elt : Op->ops()) {
8770 SDValue LHS = Elt.getOperand(0);
8771 SDValue RHS = Elt.getOperand(1);
8772
8773 // We expect the canonicalized RHS operand to be the constant.
8774     if (!isa<ConstantSDNode>(RHS))
8775       return SDValue();
8776
8777 // Extend shift amounts.
8778 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8779 if (!IsShift)
8780 return SDValue();
8781 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8782 }
8783
8784 LHSElts.push_back(LHS);
8785 RHSElts.push_back(RHS);
8786 }
8787
8788 // Limit to shifts by uniform immediates.
8789 // TODO: Only accept vXi8/vXi64 special cases?
8790 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8791 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8792 return SDValue();
8793
8794 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8795 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8796 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8797
8798 if (!IsShift)
8799 return Res;
8800
8801 // Immediately lower the shift to ensure the constant build vector doesn't
8802 // get converted to a constant pool before the shift is lowered.
8803 return LowerShift(Res, Subtarget, DAG);
8804}
8805
8806static bool isShuffleFoldableLoad(SDValue);
8807
8808/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8809/// representing a blend.
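/// For illustration (with two arbitrary f64 scalars %a and %b),
///   (v4f64 build_vector %a, %b, %a, %b)
/// can be lowered as
///   (vector_shuffle <0, 5, 2, 7> (splat %a), (splat %b))
/// when AVX2 is available or one of the operands is a foldable load.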
8810 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8811                                        X86Subtarget const &Subtarget,
8812 SelectionDAG &DAG) {
8813 MVT VT = BVOp->getSimpleValueType(0u);
8814
8815 if (VT != MVT::v4f64)
8816 return SDValue();
8817
8818 // Collect unique operands.
8819 auto UniqueOps = SmallSet<SDValue, 16u>();
8820 for (SDValue Op : BVOp->ops()) {
8821 if (isIntOrFPConstant(Op) || Op.isUndef())
8822 return SDValue();
8823 UniqueOps.insert(Op);
8824 }
8825
8826 // Candidate BUILD_VECTOR must have 2 unique operands.
8827 if (UniqueOps.size() != 2u)
8828 return SDValue();
8829
8830 SDValue Op0 = BVOp->getOperand(0u);
8831 UniqueOps.erase(Op0);
8832 SDValue Op1 = *UniqueOps.begin();
8833
8834 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8835 isShuffleFoldableLoad(Op1)) {
8836 // Create shuffle mask.
8837 auto const NumElems = VT.getVectorNumElements();
8838 SmallVector<int, 16u> Mask(NumElems);
8839 for (auto I = 0u; I < NumElems; ++I) {
8840 SDValue Op = BVOp->getOperand(I);
8841 Mask[I] = Op == Op0 ? I : I + NumElems;
8842 }
8843 // Create shuffle of splats.
8844 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8845 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8846 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8847 }
8848
8849 return SDValue();
8850}
8851
8852/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8853/// functionality to do this, so it's all zeros, all ones, or some derivation
8854/// that is cheap to calculate.
8855 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8856                                          SelectionDAG &DAG,
8857 const X86Subtarget &Subtarget) {
8858 MVT VT = Op.getSimpleValueType();
8859
8860 // Vectors containing all zeros can be matched by pxor and xorps.
8861 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8862 return Op;
8863
8864 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8865 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8866 // vpcmpeqd on 256-bit vectors.
8867 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8868 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8869 return Op;
8870
8871 return getOnesVector(VT, DAG, DL);
8872 }
8873
8874 return SDValue();
8875}
8876
8877/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8878/// from a vector of source values and a vector of extraction indices.
8879/// The vectors might be manipulated to match the type of the permute op.
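/// For example, a v16i8 source with v16i8 indices can be lowered directly to
/// (X86ISD::PSHUFB SrcVec, IndicesVec) when SSSE3 is available; other cases
/// below first scale or widen the index vector to fit the chosen permute op.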
8880static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8881 const SDLoc &DL, SelectionDAG &DAG,
8882 const X86Subtarget &Subtarget) {
8883 MVT ShuffleVT = VT;
8884 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8885 unsigned NumElts = VT.getVectorNumElements();
8886 unsigned SizeInBits = VT.getSizeInBits();
8887
8888 // Adjust IndicesVec to match VT size.
8889 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8890 "Illegal variable permute mask size");
8891 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8892 // Narrow/widen the indices vector to the correct size.
8893 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8894 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8895 NumElts * VT.getScalarSizeInBits());
8896 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8897 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8898 SDLoc(IndicesVec), SizeInBits);
8899 // Zero-extend the index elements within the vector.
8900 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8901 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8902 IndicesVT, IndicesVec);
8903 }
8904 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8905
8906   // Handle a SrcVec that doesn't match the VT type.
8907 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8908 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8909 // Handle larger SrcVec by treating it as a larger permute.
8910 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8911 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8912 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8913 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8914 Subtarget, DAG, SDLoc(IndicesVec));
8915 SDValue NewSrcVec =
8916 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8917 if (NewSrcVec)
8918 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8919 return SDValue();
8920 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8921 // Widen smaller SrcVec to match VT.
8922 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8923 } else
8924 return SDValue();
8925 }
8926
8927 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8928 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8929 EVT SrcVT = Idx.getValueType();
8930 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8931 uint64_t IndexScale = 0;
8932 uint64_t IndexOffset = 0;
8933
8934 // If we're scaling a smaller permute op, then we need to repeat the
8935 // indices, scaling and offsetting them as well.
8936 // e.g. v4i32 -> v16i8 (Scale = 4)
8937 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8938 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8939 for (uint64_t i = 0; i != Scale; ++i) {
8940 IndexScale |= Scale << (i * NumDstBits);
8941 IndexOffset |= i << (i * NumDstBits);
8942 }
8943
8944 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8945 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8946 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8947 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8948 return Idx;
8949 };
8950
8951 unsigned Opcode = 0;
8952 switch (VT.SimpleTy) {
8953 default:
8954 break;
8955 case MVT::v16i8:
8956 if (Subtarget.hasSSSE3())
8957 Opcode = X86ISD::PSHUFB;
8958 break;
8959 case MVT::v8i16:
8960 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8961 Opcode = X86ISD::VPERMV;
8962 else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v4f32:
8968 case MVT::v4i32:
8969 if (Subtarget.hasAVX()) {
8970 Opcode = X86ISD::VPERMILPV;
8971 ShuffleVT = MVT::v4f32;
8972 } else if (Subtarget.hasSSSE3()) {
8973 Opcode = X86ISD::PSHUFB;
8974 ShuffleVT = MVT::v16i8;
8975 }
8976 break;
8977 case MVT::v2f64:
8978 case MVT::v2i64:
8979 if (Subtarget.hasAVX()) {
8980 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8981 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8982 Opcode = X86ISD::VPERMILPV;
8983 ShuffleVT = MVT::v2f64;
8984 } else if (Subtarget.hasSSE41()) {
8985 // SSE41 can compare v2i64 - select between indices 0 and 1.
8986 return DAG.getSelectCC(
8987 DL, IndicesVec,
8988 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8990 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8991           ISD::CondCode::SETEQ);
8992     }
8993 break;
8994 case MVT::v32i8:
8995 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8996 Opcode = X86ISD::VPERMV;
8997 else if (Subtarget.hasXOP()) {
8998 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8999 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9000 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9001 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9002 return DAG.getNode(
9003           ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9004           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9005 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9006 } else if (Subtarget.hasAVX()) {
9007 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9008 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9009 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9010 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9011 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9012                               ArrayRef<SDValue> Ops) {
9013         // Permute Lo and Hi and then select based on index range.
9014         // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9015         // care about bit[7] as it's just an index vector.
9016 SDValue Idx = Ops[2];
9017 EVT VT = Idx.getValueType();
9018 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9020 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9021                                ISD::CondCode::SETGT);
9022       };
9023 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9024 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9025 PSHUFBBuilder);
9026 }
9027 break;
9028 case MVT::v16i16:
9029 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9030 Opcode = X86ISD::VPERMV;
9031 else if (Subtarget.hasAVX()) {
9032 // Scale to v32i8 and perform as v32i8.
9033 IndicesVec = ScaleIndices(IndicesVec, 2);
9034 return DAG.getBitcast(
9035           VT, createVariablePermute(
9036                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9037 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9038 }
9039 break;
9040 case MVT::v8f32:
9041 case MVT::v8i32:
9042 if (Subtarget.hasAVX2())
9043 Opcode = X86ISD::VPERMV;
9044 else if (Subtarget.hasAVX()) {
9045 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9046 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9047 {0, 1, 2, 3, 0, 1, 2, 3});
9048 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9049 {4, 5, 6, 7, 4, 5, 6, 7});
9050 if (Subtarget.hasXOP())
9051 return DAG.getBitcast(
9052 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9053 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9054 // Permute Lo and Hi and then select based on index range.
9055 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9056 SDValue Res = DAG.getSelectCC(
9057 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9059 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9060           ISD::CondCode::SETGT);
9061       return DAG.getBitcast(VT, Res);
9062 }
9063 break;
9064 case MVT::v4i64:
9065 case MVT::v4f64:
9066 if (Subtarget.hasAVX512()) {
9067 if (!Subtarget.hasVLX()) {
9068 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9069 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9070 SDLoc(SrcVec));
9071 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9072 DAG, SDLoc(IndicesVec));
9073 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9074 DAG, Subtarget);
9075 return extract256BitVector(Res, 0, DAG, DL);
9076 }
9077 Opcode = X86ISD::VPERMV;
9078 } else if (Subtarget.hasAVX()) {
9079 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9080 SDValue LoLo =
9081 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9082 SDValue HiHi =
9083 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9084 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9085 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9086 if (Subtarget.hasXOP())
9087 return DAG.getBitcast(
9088 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9089 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9090 // Permute Lo and Hi and then select based on index range.
9091 // This works as VPERMILPD only uses index bit[1] to permute elements.
9092 SDValue Res = DAG.getSelectCC(
9093 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9095 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9096           ISD::CondCode::SETGT);
9097       return DAG.getBitcast(VT, Res);
9098 }
9099 break;
9100 case MVT::v64i8:
9101 if (Subtarget.hasVBMI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v32i16:
9105 if (Subtarget.hasBWI())
9106 Opcode = X86ISD::VPERMV;
9107 break;
9108 case MVT::v16f32:
9109 case MVT::v16i32:
9110 case MVT::v8f64:
9111 case MVT::v8i64:
9112 if (Subtarget.hasAVX512())
9113 Opcode = X86ISD::VPERMV;
9114 break;
9115 }
9116 if (!Opcode)
9117 return SDValue();
9118
9119 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9120 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9121 "Illegal variable permute shuffle type");
9122
9123 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9124 if (Scale > 1)
9125 IndicesVec = ScaleIndices(IndicesVec, Scale);
9126
9127 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9128 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9129
9130 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9131 SDValue Res = Opcode == X86ISD::VPERMV
9132 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9133 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9134 return DAG.getBitcast(VT, Res);
9135}
9136
9137// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9138// reasoned to be a permutation of a vector by indices in a non-constant vector.
9139// (build_vector (extract_elt V, (extract_elt I, 0)),
9140// (extract_elt V, (extract_elt I, 1)),
9141// ...
9142// ->
9143// (vpermv I, V)
9144//
9145// TODO: Handle undefs
9146// TODO: Utilize pshufb and zero mask blending to support more efficient
9147// construction of vectors with constant-0 elements.
9148static SDValue
9149 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9150                                    SelectionDAG &DAG,
9151 const X86Subtarget &Subtarget) {
9152 SDValue SrcVec, IndicesVec;
9153
9154 auto PeekThroughFreeze = [](SDValue N) {
9155 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9156 return N->getOperand(0);
9157 return N;
9158 };
9159 // Check for a match of the permute source vector and permute index elements.
9160 // This is done by checking that the i-th build_vector operand is of the form:
9161 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9162 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9163 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9164 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9165 return SDValue();
9166
9167 // If this is the first extract encountered in V, set the source vector,
9168 // otherwise verify the extract is from the previously defined source
9169 // vector.
9170 if (!SrcVec)
9171 SrcVec = Op.getOperand(0);
9172 else if (SrcVec != Op.getOperand(0))
9173 return SDValue();
9174 SDValue ExtractedIndex = Op->getOperand(1);
9175 // Peek through extends.
9176 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9177 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9178 ExtractedIndex = ExtractedIndex.getOperand(0);
9179 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9180 return SDValue();
9181
9182 // If this is the first extract from the index vector candidate, set the
9183 // indices vector, otherwise verify the extract is from the previously
9184 // defined indices vector.
9185 if (!IndicesVec)
9186 IndicesVec = ExtractedIndex.getOperand(0);
9187 else if (IndicesVec != ExtractedIndex.getOperand(0))
9188 return SDValue();
9189
9190 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9191 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9192 return SDValue();
9193 }
9194
9195 MVT VT = V.getSimpleValueType();
9196 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9197}
9198
9199SDValue
9200X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9201 SDLoc dl(Op);
9202
9203 MVT VT = Op.getSimpleValueType();
9204 MVT EltVT = VT.getVectorElementType();
9205 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9206 unsigned NumElems = Op.getNumOperands();
9207
9208 // Generate vectors for predicate vectors.
9209 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9210 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9211
9212 if (VT.getVectorElementType() == MVT::bf16 &&
9213 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9214 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9215
9216 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9217 return VectorCst;
9218
9219 unsigned EVTBits = EltVT.getSizeInBits();
9220 APInt UndefMask = APInt::getZero(NumElems);
9221 APInt FrozenUndefMask = APInt::getZero(NumElems);
9222 APInt ZeroMask = APInt::getZero(NumElems);
9223 APInt NonZeroMask = APInt::getZero(NumElems);
9224 bool IsAllConstants = true;
9225 bool OneUseFrozenUndefs = true;
9226 SmallSet<SDValue, 8> Values;
9227 unsigned NumConstants = NumElems;
9228 for (unsigned i = 0; i < NumElems; ++i) {
9229 SDValue Elt = Op.getOperand(i);
9230 if (Elt.isUndef()) {
9231 UndefMask.setBit(i);
9232 continue;
9233 }
9234 if (ISD::isFreezeUndef(Elt.getNode())) {
9235 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9236 FrozenUndefMask.setBit(i);
9237 continue;
9238 }
9239 Values.insert(Elt);
9240 if (!isIntOrFPConstant(Elt)) {
9241 IsAllConstants = false;
9242 NumConstants--;
9243 }
9244 if (X86::isZeroNode(Elt)) {
9245 ZeroMask.setBit(i);
9246 } else {
9247 NonZeroMask.setBit(i);
9248 }
9249 }
9250
9251 // All undef vector. Return an UNDEF.
9252 if (UndefMask.isAllOnes())
9253 return DAG.getUNDEF(VT);
9254
9255 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9256 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9257 return DAG.getFreeze(DAG.getUNDEF(VT));
9258
9259 // All undef/freeze(undef)/zero vector. Return a zero vector.
9260 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9261 return getZeroVector(VT, Subtarget, DAG, dl);
9262
9263 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9264 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9265 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9266 // and blend the FREEZE-UNDEF operands back in.
9267 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9268 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9269 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9270 SmallVector<int, 16> BlendMask(NumElems, -1);
9271 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9272 for (unsigned i = 0; i < NumElems; ++i) {
9273 if (UndefMask[i]) {
9274 BlendMask[i] = -1;
9275 continue;
9276 }
9277 BlendMask[i] = i;
9278 if (!FrozenUndefMask[i])
9279 Elts[i] = Op.getOperand(i);
9280 else
9281 BlendMask[i] += NumElems;
9282 }
9283 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9284 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9285 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9286 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9287 }
9288
9289 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9290
9291 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9292 // be better off lowering to a smaller build vector and padding with
9293 // undef/zero.
9294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9295       !isFoldableUseOfShuffle(BV)) {
9296     unsigned UpperElems = NumElems / 2;
9297 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9299 if (NumUpperUndefsOrZeros >= UpperElems) {
9300 if (VT.is512BitVector() &&
9301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9302 UpperElems = NumElems - (NumElems / 4);
9303 // If freeze(undef) is in any upper elements, force to zero.
9304 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9305 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9306 SDValue NewBV =
9307 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9308 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9309 }
9310 }
9311
9312 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9313 return AddSub;
9314 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9315 return HorizontalOp;
9316 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9317 return Broadcast;
9318 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9319 return BitOp;
9320 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9321 return Blend;
9322
9323 unsigned NumZero = ZeroMask.popcount();
9324 unsigned NumNonZero = NonZeroMask.popcount();
9325
9326 // If we are inserting one variable into a vector of non-zero constants, try
9327 // to avoid loading each constant element as a scalar. Load the constants as a
9328 // vector and then insert the variable scalar element. If insertion is not
9329 // supported, fall back to a shuffle to get the scalar blended with the
9330 // constants. Insertion into a zero vector is handled as a special-case
9331 // somewhere below here.
9332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9333 FrozenUndefMask.isZero() &&
9334       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9335        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9336     // Create an all-constant vector. The variable element in the old
9337 // build vector is replaced by undef in the constant vector. Save the
9338 // variable scalar element and its index for use in the insertelement.
9339 LLVMContext &Context = *DAG.getContext();
9340 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9341 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9342 SDValue VarElt;
9343 SDValue InsIndex;
9344 for (unsigned i = 0; i != NumElems; ++i) {
9345 SDValue Elt = Op.getOperand(i);
9346 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9347 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9348 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9349 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9350 else if (!Elt.isUndef()) {
9351 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9352 "Expected one variable element in this vector");
9353 VarElt = Elt;
9354 InsIndex = DAG.getVectorIdxConstant(i, dl);
9355 }
9356 }
9357 Constant *CV = ConstantVector::get(ConstVecOps);
9358 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9359
9360 // The constants we just created may not be legal (eg, floating point). We
9361 // must lower the vector right here because we can not guarantee that we'll
9362 // legalize it before loading it. This is also why we could not just create
9363 // a new build vector here. If the build vector contains illegal constants,
9364 // it could get split back up into a series of insert elements.
9365 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9366 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9367 MachineFunction &MF = DAG.getMachineFunction();
9368 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9369 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9370 unsigned InsertC = InsIndex->getAsZExtVal();
9371 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9372 if (InsertC < NumEltsInLow128Bits)
9373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9374
9375 // There's no good way to insert into the high elements of a >128-bit
9376 // vector, so use shuffles to avoid an extract/insert sequence.
9377 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9378 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9379 SmallVector<int, 8> ShuffleMask;
9380 unsigned NumElts = VT.getVectorNumElements();
9381 for (unsigned i = 0; i != NumElts; ++i)
9382 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9383 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9384 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9385 }
9386
9387 // Special case for single non-zero, non-undef, element.
9388 if (NumNonZero == 1) {
9389 unsigned Idx = NonZeroMask.countr_zero();
9390 SDValue Item = Op.getOperand(Idx);
9391
9392 // If we have a constant or non-constant insertion into the low element of
9393 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9394 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9395 // depending on what the source datatype is.
9396 if (Idx == 0) {
9397 if (NumZero == 0)
9398 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9399
9400 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9401 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9402 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9403 assert((VT.is128BitVector() || VT.is256BitVector() ||
9404 VT.is512BitVector()) &&
9405 "Expected an SSE value type!");
9406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9407 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9408 // zero vector.
9409 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9410 }
9411
9412 // We can't directly insert an i8 or i16 into a vector, so zero extend
9413 // it to i32 first.
9414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9415 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9419 return DAG.getBitcast(VT, Item);
9420 }
9421 }
9422
9423 // Is it a vector logical left shift?
9424 if (NumElems == 2 && Idx == 1 &&
9425 X86::isZeroNode(Op.getOperand(0)) &&
9426 !X86::isZeroNode(Op.getOperand(1))) {
9427 unsigned NumBits = VT.getSizeInBits();
9428 return getVShift(true, VT,
9429                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9430                                    VT, Op.getOperand(1)),
9431 NumBits/2, DAG, *this, dl);
9432 }
9433
9434 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9435 return SDValue();
9436
9437 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9438 // is a non-constant being inserted into an element other than the low one,
9439 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9440 // movd/movss) to move this into the low element, then shuffle it into
9441 // place.
9442 if (EVTBits == 32) {
9443 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9444 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9445 }
9446 }
9447
9448 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9449 if (Values.size() == 1) {
9450 if (EVTBits == 32) {
9451 // Instead of a shuffle like this:
9452 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9453 // Check if it's possible to issue this instead.
9454       // shuffle (vload ptr), undef, <1, 1, 1, 1>
9455 unsigned Idx = NonZeroMask.countr_zero();
9456 SDValue Item = Op.getOperand(Idx);
9457 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9458 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9459 }
9460 return SDValue();
9461 }
9462
9463 // A vector full of immediates; various special cases are already
9464 // handled, so this is best done with a single constant-pool load.
9465 if (IsAllConstants)
9466 return SDValue();
9467
9468 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9469 return V;
9470
9471 // See if we can use a vector load to get all of the elements.
9472 {
9473 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9474 if (SDValue LD =
9475 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9476 return LD;
9477 }
9478
9479 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9480 // build_vector and broadcast it.
9481 // TODO: We could probably generalize this more.
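  // e.g. a v8i32 (build_vector a, b, a, b, a, b, a, b) can be built as a
  // v4i32 (a, b, undef, undef), bitcast to v2i64, broadcast to v4i64 and
  // bitcast back to v8i32.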
9482 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9483 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9484 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9485 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9486 // Make sure all the even/odd operands match.
9487 for (unsigned i = 2; i != NumElems; ++i)
9488 if (Ops[i % 2] != Op.getOperand(i))
9489 return false;
9490 return true;
9491 };
9492 if (CanSplat(Op, NumElems, Ops)) {
9493 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9494 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9495 // Create a new build vector and cast to v2i64/v2f64.
9496 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9497 DAG.getBuildVector(NarrowVT, dl, Ops));
9498 // Broadcast from v2i64/v2f64 and cast to final VT.
9499 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9500 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9501 NewBV));
9502 }
9503 }
9504
9505 // For AVX-length vectors, build the individual 128-bit pieces and use
9506 // shuffles to put them in place.
9507 if (VT.getSizeInBits() > 128) {
9508 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9509
9510 // Build both the lower and upper subvector.
9511 SDValue Lower =
9512 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9513     SDValue Upper = DAG.getBuildVector(
9514         HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9515
9516 // Recreate the wider vector with the lower and upper part.
9517 return concatSubVectors(Lower, Upper, DAG, dl);
9518 }
9519
9520 // Let legalizer expand 2-wide build_vectors.
9521 if (EVTBits == 64) {
9522 if (NumNonZero == 1) {
9523 // One half is zero or undef.
9524 unsigned Idx = NonZeroMask.countr_zero();
9525 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9526 Op.getOperand(Idx));
9527 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9528 }
9529 return SDValue();
9530 }
9531
9532 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9533 if (EVTBits == 8 && NumElems == 16)
9534 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9535 NumZero, DAG, Subtarget))
9536 return V;
9537
9538 if (EltVT == MVT::i16 && NumElems == 8)
9539 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9540 NumZero, DAG, Subtarget))
9541 return V;
9542
9543 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9544 if (EVTBits == 32 && NumElems == 4)
9545 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9546 return V;
9547
9548 // If element VT is == 32 bits, turn it into a number of shuffles.
9549 if (NumElems == 4 && NumZero > 0) {
9550 SmallVector<SDValue, 8> Ops(NumElems);
9551 for (unsigned i = 0; i < 4; ++i) {
9552 bool isZero = !NonZeroMask[i];
9553 if (isZero)
9554 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9555 else
9556 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9557 }
9558
9559 for (unsigned i = 0; i < 2; ++i) {
9560 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9561 default: llvm_unreachable("Unexpected NonZero count");
9562 case 0:
9563 Ops[i] = Ops[i*2]; // Must be a zero vector.
9564 break;
9565 case 1:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9567 break;
9568 case 2:
9569 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 case 3:
9572 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9573 break;
9574 }
9575 }
9576
9577 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9578 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9579 int MaskVec[] = {
9580 Reverse1 ? 1 : 0,
9581 Reverse1 ? 0 : 1,
9582 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9583 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9584 };
9585 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9586 }
9587
9588 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9589
9590 // Check for a build vector from mostly shuffle plus few inserting.
9591 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9592 return Sh;
9593
9594 // For SSE 4.1, use insertps to put the high elements into the low element.
9595 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9596     SDValue Result;
9597     if (!Op.getOperand(0).isUndef())
9598 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9599 else
9600 Result = DAG.getUNDEF(VT);
9601
9602 for (unsigned i = 1; i < NumElems; ++i) {
9603 if (Op.getOperand(i).isUndef()) continue;
9604 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9605 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9606 }
9607 return Result;
9608 }
9609
9610 // Otherwise, expand into a number of unpckl*, start by extending each of
9611 // our (non-undef) elements to the full vector width with the element in the
9612 // bottom slot of the vector (which generates no code for SSE).
9613 SmallVector<SDValue, 8> Ops(NumElems);
9614 for (unsigned i = 0; i < NumElems; ++i) {
9615 if (!Op.getOperand(i).isUndef())
9616 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9617 else
9618 Ops[i] = DAG.getUNDEF(VT);
9619 }
9620
9621 // Next, we iteratively mix elements, e.g. for v4f32:
9622 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9623 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9624 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9625 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9626 // Generate scaled UNPCKL shuffle mask.
9627 SmallVector<int, 16> Mask;
9628 for(unsigned i = 0; i != Scale; ++i)
9629 Mask.push_back(i);
9630 for (unsigned i = 0; i != Scale; ++i)
9631 Mask.push_back(NumElems+i);
9632 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9633
9634 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9635 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9636 }
9637 return Ops[0];
9638}
9639
9640// 256-bit AVX can use the vinsertf128 instruction
9641// to create 256-bit vectors from two other 128-bit ones.
9642// TODO: Detect subvector broadcast here instead of DAG combine?
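// e.g. a (v8f32 concat_vectors %lo, %hi) with two non-zero operands becomes
// insert_subvector nodes at element indices 0 and 4, which map onto
// vinsertf128.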
9643 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9644                                      SelectionDAG &DAG,
9645 const X86Subtarget &Subtarget) {
9646 MVT ResVT = Op.getSimpleValueType();
9647 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9648 "Value type must be 256-/512-bit wide");
9649
9650 unsigned NumOperands = Op.getNumOperands();
9651 unsigned NumFreezeUndef = 0;
9652 unsigned NumZero = 0;
9653 unsigned NumNonZero = 0;
9654 unsigned NonZeros = 0;
9655 SmallSet<SDValue, 4> Undefs;
9656 for (unsigned i = 0; i != NumOperands; ++i) {
9657 SDValue SubVec = Op.getOperand(i);
9658 if (SubVec.isUndef())
9659 continue;
9660 if (ISD::isFreezeUndef(SubVec.getNode())) {
9661 // If the freeze(undef) has multiple uses then we must fold to zero.
9662 if (SubVec.hasOneUse()) {
9663 ++NumFreezeUndef;
9664 } else {
9665 ++NumZero;
9666 Undefs.insert(SubVec);
9667 }
9668 }
9669 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9670 ++NumZero;
9671 else {
9672 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9673 NonZeros |= 1 << i;
9674 ++NumNonZero;
9675 }
9676 }
9677
9678 // If we have more than 2 non-zeros, build each half separately.
9679 if (NumNonZero > 2) {
9680 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9681 ArrayRef<SDUse> Ops = Op->ops();
9682 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9683 Ops.slice(0, NumOperands/2));
9684 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9685 Ops.slice(NumOperands/2));
9686 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9687 }
9688
9689 // Otherwise, build it up through insert_subvectors.
9690 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9691 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9692 : DAG.getUNDEF(ResVT));
9693
9694 // Replace Undef operands with ZeroVector.
9695 for (SDValue U : Undefs)
9696     DAG.ReplaceAllUsesOfValueWith(
9697         U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9698
9699 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9700 unsigned NumSubElems = SubVT.getVectorNumElements();
9701 for (unsigned i = 0; i != NumOperands; ++i) {
9702 if ((NonZeros & (1 << i)) == 0)
9703 continue;
9704
9705 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9706 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9707 }
9708
9709 return Vec;
9710}
9711
9712// Returns true if the given node is a type promotion (by concatenating i1
9713// zeros) of the result of a node that already zeros all upper bits of
9714// k-register.
9715// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9716 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9717                                        const X86Subtarget &Subtarget,
9718 SelectionDAG & DAG) {
9719 MVT ResVT = Op.getSimpleValueType();
9720 unsigned NumOperands = Op.getNumOperands();
9721 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9722 "Unexpected number of operands in CONCAT_VECTORS");
9723
9724 uint64_t Zeros = 0;
9725 uint64_t NonZeros = 0;
9726 for (unsigned i = 0; i != NumOperands; ++i) {
9727 SDValue SubVec = Op.getOperand(i);
9728 if (SubVec.isUndef())
9729 continue;
9730 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9731 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9732 Zeros |= (uint64_t)1 << i;
9733 else
9734 NonZeros |= (uint64_t)1 << i;
9735 }
9736
9737 unsigned NumElems = ResVT.getVectorNumElements();
9738
9739   // If we are inserting a non-zero vector and there are zeros in the LSBs and
9740   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9741   // insert_subvector will give us two kshifts.
9742 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9743 Log2_64(NonZeros) != NumOperands - 1) {
9744 unsigned Idx = Log2_64(NonZeros);
9745 SDValue SubVec = Op.getOperand(Idx);
9746 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9747 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9748 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9749 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9750 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9751 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9752 DAG.getVectorIdxConstant(0, dl));
9753 }
9754
9755 // If there are zero or one non-zeros we can handle this very simply.
9756 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9757 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9758 if (!NonZeros)
9759 return Vec;
9760 unsigned Idx = Log2_64(NonZeros);
9761 SDValue SubVec = Op.getOperand(Idx);
9762 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9763 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9764 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9765 }
9766
9767 if (NumOperands > 2) {
9768 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9769 ArrayRef<SDUse> Ops = Op->ops();
9770 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9771 Ops.slice(0, NumOperands / 2));
9772 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9773 Ops.slice(NumOperands / 2));
9774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9775 }
9776
9777 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9778
9779 if (ResVT.getVectorNumElements() >= 16)
9780 return Op; // The operation is legal with KUNPCK
9781
9782 SDValue Vec =
9783 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9784 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9785 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9786 DAG.getVectorIdxConstant(NumElems / 2, dl));
9787}
9788
9789 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9790                                    const X86Subtarget &Subtarget,
9791 SelectionDAG &DAG) {
9792 SDLoc DL(Op);
9793 MVT VT = Op.getSimpleValueType();
9794 if (VT.getVectorElementType() == MVT::i1)
9795 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9796
9797 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9798 // from two other 128-bit ones.
9799 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9800 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9801 (VT.is512BitVector() &&
9802 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9803 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9804}
9805
9806//===----------------------------------------------------------------------===//
9807// Vector shuffle lowering
9808//
9809// This is an experimental code path for lowering vector shuffles on x86. It is
9810// designed to handle arbitrary vector shuffles and blends, gracefully
9811// degrading performance as necessary. It works hard to recognize idiomatic
9812// shuffles and lower them to optimal instruction patterns without leaving
9813// a framework that allows reasonably efficient handling of all vector shuffle
9814// patterns.
9815//===----------------------------------------------------------------------===//
9816
9817/// Checks whether the vector elements referenced by two shuffle masks are
9818/// equivalent.
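/// e.g. if Op and ExpectedOp are both BUILD_VECTOR nodes, element Idx of Op
/// and element ExpectedIdx of ExpectedOp are equivalent when they are the
/// same scalar SDValue, even if Idx != ExpectedIdx.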
9819static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9820 int Idx, int ExpectedIdx) {
9821 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9822 ExpectedIdx < MaskSize && "Out of range element index");
9823 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9824 return false;
9825
9826 EVT VT = Op.getValueType();
9827 EVT ExpectedVT = ExpectedOp.getValueType();
9828
9829 // Sources must be vectors and match the mask's element count.
9830 if (!VT.isVector() || !ExpectedVT.isVector() ||
9831 (int)VT.getVectorNumElements() != MaskSize ||
9832 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9833 return false;
9834
9835 // Exact match.
9836 if (Idx == ExpectedIdx && Op == ExpectedOp)
9837 return true;
9838
9839 switch (Op.getOpcode()) {
9840 case ISD::BUILD_VECTOR:
9841 // If the values are build vectors, we can look through them to find
9842 // equivalent inputs that make the shuffles equivalent.
9843 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9844 case ISD::BITCAST: {
9845     SDValue Src = Op.getOperand(0);
9846     EVT SrcVT = Src.getValueType();
9847 if (Op == ExpectedOp && SrcVT.isVector()) {
9848 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9849 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9850 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9851 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9852 Idx / Scale, ExpectedIdx / Scale);
9853 }
9854 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9856 for (unsigned I = 0; I != Scale; ++I)
9857 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 (Idx * Scale) + I,
9859 (ExpectedIdx * Scale) + I))
9860 return false;
9861 return true;
9862 }
9863 }
9864 break;
9865 }
9866 case ISD::VECTOR_SHUFFLE: {
9867 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9868 return Op == ExpectedOp &&
9869 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9870 }
9871   case X86ISD::VBROADCAST:
9872   case X86ISD::VBROADCAST_LOAD:
9873     return Op == ExpectedOp;
9874   case X86ISD::SUBV_BROADCAST_LOAD:
9875     if (Op == ExpectedOp) {
9876 auto *MemOp = cast<MemSDNode>(Op);
9877 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9878 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9879 }
9880 break;
9881 case X86ISD::VPERMI: {
9882 if (Op == ExpectedOp) {
9883       SmallVector<int, 8> Mask;
9884       DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9885 SDValue Src = Op.getOperand(0);
9886 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9887 Mask[ExpectedIdx]);
9888 }
9889 break;
9890 }
9891 case X86ISD::HADD:
9892 case X86ISD::HSUB:
9893 case X86ISD::FHADD:
9894 case X86ISD::FHSUB:
9895 case X86ISD::PACKSS:
9896 case X86ISD::PACKUS:
9897 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9898 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9899 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9900 int NumElts = VT.getVectorNumElements();
9901 int NumLanes = VT.getSizeInBits() / 128;
9902 int NumEltsPerLane = NumElts / NumLanes;
9903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9904 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9905 bool SameElt =
9906 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9907 return SameLane && SameElt;
9908 }
9909 break;
9910 }
9911
9912 return false;
9913}
9914
9915/// Tiny helper function to identify a no-op mask.
9916///
9917/// This is a somewhat boring predicate function. It checks whether the mask
9918/// array input, which is assumed to be a single-input shuffle mask of the kind
9919/// used by the X86 shuffle instructions (not a fully general
9920/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9921 /// in-place shuffle are 'no-op's.
9922 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9923 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9924 assert(Mask[i] >= -1 && "Out of bound mask element!");
9925 if (Mask[i] >= 0 && Mask[i] != i)
9926 return false;
9927 }
9928 return true;
9929}
9930
9931/// Test whether there are elements crossing LaneSizeInBits lanes in this
9932/// shuffle mask.
9933///
9934/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9935/// and we routinely test for these.
9936static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9937 unsigned ScalarSizeInBits,
9938 ArrayRef<int> Mask) {
9939 assert(LaneSizeInBits && ScalarSizeInBits &&
9940 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9941 "Illegal shuffle lane size");
9942 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9943 int Size = Mask.size();
9944 for (int i = 0; i < Size; ++i)
9945 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9946 return true;
9947 return false;
9948}
9949
9950/// Test whether there are elements crossing 128-bit lanes in this
9951 /// shuffle mask.
9952 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9953 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9954}
9955
9956/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9957/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9958/// better support 'repeated mask + lane permute' style shuffles.
9959static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9960 unsigned ScalarSizeInBits,
9961 ArrayRef<int> Mask) {
9962 assert(LaneSizeInBits && ScalarSizeInBits &&
9963 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9964 "Illegal shuffle lane size");
9965 int NumElts = Mask.size();
9966 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9967 int NumLanes = NumElts / NumEltsPerLane;
9968 if (NumLanes > 1) {
9969 for (int i = 0; i != NumLanes; ++i) {
9970 int SrcLane = -1;
9971 for (int j = 0; j != NumEltsPerLane; ++j) {
9972 int M = Mask[(i * NumEltsPerLane) + j];
9973 if (M < 0)
9974 continue;
9975 int Lane = (M % NumElts) / NumEltsPerLane;
9976 if (SrcLane >= 0 && SrcLane != Lane)
9977 return true;
9978 SrcLane = Lane;
9979 }
9980 }
9981 }
9982 return false;
9983}
9984
9985/// Test whether a shuffle mask is equivalent within each sub-lane.
9986///
9987/// This checks a shuffle mask to see if it is performing the same
9988/// lane-relative shuffle in each sub-lane. This trivially implies
9989/// that it is also not lane-crossing. It may however involve a blend from the
9990/// same lane of a second vector.
9991///
9992/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9993/// non-trivial to compute in the face of undef lanes. The representation is
9994/// suitable for use with existing 128-bit shuffles as entries from the second
9995/// vector have been remapped to [LaneSize, 2*LaneSize).
9996static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9997 ArrayRef<int> Mask,
9998 SmallVectorImpl<int> &RepeatedMask) {
9999 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10000 RepeatedMask.assign(LaneSize, -1);
10001 int Size = Mask.size();
10002 for (int i = 0; i < Size; ++i) {
10003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10004 if (Mask[i] < 0)
10005 continue;
10006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10007 // This entry crosses lanes, so there is no way to model this shuffle.
10008 return false;
10009
10010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10011 // Adjust second vector indices to start at LaneSize instead of Size.
10012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10013 : Mask[i] % LaneSize + LaneSize;
10014 if (RepeatedMask[i % LaneSize] < 0)
10015 // This is the first non-undef entry in this slot of a 128-bit lane.
10016 RepeatedMask[i % LaneSize] = LocalM;
10017 else if (RepeatedMask[i % LaneSize] != LocalM)
10018 // Found a mismatch with the repeated mask.
10019 return false;
10020 }
10021 return true;
10022}
10023
10024/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10025 static bool
10026 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10027 SmallVectorImpl<int> &RepeatedMask) {
10028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10029}
10030
10031 static bool
10032 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10033 SmallVector<int, 32> RepeatedMask;
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10038 static bool
10039 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10040 SmallVectorImpl<int> &RepeatedMask) {
10041 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10042}
10043
10044/// Test whether a target shuffle mask is equivalent within each sub-lane.
10045/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10046static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10047 unsigned EltSizeInBits,
10048 ArrayRef<int> Mask,
10049 SmallVectorImpl<int> &RepeatedMask) {
10050 int LaneSize = LaneSizeInBits / EltSizeInBits;
10051 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10052 int Size = Mask.size();
10053 for (int i = 0; i < Size; ++i) {
10054 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10055 if (Mask[i] == SM_SentinelUndef)
10056 continue;
10057 if (Mask[i] == SM_SentinelZero) {
10058 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10059 return false;
10060 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10061 continue;
10062 }
10063 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10064 // This entry crosses lanes, so there is no way to model this shuffle.
10065 return false;
10066
10067 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10068 // later vector indices to start at multiples of LaneSize instead of Size.
10069 int LaneM = Mask[i] / Size;
10070 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10071 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10072 // This is the first non-undef entry in this slot of a 128-bit lane.
10073 RepeatedMask[i % LaneSize] = LocalM;
10074 else if (RepeatedMask[i % LaneSize] != LocalM)
10075 // Found a mismatch with the repeated mask.
10076 return false;
10077 }
10078 return true;
10079}
10080
10081/// Test whether a target shuffle mask is equivalent within each sub-lane.
10082/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10083static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10084 ArrayRef<int> Mask,
10085 SmallVectorImpl<int> &RepeatedMask) {
10086 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10087 Mask, RepeatedMask);
10088}
10089
10090/// Checks whether a shuffle mask is equivalent to an explicit list of
10091/// arguments.
10092///
10093/// This is a fast way to test a shuffle mask against a fixed pattern:
10094///
10095 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10096///
10097/// It returns true if the mask is exactly as wide as the argument list, and
10098/// each element of the mask is either -1 (signifying undef) or the value given
10099/// in the argument.
10100static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10101 SDValue V1 = SDValue(),
10102 SDValue V2 = SDValue()) {
10103 int Size = Mask.size();
10104 if (Size != (int)ExpectedMask.size())
10105 return false;
10106
10107 for (int i = 0; i < Size; ++i) {
10108 assert(Mask[i] >= -1 && "Out of bound mask element!");
10109 int MaskIdx = Mask[i];
10110 int ExpectedIdx = ExpectedMask[i];
10111 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10112 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10113 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10114 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10115 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10116 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10117 return false;
10118 }
10119 }
10120 return true;
10121}
10122
10123/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10124///
10125/// The masks must be exactly the same width.
10126///
10127/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10128/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10129///
10130/// SM_SentinelZero is accepted as a valid negative index but must match in
10131 /// both, or via a known bits test.
10132 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10133 ArrayRef<int> ExpectedMask,
10134 const SelectionDAG &DAG,
10135 SDValue V1 = SDValue(),
10136 SDValue V2 = SDValue()) {
10137 int Size = Mask.size();
10138 if (Size != (int)ExpectedMask.size())
10139 return false;
10140 assert(llvm::all_of(ExpectedMask,
10141 [Size](int M) {
10142 return M == SM_SentinelZero ||
10143 isInRange(M, 0, 2 * Size);
10144 }) &&
10145 "Illegal target shuffle mask");
10146
10147 // Check for out-of-range target shuffle mask indices.
10148 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10149 return false;
10150
10151 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10152 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V1.getValueType().isVector()))
10154 V1 = SDValue();
10155 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10156 !V2.getValueType().isVector()))
10157 V2 = SDValue();
10158
10159 APInt ZeroV1 = APInt::getZero(Size);
10160 APInt ZeroV2 = APInt::getZero(Size);
10161
10162 for (int i = 0; i < Size; ++i) {
10163 int MaskIdx = Mask[i];
10164 int ExpectedIdx = ExpectedMask[i];
10165 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10166 continue;
10167 // If we failed to match an expected SM_SentinelZero then early out.
10168 if (ExpectedIdx < 0)
10169 return false;
10170 if (MaskIdx == SM_SentinelZero) {
10171 // If we need this expected index to be a zero element, then update the
10172 // relevant zero mask and perform the known bits at the end to minimize
10173 // repeated computes.
10174 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10175 if (ExpectedV &&
10176 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10177 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10179 ZeroMask.setBit(BitIdx);
10180 continue;
10181 }
10182 }
10183 if (MaskIdx >= 0) {
10184 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10185 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10186 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10187 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10188 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10189 continue;
10190 }
10191 return false;
10192 }
10193 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10194 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10195}
10196
10197// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10198 // instructions.
10199 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10200 const SelectionDAG &DAG) {
10201 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10202 return false;
10203
10204 SmallVector<int, 8> Unpcklwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10206 /* Unary = */ false);
10207 SmallVector<int, 8> Unpckhwd;
10208 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10209 /* Unary = */ false);
10210 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10211 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10212 return IsUnpackwdMask;
10213}
10214
10216 const SelectionDAG &DAG) {
10217 // Create 128-bit vector type based on mask size.
10218 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10219 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10220
10221 // We can't assume a canonical shuffle mask, so try the commuted version too.
10222 SmallVector<int, 4> CommutedMask(Mask);
10223 ShuffleVectorSDNode::commuteMask(CommutedMask);
10224
10225 // Match any of unary/binary or low/high.
10226 for (unsigned i = 0; i != 4; ++i) {
10227 SmallVector<int, 16> UnpackMask;
10228 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10229 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10230 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10231 return true;
10232 }
10233 return false;
10234}
10235
10236/// Return true if a shuffle mask chooses elements identically in its top and
10237/// bottom halves. For example, any splat mask has the same top and bottom
10238/// halves. If an element is undefined in only one half of the mask, the halves
10239 /// are not considered identical.
10240 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10241 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10242 unsigned HalfSize = Mask.size() / 2;
10243 for (unsigned i = 0; i != HalfSize; ++i) {
10244 if (Mask[i] != Mask[i + HalfSize])
10245 return false;
10246 }
10247 return true;
10248}
10249
10250/// Get a 4-lane 8-bit shuffle immediate for a mask.
10251///
10252/// This helper function produces an 8-bit shuffle immediate corresponding to
10253/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10254/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10255/// example.
10256///
10257/// NB: We rely heavily on "undef" masks preserving the input lane.
10258static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10259 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10260 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10261 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10262 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10263 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10264
10265 // If the mask only uses one non-undef element, then fully 'splat' it to
10266 // improve later broadcast matching.
10267 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10268 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10269
10270 int FirstElt = Mask[FirstIndex];
10271 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10272 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10273
10274 unsigned Imm = 0;
10275 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10276 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10277 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10278 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10279 return Imm;
10280}
10281
10282 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10283 SelectionDAG &DAG) {
10284 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10285}
10286
10287// Canonicalize SHUFPD mask to improve chances of further folding.
10288// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10289static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10290 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10291 "Unexpected SHUFPD mask size");
10292 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10293 "Unexpected SHUFPD mask elements");
10294
10295 // If the mask only uses one non-undef element, then fully 'splat' it to
10296 // improve later broadcast matching.
10297 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10298 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10299 "All undef shuffle mask");
10300
10301 int FirstElt = Mask[FirstIndex];
10302 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10303 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10304 unsigned Imm = 0;
10305 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10306 Imm |= FirstElt << I;
10307 return Imm;
10308 }
10309
10310 // Attempt to keep any undef elements in place to improve chances of the
10311 // shuffle becoming a (commutative) blend.
10312 unsigned Imm = 0;
10313 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10314 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10315
10316 return Imm;
10317}
10318
10319 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10320 SelectionDAG &DAG) {
10321 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10322}
10323
10324 // The shuffle result takes the following form:
10325 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
10326 // ascending order. Each element of Zeroable corresponds to a particular
10327 // element of Mask, as described in the computeZeroableShuffleElements
10328 // function.
10329 //
10330 // The function looks for a sub-mask whose nonzero elements are in
10330 // increasing order. If such a sub-mask exists, the function returns true.
10331static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10332 ArrayRef<int> Mask, const EVT &VectorType,
10333 bool &IsZeroSideLeft) {
10334 int NextElement = -1;
10335 // Check if the Mask's nonzero elements are in increasing order.
10336 for (int i = 0, e = Mask.size(); i < e; i++) {
10337 // Check that the mask's zero elements are built from only zeros.
10338 assert(Mask[i] >= -1 && "Out of bound mask element!");
10339 if (Mask[i] < 0)
10340 return false;
10341 if (Zeroable[i])
10342 continue;
10343 // Find the lowest nonzero element.
10344 if (NextElement < 0) {
10345 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10346 IsZeroSideLeft = NextElement != 0;
10347 }
10348 // Exit if the mask's nonzero elements are not in increasing order.
10349 if (NextElement != Mask[i])
10350 return false;
10351 NextElement++;
10352 }
10353 return true;
10354}
10355
10356 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10357 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10358 const X86Subtarget &Subtarget,
10359 unsigned Depth = 0);
10360
10361 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10362 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10363 ArrayRef<int> Mask, SDValue V1,
10364 SDValue V2, const APInt &Zeroable,
10365 const X86Subtarget &Subtarget,
10366 SelectionDAG &DAG) {
10367 int Size = Mask.size();
10368 int LaneSize = 128 / VT.getScalarSizeInBits();
10369 const int NumBytes = VT.getSizeInBits() / 8;
10370 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10371
10372 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10373 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10374 (Subtarget.hasBWI() && VT.is512BitVector()));
10375
10376 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10377 // Sign bit set in i8 mask means zero element.
10378 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10379
10380 SDValue V;
10381 for (int i = 0; i < NumBytes; ++i) {
10382 int M = Mask[i / NumEltBytes];
10383 if (M < 0) {
10384 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10385 continue;
10386 }
10387 if (Zeroable[i / NumEltBytes]) {
10388 PSHUFBMask[i] = ZeroMask;
10389 continue;
10390 }
10391
10392 // We can only use a single input of V1 or V2.
10393 SDValue SrcV = (M >= Size ? V2 : V1);
10394 if (V && V != SrcV)
10395 return SDValue();
10396 V = SrcV;
10397 M %= Size;
10398
10399 // PSHUFB can't cross lanes, ensure this doesn't happen.
10400 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10401 return SDValue();
10402
10403 M = M % LaneSize;
10404 M = M * NumEltBytes + (i % NumEltBytes);
10405 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10406 }
10407 assert(V && "Failed to find a source input");
10408
10409 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10410 return DAG.getBitcast(
10411 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10412 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10413}
10414
10415static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10416 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10417 const SDLoc &dl);
10418
10419 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
10420 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10421 SDValue V2, ArrayRef<int> Mask,
10422 const APInt &Zeroable,
10423 const X86Subtarget &Subtarget,
10424 SelectionDAG &DAG) {
10425 bool IsLeftZeroSide = true;
10426 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10427 IsLeftZeroSide))
10428 return SDValue();
10429 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10430 MVT IntegerType =
10431 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10432 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10433 unsigned NumElts = VT.getVectorNumElements();
10434 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10435 "Unexpected number of vector elements");
10436 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10437 Subtarget, DAG, DL);
10438 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10439 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10440 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10441}
10442
10443static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10444 unsigned &UnpackOpcode, bool IsUnary,
10445 ArrayRef<int> TargetMask, const SDLoc &DL,
10446 SelectionDAG &DAG,
10447 const X86Subtarget &Subtarget) {
10448 int NumElts = VT.getVectorNumElements();
10449
10450 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10451 for (int i = 0; i != NumElts; i += 2) {
10452 int M1 = TargetMask[i + 0];
10453 int M2 = TargetMask[i + 1];
10454 Undef1 &= (SM_SentinelUndef == M1);
10455 Undef2 &= (SM_SentinelUndef == M2);
10456 Zero1 &= isUndefOrZero(M1);
10457 Zero2 &= isUndefOrZero(M2);
10458 }
10459 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10460 "Zeroable shuffle detected");
10461
10462 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10463 SmallVector<int, 64> Unpckl, Unpckh;
10464 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10465 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10466 (IsUnary ? V1 : V2))) {
10467 UnpackOpcode = X86ISD::UNPCKL;
10468 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10469 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10470 return true;
10471 }
10472
10473 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10474 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10475 (IsUnary ? V1 : V2))) {
10476 UnpackOpcode = X86ISD::UNPCKH;
10477 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10478 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10479 return true;
10480 }
10481
10482 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10483 if (IsUnary && (Zero1 || Zero2)) {
10484 // Don't bother if we can blend instead.
10485 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10486 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10487 return false;
10488
10489 bool MatchLo = true, MatchHi = true;
10490 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10491 int M = TargetMask[i];
10492
10493 // Ignore if the input is known to be zero or the index is undef.
10494 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10495 (M == SM_SentinelUndef))
10496 continue;
10497
10498 MatchLo &= (M == Unpckl[i]);
10499 MatchHi &= (M == Unpckh[i]);
10500 }
10501
10502 if (MatchLo || MatchHi) {
10503 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10504 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10506 return true;
10507 }
10508 }
10509
10510 // If a binary shuffle, commute and try again.
10511 if (!IsUnary) {
10512 ShuffleVectorSDNode::commuteMask(Unpckl);
10513 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10514 UnpackOpcode = X86ISD::UNPCKL;
10515 std::swap(V1, V2);
10516 return true;
10517 }
10518
10519 ShuffleVectorSDNode::commuteMask(Unpckh);
10520 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10521 UnpackOpcode = X86ISD::UNPCKH;
10522 std::swap(V1, V2);
10523 return true;
10524 }
10525 }
10526
10527 return false;
10528}
10529
10530// X86 has dedicated unpack instructions that can handle specific blend
10531 // operations: UNPCKH and UNPCKL.
10532 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 SmallVector<int, 8> Unpckl;
10536 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10539
10540 SmallVector<int, 8> Unpckh;
10541 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10542 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10544
10545 // Commute and try again.
10546 ShuffleVectorSDNode::commuteMask(Unpckl);
10547 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10549
10550 ShuffleVectorSDNode::commuteMask(Unpckh);
10551 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10553
10554 return SDValue();
10555}
10556
10557/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10558 /// followed by unpack 256-bit.
10559 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10560 SDValue V2, ArrayRef<int> Mask,
10561 SelectionDAG &DAG) {
10562 SmallVector<int, 32> Unpckl, Unpckh;
10563 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10564 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10565
10566 unsigned UnpackOpcode;
10567 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10568 UnpackOpcode = X86ISD::UNPCKL;
10569 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10570 UnpackOpcode = X86ISD::UNPCKH;
10571 else
10572 return SDValue();
10573
10574 // This is a "natural" unpack operation (rather than the 128-bit sectored
10575 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10576 // input in order to use the x86 instruction.
10577 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10578 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10579 V1 = DAG.getBitcast(VT, V1);
10580 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10581}
10582
10583// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10584// source into the lower elements and zeroing the upper elements.
10585static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10586 ArrayRef<int> Mask, const APInt &Zeroable,
10587 const X86Subtarget &Subtarget) {
10588 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10589 return false;
10590
10591 unsigned NumElts = Mask.size();
10592 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10593 unsigned MaxScale = 64 / EltSizeInBits;
10594
10595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10596 unsigned SrcEltBits = EltSizeInBits * Scale;
10597 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10598 continue;
10599 unsigned NumSrcElts = NumElts / Scale;
10600 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10601 continue;
10602 unsigned UpperElts = NumElts - NumSrcElts;
10603 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10604 continue;
10605 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10606 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10607 DstVT = MVT::getIntegerVT(EltSizeInBits);
10608 if ((NumSrcElts * EltSizeInBits) >= 128) {
10609 // ISD::TRUNCATE
10610 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10611 } else {
10612 // X86ISD::VTRUNC
10613 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10614 }
10615 return true;
10616 }
10617
10618 return false;
10619}
10620
10621// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10622// element padding to the final DstVT.
10623static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10624 const X86Subtarget &Subtarget,
10625 SelectionDAG &DAG, bool ZeroUppers) {
10626 MVT SrcVT = Src.getSimpleValueType();
10627 MVT DstSVT = DstVT.getScalarType();
10628 unsigned NumDstElts = DstVT.getVectorNumElements();
10629 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10630 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10631
10632 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10633 return SDValue();
10634
10635 // Perform a direct ISD::TRUNCATE if possible.
10636 if (NumSrcElts == NumDstElts)
10637 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10638
10639 if (NumSrcElts > NumDstElts) {
10640 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10641 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10642 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10643 }
10644
10645 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10649 DstVT.getSizeInBits());
10650 }
10651
10652 // Non-VLX targets must truncate from a 512-bit type, so we need to
10653 // widen, truncate and then possibly extract the original subvector.
10654 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10655 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10656 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10657 }
10658
10659 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10660 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10661 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10662 if (DstVT != TruncVT)
10663 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10664 DstVT.getSizeInBits());
10665 return Trunc;
10666}
10667
10668// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10669//
10670// An example is the following:
10671//
10672// t0: ch = EntryToken
10673// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10674// t25: v4i32 = truncate t2
10675// t41: v8i16 = bitcast t25
10676// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10677// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10678// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10679// t18: v2i64 = bitcast t51
10680//
10681 // One can just use a single vpmovdw instruction; without avx512vl we need to
10682 // use the zmm variant and extract the lower subvector, padding with zeroes.
10683 // TODO: Merge with lowerShuffleAsVTRUNC.
10684 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10685 SDValue V2, ArrayRef<int> Mask,
10686 const APInt &Zeroable,
10687 const X86Subtarget &Subtarget,
10688 SelectionDAG &DAG) {
10689 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10690 if (!Subtarget.hasAVX512())
10691 return SDValue();
10692
10693 unsigned NumElts = VT.getVectorNumElements();
10694 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10695 unsigned MaxScale = 64 / EltSizeInBits;
10696 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10697 unsigned SrcEltBits = EltSizeInBits * Scale;
10698 unsigned NumSrcElts = NumElts / Scale;
10699 unsigned UpperElts = NumElts - NumSrcElts;
10700 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10701 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10702 continue;
10703
10704 // Attempt to find a matching source truncation, but as a fall back VLX
10705 // cases can use the VPMOV directly.
10706 SDValue Src = peekThroughBitcasts(V1);
10707 if (Src.getOpcode() == ISD::TRUNCATE &&
10708 Src.getScalarValueSizeInBits() == SrcEltBits) {
10709 Src = Src.getOperand(0);
10710 } else if (Subtarget.hasVLX()) {
10711 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10713 Src = DAG.getBitcast(SrcVT, Src);
10714 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10715 if (Scale == 2 &&
10716 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10717 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10718 return SDValue();
10719 } else
10720 return SDValue();
10721
10722 // VPMOVWB is only available with avx512bw.
10723 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10724 return SDValue();
10725
10726 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10727 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10728 }
10729
10730 return SDValue();
10731}
10732
10733 // Attempt to match binary shuffle patterns as a truncate.
10734 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10735 SDValue V2, ArrayRef<int> Mask,
10736 const APInt &Zeroable,
10737 const X86Subtarget &Subtarget,
10738 SelectionDAG &DAG) {
10739 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10740 "Unexpected VTRUNC type");
10741 if (!Subtarget.hasAVX512() ||
10742 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10743 return SDValue();
10744
10745 unsigned NumElts = VT.getVectorNumElements();
10746 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10747 unsigned MaxScale = 64 / EltSizeInBits;
10748 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10749 // TODO: Support non-BWI VPMOVWB truncations?
10750 unsigned SrcEltBits = EltSizeInBits * Scale;
10751 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10752 continue;
10753
10754 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10755 // Bail if the V2 elements are undef.
10756 unsigned NumHalfSrcElts = NumElts / Scale;
10757 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10758 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10759 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10760 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10761 continue;
10762
10763 // The elements beyond the truncation must be undef/zero.
10764 unsigned UpperElts = NumElts - NumSrcElts;
10765 if (UpperElts > 0 &&
10766 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10767 continue;
10768 bool UndefUppers =
10769 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10770
10771 // As we're using both sources then we need to concat them together
10772 // and truncate from the double-sized src.
10773 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10774
10775 // For offset truncations, ensure that the concat is cheap.
10776 SDValue Src =
10777 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10778 if (!Src) {
10779 if (Offset)
10780 continue;
10781 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10782 }
10783
10784 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10785 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10786 Src = DAG.getBitcast(SrcVT, Src);
10787
10788 // Shift the offset'd elements into place for the truncation.
10789 // TODO: Use getTargetVShiftByConstNode.
10790 if (Offset)
10791 Src = DAG.getNode(
10792 X86ISD::VSRLI, DL, SrcVT, Src,
10793 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10794
10795 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10796 }
10797 }
10798
10799 return SDValue();
10800}
10801
10802/// Check whether a compaction lowering can be done by dropping even/odd
10803/// elements and compute how many times even/odd elements must be dropped.
10804///
10805/// This handles shuffles which take every Nth element where N is a power of
10806/// two. Example shuffle masks:
10807///
10808/// (even)
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10810/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10811/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10812/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10813/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10814/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10815///
10816/// (odd)
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10818/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10819///
10820/// Any of these lanes can of course be undef.
10821///
10822/// This routine only supports N <= 3.
10823/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10824/// for larger N.
10825///
10826/// \returns N above, or the number of times even/odd elements must be dropped
10827/// if there is such a number. Otherwise returns zero.
10828static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10829 bool IsSingleInput) {
10830 // The modulus for the shuffle vector entries is based on whether this is
10831 // a single input or not.
10832 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10833 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10834 "We should only be called with masks with a power-of-2 size!");
10835
10836 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10837 int Offset = MatchEven ? 0 : 1;
10838
10839 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10840 // and 2^3 simultaneously. This is because we may have ambiguity with
10841 // partially undef inputs.
10842 bool ViableForN[3] = {true, true, true};
10843
10844 for (int i = 0, e = Mask.size(); i < e; ++i) {
10845 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10846 // want.
10847 if (Mask[i] < 0)
10848 continue;
10849
10850 bool IsAnyViable = false;
10851 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10852 if (ViableForN[j]) {
10853 uint64_t N = j + 1;
10854
10855 // The shuffle mask must be equal to (i * 2^N) % M.
10856 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10857 IsAnyViable = true;
10858 else
10859 ViableForN[j] = false;
10860 }
10861 // Early exit if we exhaust the possible powers of two.
10862 if (!IsAnyViable)
10863 break;
10864 }
10865
10866 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10867 if (ViableForN[j])
10868 return j + 1;
10869
10870 // Return 0 as there is no viable power of two.
10871 return 0;
10872}
10873
10874// X86 has dedicated pack instructions that can handle specific truncation
10875// operations: PACKSS and PACKUS.
10876// Checks for compaction shuffle masks if MaxStages > 1.
10877// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10878static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10879 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10880 const SelectionDAG &DAG,
10881 const X86Subtarget &Subtarget,
10882 unsigned MaxStages = 1) {
10883 unsigned NumElts = VT.getVectorNumElements();
10884 unsigned BitSize = VT.getScalarSizeInBits();
10885 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10886 "Illegal maximum compaction");
10887
10888 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10889 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10890 unsigned NumPackedBits = NumSrcBits - BitSize;
10891 N1 = peekThroughBitcasts(N1);
10892 N2 = peekThroughBitcasts(N2);
10893 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10894 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10895 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10896 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10897 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10898 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10899 return false;
10900 if (Subtarget.hasSSE41() || BitSize == 8) {
10901 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10902 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10903 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10904 V1 = N1;
10905 V2 = N2;
10906 SrcVT = PackVT;
10907 PackOpcode = X86ISD::PACKUS;
10908 return true;
10909 }
10910 }
10911 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10912 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10913 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10914 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10915 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10916 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10917 V1 = N1;
10918 V2 = N2;
10919 SrcVT = PackVT;
10920 PackOpcode = X86ISD::PACKSS;
10921 return true;
10922 }
10923 return false;
10924 };
10925
10926 // Attempt to match against wider and wider compaction patterns.
10927 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10928 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10929 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10930
10931 // Try binary shuffle.
10932 SmallVector<int, 32> BinaryMask;
10933 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10934 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10935 if (MatchPACK(V1, V2, PackVT))
10936 return true;
10937
10938 // Try unary shuffle.
10939 SmallVector<int, 32> UnaryMask;
10940 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10941 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10942 if (MatchPACK(V1, V1, PackVT))
10943 return true;
10944 }
10945
10946 return false;
10947}
10948
10949 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10950 SDValue V2, ArrayRef<int> Mask,
10951 const X86Subtarget &Subtarget,
10952 SelectionDAG &DAG) {
10953 MVT PackVT;
10954 unsigned PackOpcode;
10955 unsigned SizeBits = VT.getSizeInBits();
10956 unsigned EltBits = VT.getScalarSizeInBits();
10957 unsigned MaxStages = Log2_32(64 / EltBits);
10958 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10959 Subtarget, MaxStages))
10960 return SDValue();
10961
10962 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10963 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10964
10965 // Don't lower multi-stage packs on AVX512, truncation is better.
10966 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10967 return SDValue();
10968
10969 // Pack to the largest type possible:
10970 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10971 unsigned MaxPackBits = 16;
10972 if (CurrentEltBits > 16 &&
10973 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10974 MaxPackBits = 32;
10975
10976 // Repeatedly pack down to the target size.
10977 SDValue Res;
10978 for (unsigned i = 0; i != NumStages; ++i) {
10979 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10980 unsigned NumSrcElts = SizeBits / SrcEltBits;
10981 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10982 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10983 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10984 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10985 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10986 DAG.getBitcast(SrcVT, V2));
10987 V1 = V2 = Res;
10988 CurrentEltBits /= 2;
10989 }
10990 assert(Res && Res.getValueType() == VT &&
10991 "Failed to lower compaction shuffle");
10992 return Res;
10993}
10994
10995/// Try to emit a bitmask instruction for a shuffle.
10996///
10997/// This handles cases where we can model a blend exactly as a bitmask due to
10998 /// one of the inputs being zeroable.
10999 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11000 SDValue V2, ArrayRef<int> Mask,
11001 const APInt &Zeroable,
11002 const X86Subtarget &Subtarget,
11003 SelectionDAG &DAG) {
11004 MVT MaskVT = VT;
11005 MVT EltVT = VT.getVectorElementType();
11006 SDValue Zero, AllOnes;
11007 // Use f64 if i64 isn't legal.
11008 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11009 EltVT = MVT::f64;
11010 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11011 }
11012
11013 MVT LogicVT = VT;
11014 if (EltVT.isFloatingPoint()) {
11015 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11016 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11017 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11018 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11019 } else {
11020 Zero = DAG.getConstant(0, DL, EltVT);
11021 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11022 }
11023
11024 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11025 SDValue V;
11026 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11027 if (Zeroable[i])
11028 continue;
11029 if (Mask[i] % Size != i)
11030 return SDValue(); // Not a blend.
11031 if (!V)
11032 V = Mask[i] < Size ? V1 : V2;
11033 else if (V != (Mask[i] < Size ? V1 : V2))
11034 return SDValue(); // Can only let one input through the mask.
11035
11036 VMaskOps[i] = AllOnes;
11037 }
11038 if (!V)
11039 return SDValue(); // No non-zeroable elements!
11040
11041 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11042 VMask = DAG.getBitcast(LogicVT, VMask);
11043 V = DAG.getBitcast(LogicVT, V);
11044 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11045 return DAG.getBitcast(VT, And);
11046}
11047
11048/// Try to emit a blend instruction for a shuffle using bit math.
11049///
11050/// This is used as a fallback approach when first class blend instructions are
11051/// unavailable. Currently it is only suitable for integer vectors, but could
11052 /// be generalized for floating point vectors if desirable.
11053 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11054 SDValue V2, ArrayRef<int> Mask,
11055 SelectionDAG &DAG) {
11056 assert(VT.isInteger() && "Only supports integer vector types!");
11057 MVT EltVT = VT.getVectorElementType();
11058 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11059 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11060 SmallVector<SDValue, 16> MaskOps;
11061 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11062 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11063 return SDValue(); // Shuffled input!
11064 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11065 }
11066
11067 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11068 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11069}
11070
11071 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11072 SDValue PreservedSrc,
11073 const X86Subtarget &Subtarget,
11074 SelectionDAG &DAG);
11075
11076 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11077 MutableArrayRef<int> Mask,
11078 const APInt &Zeroable, bool &ForceV1Zero,
11079 bool &ForceV2Zero, uint64_t &BlendMask) {
11080 bool V1IsZeroOrUndef =
11081 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11082 bool V2IsZeroOrUndef =
11083 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11084
11085 BlendMask = 0;
11086 ForceV1Zero = false, ForceV2Zero = false;
11087 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11088
11089 int NumElts = Mask.size();
11090 int NumLanes = VT.getSizeInBits() / 128;
11091 int NumEltsPerLane = NumElts / NumLanes;
11092 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11093
11094 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11095 // then ensure the blend mask part for that lane just references that input.
11096 bool ForceWholeLaneMasks =
11097 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11098
11099 // Attempt to generate the binary blend mask. If an input is zero then
11100 // we can use any lane.
11101 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11102 // Keep track of the inputs used per lane.
11103 bool LaneV1InUse = false;
11104 bool LaneV2InUse = false;
11105 uint64_t LaneBlendMask = 0;
11106 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11107 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11108 int M = Mask[Elt];
11109 if (M == SM_SentinelUndef)
11110 continue;
11111 if (M == Elt || (0 <= M && M < NumElts &&
11112 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11113 Mask[Elt] = Elt;
11114 LaneV1InUse = true;
11115 continue;
11116 }
11117 if (M == (Elt + NumElts) ||
11118 (NumElts <= M &&
11119 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 if (Zeroable[Elt]) {
11126 if (V1IsZeroOrUndef) {
11127 ForceV1Zero = true;
11128 Mask[Elt] = Elt;
11129 LaneV1InUse = true;
11130 continue;
11131 }
11132 if (V2IsZeroOrUndef) {
11133 ForceV2Zero = true;
11134 LaneBlendMask |= 1ull << LaneElt;
11135 Mask[Elt] = Elt + NumElts;
11136 LaneV2InUse = true;
11137 continue;
11138 }
11139 }
11140 return false;
11141 }
11142
11143 // If we only used V2 then splat the lane blend mask to avoid any demanded
11144 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11145 // blend mask bit).
11146 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11147 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11148
11149 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11150 }
11151 return true;
11152}
11153
11154/// Try to emit a blend instruction for a shuffle.
11155///
11156/// This doesn't do any checks for the availability of instructions for blending
11157/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11158/// be matched in the backend with the type given. What it does check for is
11159 /// that the shuffle mask is a blend, or convertible into a blend with zero.
11160 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11161 SDValue V2, ArrayRef<int> Original,
11162 const APInt &Zeroable,
11163 const X86Subtarget &Subtarget,
11164 SelectionDAG &DAG) {
11165 uint64_t BlendMask = 0;
11166 bool ForceV1Zero = false, ForceV2Zero = false;
11167 SmallVector<int, 64> Mask(Original);
11168 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11169 BlendMask))
11170 return SDValue();
11171
11172 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11173 if (ForceV1Zero)
11174 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11175 if (ForceV2Zero)
11176 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11177
11178 unsigned NumElts = VT.getVectorNumElements();
11179
11180 switch (VT.SimpleTy) {
11181 case MVT::v4i64:
11182 case MVT::v8i32:
11183 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11184 [[fallthrough]];
11185 case MVT::v4f64:
11186 case MVT::v8f32:
11187 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11188 [[fallthrough]];
11189 case MVT::v2f64:
11190 case MVT::v2i64:
11191 case MVT::v4f32:
11192 case MVT::v4i32:
11193 case MVT::v8i16:
11194 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11195 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11196 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11197 case MVT::v16i16: {
11198 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11199 SmallVector<int, 8> RepeatedMask;
11200 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11201 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11202 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11203 BlendMask = 0;
11204 for (int i = 0; i < 8; ++i)
11205 if (RepeatedMask[i] >= 8)
11206 BlendMask |= 1ull << i;
11207 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11208 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11209 }
11210 // Use PBLENDW for lower/upper lanes and then blend lanes.
11211 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11212 // merge to VSELECT where useful.
11213 uint64_t LoMask = BlendMask & 0xFF;
11214 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11215 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11216 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11217 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11218 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11219 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11220 return DAG.getVectorShuffle(
11221 MVT::v16i16, DL, Lo, Hi,
11222 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11223 }
11224 [[fallthrough]];
11225 }
11226 case MVT::v32i8:
11227 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11228 [[fallthrough]];
11229 case MVT::v16i8: {
11230 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11231
11232 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11233 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11234 Subtarget, DAG))
11235 return Masked;
11236
11237 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11238 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11239 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11240 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11241 }
11242
11243 // If we have VPTERNLOG, we can use that as a bit blend.
11244 if (Subtarget.hasVLX())
11245 if (SDValue BitBlend =
11246 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11247 return BitBlend;
11248
11249 // Scale the blend by the number of bytes per element.
11250 int Scale = VT.getScalarSizeInBits() / 8;
11251
11252 // This form of blend is always done on bytes. Compute the byte vector
11253 // type.
11254 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11255
11256 // x86 allows load folding with blendvb from the 2nd source operand. But
11257 // we are still using LLVM select here (see comment below), so that's V1.
11258 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11259 // allow that load-folding possibility.
11260 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11261 ShuffleVectorSDNode::commuteMask(Mask);
11262 std::swap(V1, V2);
11263 }
11264
11265 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11266 // mix of LLVM's code generator and the x86 backend. We tell the code
11267 // generator that boolean values in the elements of an x86 vector register
11268 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11269 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11270 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11271 // of the element (the remaining are ignored) and 0 in that high bit would
11272 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11273 // the LLVM model for boolean values in vector elements gets the relevant
11274 // bit set, it is set backwards and over constrained relative to x86's
11275 // actual model.
11276 SmallVector<SDValue, 32> VSELECTMask;
11277 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11278 for (int j = 0; j < Scale; ++j)
11279 VSELECTMask.push_back(
11280 Mask[i] < 0
11281 ? DAG.getUNDEF(MVT::i8)
11282 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11283
11284 V1 = DAG.getBitcast(BlendVT, V1);
11285 V2 = DAG.getBitcast(BlendVT, V2);
11286 return DAG.getBitcast(
11287 VT,
11288 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11289 V1, V2));
11290 }
11291 case MVT::v16f32:
11292 case MVT::v8f64:
11293 case MVT::v8i64:
11294 case MVT::v16i32:
11295 case MVT::v32i16:
11296 case MVT::v64i8: {
11297 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11298 bool OptForSize = DAG.shouldOptForSize();
11299 if (!OptForSize) {
11300 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11301 Subtarget, DAG))
11302 return Masked;
11303 }
11304
11305 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11306 // masked move.
11307 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11308 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11309 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11310 }
11311 default:
11312 llvm_unreachable("Not a supported integer vector type!");
11313 }
11314}
11315
11316/// Try to lower as a blend of elements from two inputs followed by
11317/// a single-input permutation.
11318///
11319/// This matches the pattern where we can blend elements from two inputs and
11320 /// then reduce the shuffle to a single-input permutation.
11321 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11322 SDValue V1, SDValue V2,
11323 ArrayRef<int> Mask,
11324 SelectionDAG &DAG,
11325 bool ImmBlends = false) {
11326 // We build up the blend mask while checking whether a blend is a viable way
11327 // to reduce the shuffle.
11328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11330
11331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11332 if (Mask[i] < 0)
11333 continue;
11334
11335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11336
11337 if (BlendMask[Mask[i] % Size] < 0)
11338 BlendMask[Mask[i] % Size] = Mask[i];
11339 else if (BlendMask[Mask[i] % Size] != Mask[i])
11340 return SDValue(); // Can't blend in the needed input!
11341
11342 PermuteMask[i] = Mask[i] % Size;
11343 }
11344
11345 // If only immediate blends, then bail if the blend mask can't be widened to
11346 // i16.
11347 unsigned EltSize = VT.getScalarSizeInBits();
11348 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11349 return SDValue();
11350
11351 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11352 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11353}
11354
11355/// Try to lower as an unpack of elements from two inputs followed by
11356/// a single-input permutation.
11357///
11358/// This matches the pattern where we can unpack elements from two inputs and
11359/// then reduce the shuffle to a single-input (wider) permutation.
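///
/// For example (illustrative): the v4i32 mask <1, 5, 0, 4> can be lowered as
/// UNPCKL(V1, V2) (producing <0, 4, 1, 5>) followed by the single-input
/// permute <2, 3, 0, 1>.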
11360 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11361 SDValue V1, SDValue V2,
11362 ArrayRef<int> Mask,
11363 SelectionDAG &DAG) {
11364 int NumElts = Mask.size();
11365 int NumLanes = VT.getSizeInBits() / 128;
11366 int NumLaneElts = NumElts / NumLanes;
11367 int NumHalfLaneElts = NumLaneElts / 2;
11368
11369 bool MatchLo = true, MatchHi = true;
11370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11371
11372 // Determine UNPCKL/UNPCKH type and operand order.
11373 for (int Elt = 0; Elt != NumElts; ++Elt) {
11374 int M = Mask[Elt];
11375 if (M < 0)
11376 continue;
11377
11378 // Normalize the mask value depending on whether it's V1 or V2.
11379 int NormM = M;
11380 SDValue &Op = Ops[Elt & 1];
11381 if (M < NumElts && (Op.isUndef() || Op == V1))
11382 Op = V1;
11383 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11384 Op = V2;
11385 NormM -= NumElts;
11386 } else
11387 return SDValue();
11388
11389 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11390 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11391 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11392 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11393 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11394 if (MatchLoAnyLane || MatchHiAnyLane) {
11395 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11396 "Failed to match UNPCKLO/UNPCKHI");
11397 break;
11398 }
11399 }
11400 MatchLo &= MatchLoAnyLane;
11401 MatchHi &= MatchHiAnyLane;
11402 if (!MatchLo && !MatchHi)
11403 return SDValue();
11404 }
11405 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11406
11407 // Element indices have changed after unpacking. Calculate permute mask
11408 // so that they will be put back to the position as dictated by the
11409 // original shuffle mask indices.
11410 SmallVector<int, 32> PermuteMask(NumElts, -1);
11411 for (int Elt = 0; Elt != NumElts; ++Elt) {
11412 int M = Mask[Elt];
11413 if (M < 0)
11414 continue;
11415 int NormM = M;
11416 if (NumElts <= M)
11417 NormM -= NumElts;
11418 bool IsFirstOp = M < NumElts;
11419 int BaseMaskElt =
11420 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11421 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11422 PermuteMask[Elt] = BaseMaskElt;
11423 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11424 PermuteMask[Elt] = BaseMaskElt + 1;
11425 assert(PermuteMask[Elt] != -1 &&
11426 "Input mask element is defined but failed to assign permute mask");
11427 }
11428
11429 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11430 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11431 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11432}
11433
11434/// Try to lower a shuffle as a permute of the inputs followed by an
11435/// UNPCK instruction.
11436///
11437 /// This specifically targets cases where we end up alternating between
11438/// the two inputs, and so can permute them into something that feeds a single
11439/// UNPCK instruction. Note that this routine only targets integer vectors
11440/// because for floating point vectors we have a generalized SHUFPS lowering
11441/// strategy that handles everything that doesn't *exactly* match an unpack,
11442/// making this clever lowering unnecessary.
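///
/// For example (illustrative): the v4i32 mask <0, 6, 1, 7> can be handled by
/// pre-shuffling V2 with <2, 3, -1, -1> (V1 is left as is) and then taking
/// UNPCKL of V1 and the shuffled V2.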
11443 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11444 SDValue V1, SDValue V2,
11445 ArrayRef<int> Mask,
11446 const X86Subtarget &Subtarget,
11447 SelectionDAG &DAG) {
11448 int Size = Mask.size();
11449 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11450
11451 // This routine only supports 128-bit integer dual input vectors.
11452 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11453 return SDValue();
11454
11455 int NumLoInputs =
11456 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11457 int NumHiInputs =
11458 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11459
11460 bool UnpackLo = NumLoInputs >= NumHiInputs;
11461
11462 auto TryUnpack = [&](int ScalarSize, int Scale) {
11463 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11464 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11465
11466 for (int i = 0; i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 // Each element of the unpack contains Scale elements from this mask.
11471 int UnpackIdx = i / Scale;
11472
11473 // We only handle the case where V1 feeds the first slots of the unpack.
11474 // We rely on canonicalization to ensure this is the case.
11475 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11476 return SDValue();
11477
11478 // Setup the mask for this input. The indexing is tricky as we have to
11479 // handle the unpack stride.
11480 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11481 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11482 Mask[i] % Size;
11483 }
11484
11485 // If we will have to shuffle both inputs to use the unpack, check whether
11486 // we can just unpack first and shuffle the result. If so, skip this unpack.
11487 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11488 !isNoopShuffleMask(V2Mask))
11489 return SDValue();
11490
11491 // Shuffle the inputs into place.
11492 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11493 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11494
11495 // Cast the inputs to the type we will use to unpack them.
11496 MVT UnpackVT =
11497 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11498 V1 = DAG.getBitcast(UnpackVT, V1);
11499 V2 = DAG.getBitcast(UnpackVT, V2);
11500
11501 // Unpack the inputs and cast the result back to the desired type.
11502 return DAG.getBitcast(
11503 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11504 UnpackVT, V1, V2));
11505 };
11506
11507 // We try each unpack from the largest to the smallest to try and find one
11508 // that fits this mask.
11509 int OrigScalarSize = VT.getScalarSizeInBits();
11510 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11511 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11512 return Unpack;
11513
11514 // If we're shuffling with a zero vector then we're better off not doing
11515 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11516 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11517 ISD::isBuildVectorAllZeros(V2.getNode()))
11518 return SDValue();
11519
11520 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11521 // initial unpack.
11522 if (NumLoInputs == 0 || NumHiInputs == 0) {
11523 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11524 "We have to have *some* inputs!");
11525 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11526
11527 // FIXME: We could consider the total complexity of the permute of each
11528 // possible unpacking. Or at the least we should consider how many
11529 // half-crossings are created.
11530 // FIXME: We could consider commuting the unpacks.
11531
11532 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11533 for (int i = 0; i < Size; ++i) {
11534 if (Mask[i] < 0)
11535 continue;
11536
11537 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11538
11539 PermMask[i] =
11540 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11541 }
11542 return DAG.getVectorShuffle(
11543 VT, DL,
11544 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11545 V1, V2),
11546 DAG.getUNDEF(VT), PermMask);
11547 }
11548
11549 return SDValue();
11550}
11551
11552/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11553/// permuting the elements of the result in place.
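/// For example (one illustrative case, assuming SSSE3 is available): for a
/// v8i16 mask <0, 12, 1, 13, 2, 14, 3, 15>, rotating the concatenated inputs
/// by 8 bytes yields <12, 13, 14, 15, 0, 1, 2, 3> (in original mask indices),
/// which the in-place permute <4, 0, 5, 1, 6, 2, 7, 3> turns into the desired
/// result.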
11554 static SDValue lowerShuffleAsByteRotateAndPermute(
11555 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11557 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11558 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11559 (VT.is512BitVector() && !Subtarget.hasBWI()))
11560 return SDValue();
11561
11562 // We don't currently support lane crossing permutes.
11563 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11564 return SDValue();
11565
11566 int Scale = VT.getScalarSizeInBits() / 8;
11567 int NumLanes = VT.getSizeInBits() / 128;
11568 int NumElts = VT.getVectorNumElements();
11569 int NumEltsPerLane = NumElts / NumLanes;
11570
11571 // Determine range of mask elts.
11572 bool Blend1 = true;
11573 bool Blend2 = true;
11574 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11575 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11576 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11577 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11578 int M = Mask[Lane + Elt];
11579 if (M < 0)
11580 continue;
11581 if (M < NumElts) {
11582 Blend1 &= (M == (Lane + Elt));
11583 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11584 M = M % NumEltsPerLane;
11585 Range1.first = std::min(Range1.first, M);
11586 Range1.second = std::max(Range1.second, M);
11587 } else {
11588 M -= NumElts;
11589 Blend2 &= (M == (Lane + Elt));
11590 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11591 M = M % NumEltsPerLane;
11592 Range2.first = std::min(Range2.first, M);
11593 Range2.second = std::max(Range2.second, M);
11594 }
11595 }
11596 }
11597
11598 // Bail if we don't need both elements.
11599 // TODO - it might be worth doing this for unary shuffles if the permute
11600 // can be widened.
11601 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11602 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11603 return SDValue();
11604
11605 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11606 return SDValue();
11607
11608 // Rotate the 2 ops so we can access both ranges, then permute the result.
11609 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11610 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11611 SDValue Rotate = DAG.getBitcast(
11612 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11613 DAG.getBitcast(ByteVT, Lo),
11614 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11615 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11616 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11617 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11618 int M = Mask[Lane + Elt];
11619 if (M < 0)
11620 continue;
11621 if (M < NumElts)
11622 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11623 else
11624 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11625 }
11626 }
11627 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11628 };
11629
11630 // Check if the ranges are small enough to rotate from either direction.
11631 if (Range2.second < Range1.first)
11632 return RotateAndPermute(V1, V2, Range1.first, 0);
11633 if (Range1.second < Range2.first)
11634 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11635 return SDValue();
11636}
11637
11638 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11639 return isUndefOrEqual(Mask, 0);
11640}
11641
11642 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11643 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11644}
11645
11646/// Check if the Mask consists of the same element repeated multiple times.
11647 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11648 size_t NumUndefs = 0;
11649 std::optional<int> UniqueElt;
11650 for (int Elt : Mask) {
11651 if (Elt == SM_SentinelUndef) {
11652 NumUndefs++;
11653 continue;
11654 }
11655 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11656 return false;
11657 UniqueElt = Elt;
11658 }
11659 // Make sure the element is repeated enough times by checking the number of
11660 // undefs is small.
11661 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11662}
11663
11664/// Generic routine to decompose a shuffle and blend into independent
11665/// blends and permutes.
11666///
11667/// This matches the extremely common pattern for handling combined
11668/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11669/// operations. It will try to pick the best arrangement of shuffles and
11670/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
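///
/// For example (illustrative): the v4i32 mask <3, 7, 1, 5> decomposes into the
/// V1 permute <3, -1, 1, -1>, the V2 permute <-1, 3, -1, 1> and the final
/// blend <0, 5, 2, 7>.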
11671 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11672 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11673 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11674 int NumElts = Mask.size();
11675 int NumLanes = VT.getSizeInBits() / 128;
11676 int NumEltsPerLane = NumElts / NumLanes;
11677
11678 // Shuffle the input elements into the desired positions in V1 and V2 and
11679 // unpack/blend them together.
11680 bool IsAlternating = true;
11681 bool V1Zero = true, V2Zero = true;
11682 SmallVector<int, 32> V1Mask(NumElts, -1);
11683 SmallVector<int, 32> V2Mask(NumElts, -1);
11684 SmallVector<int, 32> FinalMask(NumElts, -1);
11685 for (int i = 0; i < NumElts; ++i) {
11686 int M = Mask[i];
11687 if (M >= 0 && M < NumElts) {
11688 V1Mask[i] = M;
11689 FinalMask[i] = i;
11690 V1Zero &= Zeroable[i];
11691 IsAlternating &= (i & 1) == 0;
11692 } else if (M >= NumElts) {
11693 V2Mask[i] = M - NumElts;
11694 FinalMask[i] = i + NumElts;
11695 V2Zero &= Zeroable[i];
11696 IsAlternating &= (i & 1) == 1;
11697 }
11698 }
11699
11700 // If we effectively only demand the 0'th element of \p Input, and not just
11701 // at the 0'th position of the output, then broadcast said input,
11702 // and change \p InputMask to be a no-op (identity) mask.
11703 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11704 &DAG](SDValue &Input,
11705 MutableArrayRef<int> InputMask) {
11706 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11707 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11708 !X86::mayFoldLoad(Input, Subtarget)))
11709 return;
11710 if (isNoopShuffleMask(InputMask))
11711 return;
11712 assert(isBroadcastShuffleMask(InputMask) &&
11713 "Expected to demand only the 0'th element.");
11714 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11715 for (auto I : enumerate(InputMask)) {
11716 int &InputMaskElt = I.value();
11717 if (InputMaskElt >= 0)
11718 InputMaskElt = I.index();
11719 }
11720 };
11721
11722 // Currently, we may need to produce one shuffle per input, and blend results.
11723 // It is possible that the shuffle for one of the inputs is already a no-op.
11724 // See if we can simplify non-no-op shuffles into broadcasts,
11725 // which we consider to be strictly better than an arbitrary shuffle.
11726 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11727 isNoopOrBroadcastShuffleMask(V2Mask)) {
11728 canonicalizeBroadcastableInput(V1, V1Mask);
11729 canonicalizeBroadcastableInput(V2, V2Mask);
11730 }
11731
11732 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11733 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11734 // the shuffle may be able to fold with a load or other benefit. However, when
11735 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11736 // pre-shuffle first is a better strategy.
11737 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11738 // If we don't have blends, see if we can create a cheap unpack.
11739 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11740 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11741 is128BitUnpackShuffleMask(V2Mask, DAG)))
11742 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11743 DL, VT, V1, V2, Mask, Subtarget, DAG))
11744 return PermUnpack;
11745
11746 // Only prefer immediate blends to unpack/rotate.
11747 if (SDValue BlendPerm =
11748 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11749 return BlendPerm;
11750
11751 // If either input vector provides only a single element which is repeated
11752 // multiple times, unpacking from both input vectors would generate worse
11753 // code. e.g. for
11754 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11755 // it is better to process t4 first to create a vector of t4[0], then unpack
11756 // that vector with t2.
11757 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11758 !isSingleElementRepeatedMask(V2Mask))
11759 if (SDValue UnpackPerm =
11760 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11761 return UnpackPerm;
11762
11763 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11764 DL, VT, V1, V2, Mask, Subtarget, DAG))
11765 return RotatePerm;
11766
11767 // Unpack/rotate failed - try again with variable blends.
11768 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11769 DAG))
11770 return BlendPerm;
11771
11772 if (VT.getScalarSizeInBits() >= 32)
11773 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11774 DL, VT, V1, V2, Mask, Subtarget, DAG))
11775 return PermUnpack;
11776 }
11777
11778 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11779 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11780 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11781 // than half the elements coming from each source.
11782 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11783 V1Mask.assign(NumElts, -1);
11784 V2Mask.assign(NumElts, -1);
11785 FinalMask.assign(NumElts, -1);
11786 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11787 for (int j = 0; j != NumEltsPerLane; ++j) {
11788 int M = Mask[i + j];
11789 if (M >= 0 && M < NumElts) {
11790 V1Mask[i + (j / 2)] = M;
11791 FinalMask[i + j] = i + (j / 2);
11792 } else if (M >= NumElts) {
11793 V2Mask[i + (j / 2)] = M - NumElts;
11794 FinalMask[i + j] = i + (j / 2) + NumElts;
11795 }
11796 }
11797 }
11798
11799 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11801 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11802}
11803
11804static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11805 const X86Subtarget &Subtarget,
11806 ArrayRef<int> Mask) {
11807 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11808 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11809
11810 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11811 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11812 int MaxSubElts = 64 / EltSizeInBits;
11813 unsigned RotateAmt, NumSubElts;
11814 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11815 MaxSubElts, NumSubElts, RotateAmt))
11816 return -1;
11817 unsigned NumElts = Mask.size();
11818 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11819 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11820 return RotateAmt;
11821}
11822
11823/// Lower shuffle using X86ISD::VROTLI rotations.
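/// For example (illustrative, on XOP targets): the v16i8 mask
/// <1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14> swaps the bytes of
/// each i16 element and can be lowered as a VROTLI of v8i16 by 8 bits.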
11824 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11825 ArrayRef<int> Mask,
11826 const X86Subtarget &Subtarget,
11827 SelectionDAG &DAG) {
11828 // Only XOP + AVX512 targets have bit rotation instructions.
11829 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11830 bool IsLegal =
11831 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11832 if (!IsLegal && Subtarget.hasSSE3())
11833 return SDValue();
11834
11835 MVT RotateVT;
11836 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11837 Subtarget, Mask);
11838 if (RotateAmt < 0)
11839 return SDValue();
11840
11841 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11842 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11843 // widen to vXi16 or more then the existing lowering will be better.
11844 if (!IsLegal) {
11845 if ((RotateAmt % 16) == 0)
11846 return SDValue();
11847 // TODO: Use getTargetVShiftByConstNode.
11848 unsigned ShlAmt = RotateAmt;
11849 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11850 V1 = DAG.getBitcast(RotateVT, V1);
11851 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11852 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11853 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11854 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11855 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11856 return DAG.getBitcast(VT, Rot);
11857 }
11858
11859 SDValue Rot =
11860 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11861 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11862 return DAG.getBitcast(VT, Rot);
11863}
11864
11865/// Try to match a vector shuffle as an element rotation.
11866///
11867 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11868 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11869 ArrayRef<int> Mask) {
11870 int NumElts = Mask.size();
11871
11872 // We need to detect various ways of spelling a rotation:
11873 // [11, 12, 13, 14, 15, 0, 1, 2]
11874 // [-1, 12, 13, 14, -1, -1, 1, -1]
11875 // [-1, -1, -1, -1, -1, -1, 1, 2]
11876 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11877 // [-1, 4, 5, 6, -1, -1, 9, -1]
11878 // [-1, 4, 5, 6, -1, -1, -1, -1]
11879 int Rotation = 0;
11880 SDValue Lo, Hi;
11881 for (int i = 0; i < NumElts; ++i) {
11882 int M = Mask[i];
11883 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11884 "Unexpected mask index.");
11885 if (M < 0)
11886 continue;
11887
11888 // Determine where a rotated vector would have started.
11889 int StartIdx = i - (M % NumElts);
11890 if (StartIdx == 0)
11891 // The identity rotation isn't interesting, stop.
11892 return -1;
11893
11894 // If we found the tail of a vector the rotation must be the missing
11895 // front. If we found the head of a vector, it must be how much of the
11896 // head.
11897 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11898
11899 if (Rotation == 0)
11900 Rotation = CandidateRotation;
11901 else if (Rotation != CandidateRotation)
11902 // The rotations don't match, so we can't match this mask.
11903 return -1;
11904
11905 // Compute which value this mask is pointing at.
11906 SDValue MaskV = M < NumElts ? V1 : V2;
11907
11908 // Compute which of the two target values this index should be assigned
11909 // to. This reflects whether the high elements are remaining or the low
11910 // elements are remaining.
11911 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11912
11913 // Either set up this value if we've not encountered it before, or check
11914 // that it remains consistent.
11915 if (!TargetV)
11916 TargetV = MaskV;
11917 else if (TargetV != MaskV)
11918 // This may be a rotation, but it pulls from the inputs in some
11919 // unsupported interleaving.
11920 return -1;
11921 }
11922
11923 // Check that we successfully analyzed the mask, and normalize the results.
11924 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11925 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11926 if (!Lo)
11927 Lo = Hi;
11928 else if (!Hi)
11929 Hi = Lo;
11930
11931 V1 = Lo;
11932 V2 = Hi;
11933
11934 return Rotation;
11935}
11936
11937/// Try to lower a vector shuffle as a byte rotation.
11938///
11939/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11940/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11941/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11942 /// try to generically lower a vector shuffle through such a pattern. It
11943/// does not check for the profitability of lowering either as PALIGNR or
11944/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11945/// This matches shuffle vectors that look like:
11946///
11947/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11948///
11949/// Essentially it concatenates V1 and V2, shifts right by some number of
11950/// elements, and takes the low elements as the result. Note that while this is
11951/// specified as a *right shift* because x86 is little-endian, it is a *left
11952/// rotate* of the vector lanes.
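///
/// For the v8i16 example above, the element rotation is 3, which scales to a
/// byte rotation (PALIGNR immediate) of 3 * 2 == 6.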
11953 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11954 ArrayRef<int> Mask) {
11955 // Don't accept any shuffles with zero elements.
11956 if (isAnyZero(Mask))
11957 return -1;
11958
11959 // PALIGNR works on 128-bit lanes.
11960 SmallVector<int, 16> RepeatedMask;
11961 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11962 return -1;
11963
11964 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11965 if (Rotation <= 0)
11966 return -1;
11967
11968 // PALIGNR rotates bytes, so we need to scale the
11969 // rotation based on how many bytes are in the vector lane.
11970 int NumElts = RepeatedMask.size();
11971 int Scale = 16 / NumElts;
11972 return Rotation * Scale;
11973}
11974
11975 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11976 SDValue V2, ArrayRef<int> Mask,
11977 const X86Subtarget &Subtarget,
11978 SelectionDAG &DAG) {
11979 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11980
11981 SDValue Lo = V1, Hi = V2;
11982 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11983 if (ByteRotation <= 0)
11984 return SDValue();
11985
11986 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11987 // PSLLDQ/PSRLDQ.
11988 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11989 Lo = DAG.getBitcast(ByteVT, Lo);
11990 Hi = DAG.getBitcast(ByteVT, Hi);
11991
11992 // SSSE3 targets can use the palignr instruction.
11993 if (Subtarget.hasSSSE3()) {
11994 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11995 "512-bit PALIGNR requires BWI instructions");
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11998 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11999 }
12000
12001 assert(VT.is128BitVector() &&
12002 "Rotate-based lowering only supports 128-bit lowering!");
12003 assert(Mask.size() <= 16 &&
12004 "Can shuffle at most 16 bytes in a 128-bit vector!");
12005 assert(ByteVT == MVT::v16i8 &&
12006 "SSE2 rotate lowering only needed for v16i8!");
12007
12008 // Default SSE2 implementation
12009 int LoByteShift = 16 - ByteRotation;
12010 int HiByteShift = ByteRotation;
12011
12012 SDValue LoShift =
12013 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12014 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12015 SDValue HiShift =
12016 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12017 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12018 return DAG.getBitcast(VT,
12019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12020}
12021
12022/// Try to lower a vector shuffle as a dword/qword rotation.
12023///
12024 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12025 /// rotation of the concatenation of two vectors; this routine will
12026 /// try to generically lower a vector shuffle through such a pattern.
12027///
12028/// Essentially it concatenates V1 and V2, shifts right by some number of
12029/// elements, and takes the low elements as the result. Note that while this is
12030/// specified as a *right shift* because x86 is little-endian, it is a *left
12031/// rotate* of the vector lanes.
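///
/// For example (illustrative, given the required AVX512 features): the v8i32
/// mask <3, 4, 5, 6, 7, 8, 9, 10> matches an element rotation of 3 and maps to
/// a single VALIGND with immediate 3.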
12032 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12033 SDValue V2, ArrayRef<int> Mask,
12034 const APInt &Zeroable,
12035 const X86Subtarget &Subtarget,
12036 SelectionDAG &DAG) {
12037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12038 "Only 32-bit and 64-bit elements are supported!");
12039
12040 // 128/256-bit vectors are only supported with VLX.
12041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12042 && "VLX required for 128/256-bit vectors");
12043
12044 SDValue Lo = V1, Hi = V2;
12045 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12046 if (0 < Rotation)
12047 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12048 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12049
12050 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12051 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12052 // TODO: We can probably make this more aggressive and use shift-pairs like
12053 // lowerShuffleAsByteShiftMask.
12054 unsigned NumElts = Mask.size();
12055 unsigned ZeroLo = Zeroable.countr_one();
12056 unsigned ZeroHi = Zeroable.countl_one();
12057 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12058 if (!ZeroLo && !ZeroHi)
12059 return SDValue();
12060
12061 if (ZeroLo) {
12062 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12063 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12064 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12065 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12066 getZeroVector(VT, Subtarget, DAG, DL),
12067 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12068 }
12069
12070 if (ZeroHi) {
12071 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12072 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12073 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12074 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12075 getZeroVector(VT, Subtarget, DAG, DL), Src,
12076 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12077 }
12078
12079 return SDValue();
12080}
12081
12082/// Try to lower a vector shuffle as a byte shift sequence.
12083 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12084 SDValue V2, ArrayRef<int> Mask,
12085 const APInt &Zeroable,
12086 const X86Subtarget &Subtarget,
12087 SelectionDAG &DAG) {
12088 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12089 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12090
12091 // We need a shuffle that has zeros at one/both ends and a sequential
12092 // shuffle from one source within.
12093 unsigned ZeroLo = Zeroable.countr_one();
12094 unsigned ZeroHi = Zeroable.countl_one();
12095 if (!ZeroLo && !ZeroHi)
12096 return SDValue();
12097
12098 unsigned NumElts = Mask.size();
12099 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12100 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12101 return SDValue();
12102
12103 unsigned Scale = VT.getScalarSizeInBits() / 8;
12104 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12105 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12106 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12107 return SDValue();
12108
12109 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12110 Res = DAG.getBitcast(MVT::v16i8, Res);
12111
12112 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12113 // inner sequential set of elements, possibly offset:
12114 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12115 // 01234567 --> 4567zzzz --> zzzzz456
12116 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12117 if (ZeroLo == 0) {
12118 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12119 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12120 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12123 } else if (ZeroHi == 0) {
12124 unsigned Shift = Mask[ZeroLo] % NumElts;
12125 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12129 } else if (!Subtarget.hasSSSE3()) {
12130 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12131 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12132 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12133 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12134 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Shift += Mask[ZeroLo] % NumElts;
12137 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12138 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12139 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12140 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12141 } else
12142 return SDValue();
12143
12144 return DAG.getBitcast(VT, Res);
12145}
12146
12147/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12148///
12149/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12150/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12151/// matches elements from one of the input vectors shuffled to the left or
12152/// right with zeroable elements 'shifted in'. It handles both the strictly
12153/// bit-wise element shifts and the byte shift across an entire 128-bit double
12154/// quad word lane.
12155///
12156/// PSHL : (little-endian) left bit shift.
12157/// [ zz, 0, zz, 2 ]
12158/// [ -1, 4, zz, -1 ]
12159/// PSRL : (little-endian) right bit shift.
12160/// [ 1, zz, 3, zz]
12161/// [ -1, -1, 7, zz]
12162/// PSLLDQ : (little-endian) left byte shift
12163/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12164/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12165/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12166/// PSRLDQ : (little-endian) right byte shift
12167/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12168/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12169/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12170static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12171 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12172 int MaskOffset, const APInt &Zeroable,
12173 const X86Subtarget &Subtarget) {
12174 int Size = Mask.size();
12175 unsigned SizeInBits = Size * ScalarSizeInBits;
12176
12177 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12178 for (int i = 0; i < Size; i += Scale)
12179 for (int j = 0; j < Shift; ++j)
12180 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12181 return false;
12182
12183 return true;
12184 };
12185
12186 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12187 for (int i = 0; i != Size; i += Scale) {
12188 unsigned Pos = Left ? i + Shift : i;
12189 unsigned Low = Left ? i : i + Shift;
12190 unsigned Len = Scale - Shift;
12191 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12192 return -1;
12193 }
12194
12195 int ShiftEltBits = ScalarSizeInBits * Scale;
12196 bool ByteShift = ShiftEltBits > 64;
12197 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12198 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12199 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12200
12201 // Normalize the scale for byte shifts to still produce an i64 element
12202 // type.
12203 Scale = ByteShift ? Scale / 2 : Scale;
12204
12205 // We need to round trip through the appropriate type for the shift.
12206 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12207 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12208 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12209 return (int)ShiftAmt;
12210 };
12211
12212 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12213 // keep doubling the size of the integer elements up to that. We can
12214 // then shift the elements of the integer vector by whole multiples of
12215 // their width within the elements of the larger integer vector. Test each
12216 // multiple to see if we can find a match with the moved element indices
12217 // and that the shifted in elements are all zeroable.
12218 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12219 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12220 for (int Shift = 1; Shift != Scale; ++Shift)
12221 for (bool Left : {true, false})
12222 if (CheckZeros(Shift, Scale, Left)) {
12223 int ShiftAmt = MatchShift(Shift, Scale, Left);
12224 if (0 < ShiftAmt)
12225 return ShiftAmt;
12226 }
12227
12228 // no match
12229 return -1;
12230}
12231
12232 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12233 SDValue V2, ArrayRef<int> Mask,
12234 const APInt &Zeroable,
12235 const X86Subtarget &Subtarget,
12236 SelectionDAG &DAG, bool BitwiseOnly) {
12237 int Size = Mask.size();
12238 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12239
12240 MVT ShiftVT;
12241 SDValue V = V1;
12242 unsigned Opcode;
12243
12244 // Try to match shuffle against V1 shift.
12245 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12246 Mask, 0, Zeroable, Subtarget);
12247
12248 // If V1 failed, try to match shuffle against V2 shift.
12249 if (ShiftAmt < 0) {
12250 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12251 Mask, Size, Zeroable, Subtarget);
12252 V = V2;
12253 }
12254
12255 if (ShiftAmt < 0)
12256 return SDValue();
12257
12258 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12259 return SDValue();
12260
12261 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12262 "Illegal integer vector type");
12263 V = DAG.getBitcast(ShiftVT, V);
12264 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12265 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12266 return DAG.getBitcast(VT, V);
12267}
12268
12269// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12270// Remainder of lower half result is zero and upper half is all undef.
12271static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12272 ArrayRef<int> Mask, uint64_t &BitLen,
12273 uint64_t &BitIdx, const APInt &Zeroable) {
12274 int Size = Mask.size();
12275 int HalfSize = Size / 2;
12276 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12277 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12278
12279 // Upper half must be undefined.
12280 if (!isUndefUpperHalf(Mask))
12281 return false;
12282
12283 // Determine the extraction length from the part of the
12284 // lower half that isn't zeroable.
12285 int Len = HalfSize;
12286 for (; Len > 0; --Len)
12287 if (!Zeroable[Len - 1])
12288 break;
12289 assert(Len > 0 && "Zeroable shuffle mask");
12290
12291 // Attempt to match first Len sequential elements from the lower half.
12292 SDValue Src;
12293 int Idx = -1;
12294 for (int i = 0; i != Len; ++i) {
12295 int M = Mask[i];
12296 if (M == SM_SentinelUndef)
12297 continue;
12298 SDValue &V = (M < Size ? V1 : V2);
12299 M = M % Size;
12300
12301 // The extracted elements must start at a valid index and all mask
12302 // elements must be in the lower half.
12303 if (i > M || M >= HalfSize)
12304 return false;
12305
12306 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12307 Src = V;
12308 Idx = M - i;
12309 continue;
12310 }
12311 return false;
12312 }
12313
12314 if (!Src || Idx < 0)
12315 return false;
12316
12317 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12318 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12319 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12320 V1 = Src;
12321 return true;
12322}
12323
12324// INSERTQ: Extract lowest Len elements from lower half of second source and
12325// insert over first source, starting at Idx.
12326// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12327static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12328 ArrayRef<int> Mask, uint64_t &BitLen,
12329 uint64_t &BitIdx) {
12330 int Size = Mask.size();
12331 int HalfSize = Size / 2;
12332 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12333
12334 // Upper half must be undefined.
12335 if (!isUndefUpperHalf(Mask))
12336 return false;
12337
12338 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12339 SDValue Base;
12340
12341 // Attempt to match first source from mask before insertion point.
12342 if (isUndefInRange(Mask, 0, Idx)) {
12343 /* EMPTY */
12344 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12345 Base = V1;
12346 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12347 Base = V2;
12348 } else {
12349 continue;
12350 }
12351
12352 // Extend the extraction length looking to match both the insertion of
12353 // the second source and the remaining elements of the first.
12354 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12355 SDValue Insert;
12356 int Len = Hi - Idx;
12357
12358 // Match insertion.
12359 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12360 Insert = V1;
12361 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12362 Insert = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 // Match the remaining elements of the lower half.
12368 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12369 /* EMPTY */
12370 } else if ((!Base || (Base == V1)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12372 Base = V1;
12373 } else if ((!Base || (Base == V2)) &&
12374 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12375 Size + Hi)) {
12376 Base = V2;
12377 } else {
12378 continue;
12379 }
12380
12381 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12382 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12383 V1 = Base;
12384 V2 = Insert;
12385 return true;
12386 }
12387 }
12388
12389 return false;
12390}
12391
12392/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12393 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12394 SDValue V2, ArrayRef<int> Mask,
12395 const APInt &Zeroable, SelectionDAG &DAG) {
12396 uint64_t BitLen, BitIdx;
12397 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12398 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12399 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12400 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12401
12402 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12403 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12404 V2 ? V2 : DAG.getUNDEF(VT),
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 return SDValue();
12409}
12410
12411/// Lower a vector shuffle as an any/signed/zero extension.
12412///
12413/// Given a specific number of elements, element bit width, and extension
12414 /// stride, produce an extension based on the available
12415 /// features of the subtarget. The extended elements are consecutive and
12416 /// can start from an offset element index in the input; to
12417 /// avoid excess shuffling the offset must either be in the bottom lane
12418/// or at the start of a higher lane. All extended elements must be from
12419/// the same lane.
12420 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12421 int Scale, int Offset,
12422 unsigned ExtOpc, SDValue InputV,
12423 ArrayRef<int> Mask,
12424 const X86Subtarget &Subtarget,
12425 SelectionDAG &DAG) {
12426 assert(Scale > 1 && "Need a scale to extend.");
12427 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12428 int EltBits = VT.getScalarSizeInBits();
12429 int NumElements = VT.getVectorNumElements();
12430 int NumEltsPerLane = 128 / EltBits;
12431 int OffsetLane = Offset / NumEltsPerLane;
12432 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12433 "Only 8, 16, and 32 bit elements can be extended.");
12434 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12435 assert(0 <= Offset && "Extension offset must be positive.");
12436 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12437 "Extension offset must be in the first lane or start an upper lane.");
12438
12439 // Check that an index is in same lane as the base offset.
12440 auto SafeOffset = [&](int Idx) {
12441 return OffsetLane == (Idx / NumEltsPerLane);
12442 };
12443
12444 // Shift along an input so that the offset base moves to the first element.
12445 auto ShuffleOffset = [&](SDValue V) {
12446 if (!Offset)
12447 return V;
12448
12449 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12450 for (int i = 0; i * Scale < NumElements; ++i) {
12451 int SrcIdx = i + Offset;
12452 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12453 }
12454 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12455 };
12456
12457 // Found a valid a/zext mask! Try various lowering strategies based on the
12458 // input type and available ISA extensions.
12459 if (Subtarget.hasSSE41()) {
12460 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12461 // PUNPCK will catch this in a later shuffle match.
12462 if (Offset && Scale == 2 && VT.is128BitVector())
12463 return SDValue();
12464 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12465 NumElements / Scale);
12466 InputV = DAG.getBitcast(VT, InputV);
12467 InputV = ShuffleOffset(InputV);
12468 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12469 return DAG.getBitcast(VT, InputV);
12470 }
12471
12472 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12473 InputV = DAG.getBitcast(VT, InputV);
12474 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12475
12476 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12477 if (ExtOpc == ISD::SIGN_EXTEND)
12478 return SDValue();
12479
12480 // For any extends we can cheat for larger element sizes and use shuffle
12481 // instructions that can fold with a load and/or copy.
12482 if (AnyExt && EltBits == 32) {
12483 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12484 -1};
12485 return DAG.getBitcast(
12486 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12487 DAG.getBitcast(MVT::v4i32, InputV),
12488 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12489 }
12490 if (AnyExt && EltBits == 16 && Scale > 2) {
12491 int PSHUFDMask[4] = {Offset / 2, -1,
12492 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12493 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12494 DAG.getBitcast(MVT::v4i32, InputV),
12495 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12496 int PSHUFWMask[4] = {1, -1, -1, -1};
12497 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12498 return DAG.getBitcast(
12499 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12500 DAG.getBitcast(MVT::v8i16, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12502 }
12503
12504 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12505 // to 64-bits.
12506 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12507 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12508 assert(VT.is128BitVector() && "Unexpected vector width!");
12509
12510 int LoIdx = Offset * EltBits;
12511 SDValue Lo = DAG.getBitcast(
12512 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12513 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12514 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12515
12516 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12517 return DAG.getBitcast(VT, Lo);
12518
12519 int HiIdx = (Offset + 1) * EltBits;
12520 SDValue Hi = DAG.getBitcast(
12521 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12522 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12523 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12524 return DAG.getBitcast(VT,
12525 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12526 }
12527
12528 // If this would require more than 2 unpack instructions to expand, use
12529 // pshufb when available. We can only use more than 2 unpack instructions
12530 // when zero extending i8 elements which also makes it easier to use pshufb.
12531 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12532 assert(NumElements == 16 && "Unexpected byte vector width!");
12533 SDValue PSHUFBMask[16];
12534 for (int i = 0; i < 16; ++i) {
12535 int Idx = Offset + (i / Scale);
12536 if ((i % Scale == 0 && SafeOffset(Idx))) {
12537 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12538 continue;
12539 }
12540 PSHUFBMask[i] =
12541 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12542 }
12543 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12544 return DAG.getBitcast(
12545 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12546 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12547 }
12548
12549 // If we are extending from an offset, ensure we start on a boundary that
12550 // we can unpack from.
12551 int AlignToUnpack = Offset % (NumElements / Scale);
12552 if (AlignToUnpack) {
12553 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12554 for (int i = AlignToUnpack; i < NumElements; ++i)
12555 ShMask[i - AlignToUnpack] = i;
12556 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12557 Offset -= AlignToUnpack;
12558 }
12559
12560 // Otherwise emit a sequence of unpacks.
12561 do {
12562 unsigned UnpackLoHi = X86ISD::UNPCKL;
12563 if (Offset >= (NumElements / 2)) {
12564 UnpackLoHi = X86ISD::UNPCKH;
12565 Offset -= (NumElements / 2);
12566 }
12567
12568 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12569 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12570 : getZeroVector(InputVT, Subtarget, DAG, DL);
12571 InputV = DAG.getBitcast(InputVT, InputV);
12572 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12573 Scale /= 2;
12574 EltBits *= 2;
12575 NumElements /= 2;
12576 } while (Scale > 1);
12577 return DAG.getBitcast(VT, InputV);
12578}
12579
12580/// Try to lower a vector shuffle as a zero extension on any microarch.
12581///
12582/// This routine will try to do everything in its power to cleverly lower
12583/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12584/// check for the profitability of this lowering, it tries to aggressively
12585/// match this pattern. It will use all of the micro-architectural details it
12586/// can to emit an efficient lowering. It handles both blends with all-zero
12587/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12588/// masking out later).
12589///
12590/// The reason we have dedicated lowering for zext-style shuffles is that they
12591/// are both incredibly common and often quite performance sensitive.
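///
/// For example (illustrative): the v4i32 mask [ 4, zz, 5, zz ] (zz denotes a
/// zeroable element) zero-extends the low two i32 elements of V2 to i64,
/// e.g. via PMOVZXDQ on SSE4.1 targets.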
12592 static SDValue lowerShuffleAsZeroOrAnyExtend(
12593 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12594 const APInt &Zeroable, const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 int Bits = VT.getSizeInBits();
12597 int NumLanes = Bits / 128;
12598 int NumElements = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElements / NumLanes;
12600 assert(VT.getScalarSizeInBits() <= 32 &&
12601 "Exceeds 32-bit integer zero extension limit");
12602 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12603
12604 // Define a helper function to check a particular ext-scale and lower to it if
12605 // valid.
12606 auto Lower = [&](int Scale) -> SDValue {
12607 SDValue InputV;
12608 bool AnyExt = true;
12609 int Offset = 0;
12610 int Matches = 0;
12611 for (int i = 0; i < NumElements; ++i) {
12612 int M = Mask[i];
12613 if (M < 0)
12614 continue; // Valid anywhere but doesn't tell us anything.
12615 if (i % Scale != 0) {
12616 // Each of the extended elements need to be zeroable.
12617 if (!Zeroable[i])
12618 return SDValue();
12619
12620 // We no longer are in the anyext case.
12621 AnyExt = false;
12622 continue;
12623 }
12624
12625 // Each of the base elements needs to be consecutive indices into the
12626 // same input vector.
12627 SDValue V = M < NumElements ? V1 : V2;
12628 M = M % NumElements;
12629 if (!InputV) {
12630 InputV = V;
12631 Offset = M - (i / Scale);
12632 } else if (InputV != V)
12633 return SDValue(); // Flip-flopping inputs.
12634
12635 // Offset must start in the lowest 128-bit lane or at the start of an
12636 // upper lane.
12637 // FIXME: Is it ever worth allowing a negative base offset?
12638 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12639 (Offset % NumEltsPerLane) == 0))
12640 return SDValue();
12641
12642 // If we are offsetting, all referenced entries must come from the same
12643 // lane.
12644 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12645 return SDValue();
12646
12647 if ((M % NumElements) != (Offset + (i / Scale)))
12648 return SDValue(); // Non-consecutive strided elements.
12649 Matches++;
12650 }
12651
12652 // If we fail to find an input, we have a zero-shuffle which should always
12653 // have already been handled.
12654 // FIXME: Maybe handle this here in case during blending we end up with one?
12655 if (!InputV)
12656 return SDValue();
12657
12658 // If we are offsetting, don't extend if we only match a single input, we
12659 // can always do better by using a basic PSHUF or PUNPCK.
12660 if (Offset != 0 && Matches < 2)
12661 return SDValue();
12662
12663 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12664 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12665 InputV, Mask, Subtarget, DAG);
12666 };
12667
12668 // The widest scale possible for extending is to a 64-bit integer.
12669 assert(Bits % 64 == 0 &&
12670 "The number of bits in a vector must be divisible by 64 on x86!");
12671 int NumExtElements = Bits / 64;
12672
12673 // Each iteration, try extending the elements half as much, but into twice as
12674 // many elements.
12675 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12676 assert(NumElements % NumExtElements == 0 &&
12677 "The input vector size must be divisible by the extended size.");
12678 if (SDValue V = Lower(NumElements / NumExtElements))
12679 return V;
12680 }
12681
12682 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12683 if (Bits != 128)
12684 return SDValue();
12685
12686 // Returns one of the source operands if the shuffle can be reduced to a
12687 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12688 auto CanZExtLowHalf = [&]() {
12689 for (int i = NumElements / 2; i != NumElements; ++i)
12690 if (!Zeroable[i])
12691 return SDValue();
12692 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12693 return V1;
12694 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12695 return V2;
12696 return SDValue();
12697 };
12698
12699 if (SDValue V = CanZExtLowHalf()) {
12700 V = DAG.getBitcast(MVT::v2i64, V);
12701 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12702 return DAG.getBitcast(VT, V);
12703 }
12704
12705 // No viable ext lowering found.
12706 return SDValue();
12707}
12708
12709/// Try to get a scalar value for a specific element of a vector.
12710///
12711/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12712 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12713 SelectionDAG &DAG) {
12714 MVT VT = V.getSimpleValueType();
12715 MVT EltVT = VT.getVectorElementType();
12716 V = peekThroughBitcasts(V);
12717
12718 // If the bitcasts shift the element size, we can't extract an equivalent
12719 // element from it.
12720 MVT NewVT = V.getSimpleValueType();
12721 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12722 return SDValue();
12723
12724 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12725 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12726 // Ensure the scalar operand is the same size as the destination.
12727 // FIXME: Add support for scalar truncation where possible.
12728 SDValue S = V.getOperand(Idx);
12729 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12730 return DAG.getBitcast(EltVT, S);
12731 }
12732
12733 return SDValue();
12734}
12735
12736/// Helper to test for a load that can be folded with x86 shuffles.
12737///
12738/// This is particularly important because the set of instructions varies
12739/// significantly based on whether the operand is a load or not.
12740 static bool isShuffleFoldableLoad(SDValue V) {
12741 return V.hasOneUse() &&
12742 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12743 }
12744
12745template<typename T>
12746static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12747 T EltVT = VT.getScalarType();
12748 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12749 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12750}
12751
12752/// Try to lower insertion of a single element into a zero vector.
12753///
12754 /// This is a common pattern for which we have especially efficient patterns
12755 /// to lower across all subtarget feature sets.
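///
/// For example (illustrative): the v4i32 mask [ 4, zz, zz, zz ] inserts the
/// low element of V2 into an otherwise zero vector and will typically lower
/// to a single VZEXT_MOVL of V2.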
12756 static SDValue lowerShuffleAsElementInsertion(
12757 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12758 const APInt &Zeroable, const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 MVT ExtVT = VT;
12761 MVT EltVT = VT.getVectorElementType();
12762 unsigned NumElts = VT.getVectorNumElements();
12763 unsigned EltBits = VT.getScalarSizeInBits();
12764
12765 if (isSoftF16(EltVT, Subtarget))
12766 return SDValue();
12767
12768 int V2Index =
12769 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12770 Mask.begin();
12771 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12772 bool IsV1Zeroable = true;
12773 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12774 if (i != V2Index && !Zeroable[i]) {
12775 IsV1Zeroable = false;
12776 break;
12777 }
12778
12779 // Bail if a non-zero V1 isn't used in place.
12780 if (!IsV1Zeroable) {
12781 SmallVector<int, 8> V1Mask(Mask);
12782 V1Mask[V2Index] = -1;
12783 if (!isNoopShuffleMask(V1Mask))
12784 return SDValue();
12785 }
12786
12787 // Check for a single input from a SCALAR_TO_VECTOR node.
12788 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12789 // all the smarts here sunk into that routine. However, the current
12790 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12791 // vector shuffle lowering is dead.
12792 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12793 DAG);
12794 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12795 // We need to zext the scalar if it is smaller than an i32.
12796 V2S = DAG.getBitcast(EltVT, V2S);
12797 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12798 // Using zext to expand a narrow element won't work for non-zero
12799 // insertions. But we can use a masked constant vector if we're
12800 // inserting V2 into the bottom of V1.
12801 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12802 return SDValue();
12803
12804 // Zero-extend directly to i32.
12805 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12806 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12807
12808 // If we're inserting into a constant, mask off the inserted index
12809 // and OR with the zero-extended scalar.
12810 if (!IsV1Zeroable) {
12811 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12812 Bits[V2Index] = APInt::getZero(EltBits);
12813 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12814 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12815 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12816 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12817 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12818 }
12819 }
12820 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12821 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12822 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12823 // Either not inserting from the low element of the input or the input
12824 // element size is too small to use VZEXT_MOVL to clear the high bits.
12825 return SDValue();
12826 }
12827
12828 if (!IsV1Zeroable) {
12829 // If V1 can't be treated as a zero vector we have fewer options to lower
12830 // this. We can't support integer vectors or non-zero targets cheaply.
12831 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12832 if (!VT.isFloatingPoint() || V2Index != 0)
12833 return SDValue();
12834 if (!VT.is128BitVector())
12835 return SDValue();
12836
12837 // Otherwise, use MOVSD, MOVSS or MOVSH.
12838 unsigned MovOpc = 0;
12839 if (EltVT == MVT::f16)
12840 MovOpc = X86ISD::MOVSH;
12841 else if (EltVT == MVT::f32)
12842 MovOpc = X86ISD::MOVSS;
12843 else if (EltVT == MVT::f64)
12844 MovOpc = X86ISD::MOVSD;
12845 else
12846 llvm_unreachable("Unsupported floating point element type to handle!");
12847 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12848 }
12849
12850 // This lowering only works for the low element with floating point vectors.
12851 if (VT.isFloatingPoint() && V2Index != 0)
12852 return SDValue();
12853
12854 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12855 if (ExtVT != VT)
12856 V2 = DAG.getBitcast(VT, V2);
12857
12858 if (V2Index != 0) {
12859 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12860 // the desired position. Otherwise it is more efficient to do a vector
12861 // shift left. We know that we can do a vector shift left because all
12862 // the inputs are zero.
12863 if (VT.isFloatingPoint() || NumElts <= 4) {
12864 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12865 V2Shuffle[V2Index] = 0;
12866 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12867 } else {
12868 V2 = DAG.getBitcast(MVT::v16i8, V2);
12869 V2 = DAG.getNode(
12870 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12871 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12872 V2 = DAG.getBitcast(VT, V2);
12873 }
12874 }
12875 return V2;
12876}
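// Example (illustrative): for a v4f32 shuffle with Mask = {4, 1, 2, 3} where
// elements 1-3 are zeroable and V2's element 0 comes from a scalar, the code
// above produces VZEXT_MOVL(SCALAR_TO_VECTOR(s)), i.e. <s, 0, 0, 0>. If V1 is
// not zeroable, the same mask on a 128-bit FP type is lowered with a single
// MOVSS/MOVSD/MOVSH that merges V2's low element into V1.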
12877
12878/// Try to lower broadcast of a single (truncated) integer element,
12879/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12880///
12881/// This assumes we have AVX2.
12882static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12883 int BroadcastIdx,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(Subtarget.hasAVX2() &&
12887 "We can only lower integer broadcasts with AVX2!");
12888
12889 MVT EltVT = VT.getVectorElementType();
12890 MVT V0VT = V0.getSimpleValueType();
12891
12892 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12893 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12894
12895 MVT V0EltVT = V0VT.getVectorElementType();
12896 if (!V0EltVT.isInteger())
12897 return SDValue();
12898
12899 const unsigned EltSize = EltVT.getSizeInBits();
12900 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12901
12902 // This is only a truncation if the original element type is larger.
12903 if (V0EltSize <= EltSize)
12904 return SDValue();
12905
12906 assert(((V0EltSize % EltSize) == 0) &&
12907 "Scalar type sizes must all be powers of 2 on x86!");
12908
12909 const unsigned V0Opc = V0.getOpcode();
12910 const unsigned Scale = V0EltSize / EltSize;
12911 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12912
12913 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12914 V0Opc != ISD::BUILD_VECTOR)
12915 return SDValue();
12916
12917 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12918
12919 // If we're extracting non-least-significant bits, shift so we can truncate.
12920 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12921 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12922 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12923 if (const int OffsetIdx = BroadcastIdx % Scale)
12924 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12925 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12926
12927 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12928 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12929}
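// Example (illustrative): broadcasting element 5 of a v16i8 shuffle whose
// source is a v4i32 BUILD_VECTOR gives Scale = 32/8 = 4, V0BroadcastIdx = 1
// and OffsetIdx = 1, so the chosen i32 scalar is shifted right by 8, truncated
// to i8 and fed to VBROADCAST.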
12930
12931/// Test whether this can be lowered with a single SHUFPS instruction.
12932///
12933/// This is used to disable more specialized lowerings when the shufps lowering
12934/// will happen to be efficient.
12935static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12936 // This routine only handles 128-bit shufps.
12937 assert(Mask.size() == 4 && "Unsupported mask size!");
12938 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12939 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12940 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12941 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12942
12943 // To lower with a single SHUFPS we need to have the low half and high half
12944 // each requiring a single input.
12945 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12946 return false;
12947 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12948 return false;
12949
12950 return true;
12951}
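// Example (illustrative): isSingleSHUFPSMask({0, 1, 4, 5}) is true because the
// low half reads only V1 and the high half only V2, matching SHUFPS's operand
// form; isSingleSHUFPSMask({0, 5, 2, 7}) is false because each half mixes both
// inputs.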
12952
12953/// Test whether the specified input (0 or 1) is in-place blended by the
12954/// given mask.
12955///
12956/// This returns true if the elements from a particular input are already in the
12957/// slots required by the given mask and require no permutation.
12958static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12963 return false;
12964
12965 return true;
12966}
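// Example (illustrative): Mask = {0, 5, 2, 7} leaves both inputs in place (a
// pure blend), so this returns true for Input 0 and Input 1, while
// Mask = {1, 5, 2, 7} returns false for Input 0 because V1's element 1 would
// have to move into slot 0.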
12967
12968/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12969/// the given mask.
12970///
12971static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12972 int BroadcastableElement = 0) {
12973 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12974 int Size = Mask.size();
12975 for (int i = 0; i < Size; ++i)
12976 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12977 Mask[i] % Size != BroadcastableElement)
12978 return false;
12979 return true;
12980}
12981
12982/// If we are extracting two 128-bit halves of a vector and shuffling the
12983/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12984/// multi-shuffle lowering.
12985static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12986 SDValue N1, ArrayRef<int> Mask,
12987 SelectionDAG &DAG) {
12988 MVT VT = N0.getSimpleValueType();
12989 assert((VT.is128BitVector() &&
12990 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12991 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12992
12993 // Check that both sources are extracts of the same source vector.
12994 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12995 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12996 N0.getOperand(0) != N1.getOperand(0) ||
12997 !N0.hasOneUse() || !N1.hasOneUse())
12998 return SDValue();
12999
13000 SDValue WideVec = N0.getOperand(0);
13001 MVT WideVT = WideVec.getSimpleValueType();
13002 if (!WideVT.is256BitVector())
13003 return SDValue();
13004
13005 // Match extracts of each half of the wide source vector. Commute the shuffle
13006 // if the extract of the low half is N1.
13007 unsigned NumElts = VT.getVectorNumElements();
13008 SmallVector<int, 4> NewMask(Mask);
13009 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13010 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13011 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13012 ShuffleVectorSDNode::commuteMask(NewMask);
13013 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13014 return SDValue();
13015
13016 // Final bailout: if the mask is simple, we are better off using an extract
13017 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13018 // because that avoids a constant load from memory.
13019 if (NumElts == 4 &&
13020 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13021 return SDValue();
13022
13023 // Extend the shuffle mask with undef elements.
13024 NewMask.append(NumElts, -1);
13025
13026 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13027 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13028 NewMask);
13029 // This is free: ymm -> xmm.
13030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13031 DAG.getVectorIdxConstant(0, DL));
13032}
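// Example (illustrative): for N0 = extract_subvector(v8f32 X, 0),
// N1 = extract_subvector(X, 4) and Mask = {0, 6, 2, 7} (neither a single
// SHUFPS nor an unpack pattern), this forms one wide shuffle of X with mask
// {0, 6, 2, 7, -1, -1, -1, -1} and returns its low 128 bits, instead of two
// extracts plus a two-input narrow shuffle.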
13033
13034/// Try to lower broadcast of a single element.
13035///
13036/// For convenience, this code also bundles all of the subtarget feature set
13037/// filtering. While a little annoying to re-dispatch on type here, there isn't
13038/// a convenient way to factor it out.
13039static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13040 SDValue V2, ArrayRef<int> Mask,
13041 const X86Subtarget &Subtarget,
13042 SelectionDAG &DAG) {
13043 MVT EltVT = VT.getVectorElementType();
13044 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13045 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13046 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13047 return SDValue();
13048
13049 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13050 // we can only broadcast from a register with AVX2.
13051 unsigned NumEltBits = VT.getScalarSizeInBits();
13052 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13053 ? X86ISD::MOVDDUP
13054 : X86ISD::VBROADCAST;
13055 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13056
13057 // Check that the mask is a broadcast.
13058 int BroadcastIdx = getSplatIndex(Mask);
13059 if (BroadcastIdx < 0) {
13060 // Check for hidden broadcast.
13061 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13062 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13063 return SDValue();
13064 BroadcastIdx = 0;
13065 }
13066 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13067 "a sorted mask where the broadcast "
13068 "comes from V1.");
13069 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13070
13071 // Go up the chain of (vector) values to find a scalar load that we can
13072 // combine with the broadcast.
13073 // TODO: Combine this logic with findEltLoadSrc() used by
13074 // EltsFromConsecutiveLoads().
13075 int BitOffset = BroadcastIdx * NumEltBits;
13076 SDValue V = V1;
13077 for (;;) {
13078 switch (V.getOpcode()) {
13079 case ISD::BITCAST: {
13080 V = V.getOperand(0);
13081 continue;
13082 }
13083 case ISD::CONCAT_VECTORS: {
13084 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13085 int OpIdx = BitOffset / OpBitWidth;
13086 V = V.getOperand(OpIdx);
13087 BitOffset %= OpBitWidth;
13088 continue;
13089 }
13090 case ISD::EXTRACT_SUBVECTOR: {
13091 // The extraction index adds to the existing offset.
13092 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13093 unsigned Idx = V.getConstantOperandVal(1);
13094 unsigned BeginOffset = Idx * EltBitWidth;
13095 BitOffset += BeginOffset;
13096 V = V.getOperand(0);
13097 continue;
13098 }
13099 case ISD::INSERT_SUBVECTOR: {
13100 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13101 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13102 int Idx = (int)V.getConstantOperandVal(2);
13103 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13104 int BeginOffset = Idx * EltBitWidth;
13105 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13106 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13107 BitOffset -= BeginOffset;
13108 V = VInner;
13109 } else {
13110 V = VOuter;
13111 }
13112 continue;
13113 }
13114 }
13115 break;
13116 }
13117 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13118 BroadcastIdx = BitOffset / NumEltBits;
13119
13120 // Do we need to bitcast the source to retrieve the original broadcast index?
13121 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13122
13123 // Check if this is a broadcast of a scalar. We special case lowering
13124 // for scalars so that we can more effectively fold with loads.
13125 // If the original value has a larger element type than the shuffle, the
13126 // broadcast element is in essence truncated. Make that explicit to ease
13127 // folding.
13128 if (BitCastSrc && VT.isInteger())
13129 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13130 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13131 return TruncBroadcast;
13132
13133 // Also check the simpler case, where we can directly reuse the scalar.
13134 if (!BitCastSrc &&
13135 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13136 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13137 V = V.getOperand(BroadcastIdx);
13138
13139 // If we can't broadcast from a register, check that the input is a load.
13140 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13141 return SDValue();
13142 } else if (ISD::isNormalLoad(V.getNode()) &&
13143 cast<LoadSDNode>(V)->isSimple()) {
13144 // We do not check for one-use of the vector load because a broadcast load
13145 // is expected to be a win for code size, register pressure, and possibly
13146 // uops even if the original vector load is not eliminated.
13147
13148 // Reduce the vector load and shuffle to a broadcasted scalar load.
13149 auto *Ld = cast<LoadSDNode>(V);
13150 SDValue BaseAddr = Ld->getBasePtr();
13151 MVT SVT = VT.getScalarType();
13152 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13153 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13154 SDValue NewAddr =
13155 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13156
13157 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13158 // than MOVDDUP.
13159 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13160 if (Opcode == X86ISD::VBROADCAST) {
13161 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13162 SDValue Ops[] = {Ld->getChain(), NewAddr};
13163 V = DAG.getMemIntrinsicNode(
13164 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13165 DAG.getMachineFunction().getMachineMemOperand(
13166 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13167 DAG.makeEquivalentMemoryOrdering(Ld, V);
13168 return DAG.getBitcast(VT, V);
13169 }
13170 assert(SVT == MVT::f64 && "Unexpected VT!");
13171 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13172 DAG.getMachineFunction().getMachineMemOperand(
13173 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13174 DAG.makeEquivalentMemoryOrdering(Ld, V);
13175 } else if (!BroadcastFromReg) {
13176 // We can't broadcast from a vector register.
13177 return SDValue();
13178 } else if (BitOffset != 0) {
13179 // We can only broadcast from the zero-element of a vector register,
13180 // but it can be advantageous to broadcast from the zero-element of a
13181 // subvector.
13182 if (!VT.is256BitVector() && !VT.is512BitVector())
13183 return SDValue();
13184
13185 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13186 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13187 return SDValue();
13188
13189 // If we are broadcasting an element from the lowest 128-bit subvector, try
13190 // to move the element in position.
13191 if (BitOffset < 128 && NumActiveElts > 1 &&
13192 V.getScalarValueSizeInBits() == NumEltBits) {
13193 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13194 "Unexpected bit-offset");
13195 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13196 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13197 V = extractSubVector(V, 0, DAG, DL, 128);
13198 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13199 } else {
13200 // Only broadcast the zero-element of a 128-bit subvector.
13201 if ((BitOffset % 128) != 0)
13202 return SDValue();
13203
13204 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13205 "Unexpected bit-offset");
13206 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13207 "Unexpected vector size");
13208 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13209 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13210 }
13211 }
13212
13213 // On AVX we can use VBROADCAST directly for scalar sources.
13214 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13215 V = DAG.getBitcast(MVT::f64, V);
13216 if (Subtarget.hasAVX()) {
13217 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13218 return DAG.getBitcast(VT, V);
13219 }
13220 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13221 }
13222
13223 // If this is a scalar, do the broadcast on this type and bitcast.
13224 if (!V.getValueType().isVector()) {
13225 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13226 "Unexpected scalar size");
13227 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13228 VT.getSizeInBits() / NumEltBits);
13229 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13230 }
13231
13232 // We only support broadcasting from 128-bit vectors to minimize the
13233 // number of patterns we need to deal with in isel. So extract down to
13234 // 128-bits, removing as many bitcasts as possible.
13235 if (V.getValueSizeInBits() > 128)
13236 V = extract128BitVector(V, 0, DAG, DL);
13237
13238 // Otherwise cast V to a vector with the same element type as VT, but
13239 // possibly narrower than VT. Then perform the broadcast.
13240 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13241 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13242 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13243}
13244
13245// Check for whether we can use INSERTPS to perform the shuffle. We only use
13246// INSERTPS when the V1 elements are already in the correct locations
13247// because otherwise we can just always use two SHUFPS instructions which
13248// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13249// perform INSERTPS if a single V1 element is out of place and all V2
13250// elements are zeroable.
13251static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13252 unsigned &InsertPSMask,
13253 const APInt &Zeroable,
13254 ArrayRef<int> Mask, SelectionDAG &DAG) {
13255 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13257 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13258
13259 // Attempt to match INSERTPS with one element from VA or VB being
13260 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13261 // are updated.
13262 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13263 ArrayRef<int> CandidateMask) {
13264 unsigned ZMask = 0;
13265 int VADstIndex = -1;
13266 int VBDstIndex = -1;
13267 bool VAUsedInPlace = false;
13268
13269 for (int i = 0; i < 4; ++i) {
13270 // Synthesize a zero mask from the zeroable elements (includes undefs).
13271 if (Zeroable[i]) {
13272 ZMask |= 1 << i;
13273 continue;
13274 }
13275
13276 // Flag if we use any VA inputs in place.
13277 if (i == CandidateMask[i]) {
13278 VAUsedInPlace = true;
13279 continue;
13280 }
13281
13282 // We can only insert a single non-zeroable element.
13283 if (VADstIndex >= 0 || VBDstIndex >= 0)
13284 return false;
13285
13286 if (CandidateMask[i] < 4) {
13287 // VA input out of place for insertion.
13288 VADstIndex = i;
13289 } else {
13290 // VB input for insertion.
13291 VBDstIndex = i;
13292 }
13293 }
13294
13295 // Don't bother if we have no (non-zeroable) element for insertion.
13296 if (VADstIndex < 0 && VBDstIndex < 0)
13297 return false;
13298
13299 // Determine element insertion src/dst indices. The src index is from the
13300 // start of the inserted vector, not the start of the concatenated vector.
13301 unsigned VBSrcIndex = 0;
13302 if (VADstIndex >= 0) {
13303 // If we have a VA input out of place, we use VA as the V2 element
13304 // insertion and don't use the original V2 at all.
13305 VBSrcIndex = CandidateMask[VADstIndex];
13306 VBDstIndex = VADstIndex;
13307 VB = VA;
13308 } else {
13309 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13310 }
13311
13312 // If no V1 inputs are used in place, then the result is created only from
13313 // the zero mask and the V2 insertion - so remove V1 dependency.
13314 if (!VAUsedInPlace)
13315 VA = DAG.getUNDEF(MVT::v4f32);
13316
13317 // Update V1, V2 and InsertPSMask accordingly.
13318 V1 = VA;
13319 V2 = VB;
13320
13321 // Insert the V2 element into the desired position.
13322 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13323 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13324 return true;
13325 };
13326
13327 if (matchAsInsertPS(V1, V2, Mask))
13328 return true;
13329
13330 // Commute and try again.
13331 SmallVector<int, 4> CommutedMask(Mask);
13332 ShuffleVectorSDNode::commuteMask(CommutedMask);
13333 if (matchAsInsertPS(V2, V1, CommutedMask))
13334 return true;
13335
13336 return false;
13337}
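// Example (illustrative): the INSERTPS immediate packs the V2 source lane in
// bits [7:6], the destination lane in bits [5:4] and the zero mask in bits
// [3:0]. For Mask = {4, 1, 2, 3} with nothing zeroable this yields 0x00
// (insert V2[0] into lane 0); for Mask = {0, 5, -1, 3} with element 2 zeroable
// it yields (1 << 6) | (1 << 4) | 0x4 = 0x54.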
13338
13339static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13340 ArrayRef<int> Mask, const APInt &Zeroable,
13341 SelectionDAG &DAG) {
13342 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13344
13345 // Attempt to match the insertps pattern.
13346 unsigned InsertPSMask = 0;
13347 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13348 return SDValue();
13349
13350 // Insert the V2 element into the desired position.
13351 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13352 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13353}
13354
13355/// Handle lowering of 2-lane 64-bit floating point shuffles.
13356///
13357/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13358/// support for floating point shuffles but not integer shuffles. These
13359/// instructions will incur a domain crossing penalty on some chips though so
13360/// it is better to avoid lowering through this for integer vectors where
13361/// possible.
13362static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363 const APInt &Zeroable, SDValue V1, SDValue V2,
13364 const X86Subtarget &Subtarget,
13365 SelectionDAG &DAG) {
13366 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13368 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13369
13370 if (V2.isUndef()) {
13371 // Check for being able to broadcast a single element.
13372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13373 Mask, Subtarget, DAG))
13374 return Broadcast;
13375
13376 // Straight shuffle of a single input vector. Simulate this by using the
13377 // single input as both of the "inputs" to this instruction.
13378 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13379
13380 if (Subtarget.hasAVX()) {
13381 // If we have AVX, we can use VPERMILPS which will allow folding a load
13382 // into the shuffle.
13383 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13385 }
13386
13387 return DAG.getNode(
13388 X86ISD::SHUFP, DL, MVT::v2f64,
13389 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13391 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13392 }
13393 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13395 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13396 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13397
13398 if (Subtarget.hasAVX2())
13399 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13400 return Extract;
13401
13402 // When loading a scalar and then shuffling it into a vector we can often do
13403 // the insertion cheaply.
13404 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13405 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13406 return Insertion;
13407 // Try inverting the insertion since for v2 masks it is easy to do and we
13408 // can't reliably sort the mask one way or the other.
13409 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13410 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // Try to use one of the special instruction patterns to handle two common
13416 // blend patterns if a zero-blend above didn't work.
13417 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13418 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13419 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13420 // We can either use a special instruction to load over the low double or
13421 // to move just the low double.
13422 return DAG.getNode(
13423 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13424 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13425
13426 if (Subtarget.hasSSE41())
13427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13429 return Blend;
13430
13431 // Use dedicated unpack instructions for masks that match their pattern.
13432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13433 return V;
13434
13435 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13436 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13438}
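// Example (illustrative): assuming none of the earlier special cases match and
// the final SHUFPD above is reached with Mask = {1, 2} (result[0] = V1[1],
// result[1] = V2[0]), the immediate is
// (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1) = 0x1.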
13439
13440/// Handle lowering of 2-lane 64-bit integer shuffles.
13441///
13442/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13443/// the integer unit to minimize domain crossing penalties. However, for blends
13444/// it falls back to the floating point shuffle operation with appropriate bit
13445/// casting.
13446static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable, SDValue V1, SDValue V2,
13448 const X86Subtarget &Subtarget,
13449 SelectionDAG &DAG) {
13450 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13452 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13453
13454 if (V2.isUndef()) {
13455 // Check for being able to broadcast a single element.
13456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13457 Mask, Subtarget, DAG))
13458 return Broadcast;
13459
13460 // Straight shuffle of a single input vector. For everything from SSE2
13461 // onward this has a single fast instruction with no scary immediates.
13462 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13463 V1 = DAG.getBitcast(MVT::v4i32, V1);
13464 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13465 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13466 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13467 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13468 return DAG.getBitcast(
13469 MVT::v2i64,
13470 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13471 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13472 }
13473 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13475 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13476 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13477
13478 if (Subtarget.hasAVX2())
13479 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13480 return Extract;
13481
13482 // Try to use shift instructions.
13483 if (SDValue Shift =
13484 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13485 DAG, /*BitwiseOnly*/ false))
13486 return Shift;
13487
13488 // When loading a scalar and then shuffling it into a vector we can often do
13489 // the insertion cheaply.
13490 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13491 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13492 return Insertion;
13493 // Try inverting the insertion since for v2 masks it is easy to do and we
13494 // can't reliably sort the mask one way or the other.
13495 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499
13500 // We have different paths for blend lowering, but they all must use the
13501 // *exact* same predicate.
13502 bool IsBlendSupported = Subtarget.hasSSE41();
13503 if (IsBlendSupported)
13504 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13505 Zeroable, Subtarget, DAG))
13506 return Blend;
13507
13508 // Use dedicated unpack instructions for masks that match their pattern.
13509 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13510 return V;
13511
13512 // Try to use byte rotation instructions.
13513 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13514 if (Subtarget.hasSSSE3()) {
13515 if (Subtarget.hasVLX())
13516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13517 Zeroable, Subtarget, DAG))
13518 return Rotate;
13519
13520 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13521 Subtarget, DAG))
13522 return Rotate;
13523 }
13524
13525 // If we have direct support for blends, we should lower by decomposing into
13526 // a permute. That will be faster than the domain cross.
13527 if (IsBlendSupported)
13528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13529 Zeroable, Subtarget, DAG);
13530
13531 // We implement this with SHUFPD which is pretty lame because it will likely
13532 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13533 // However, all the alternatives are still more cycles and newer chips don't
13534 // have this problem. It would be really nice if x86 had better shuffles here.
13535 V1 = DAG.getBitcast(MVT::v2f64, V1);
13536 V2 = DAG.getBitcast(MVT::v2f64, V2);
13537 return DAG.getBitcast(MVT::v2i64,
13538 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13539}
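// Example (illustrative): a single-input v2i64 shuffle with Mask = {1, 0} is
// widened above to the v4i32 mask {2, 3, 0, 1}, which encodes as the PSHUFD
// immediate 0x4E (2 | 3 << 2 | 0 << 4 | 1 << 6).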
13540
13541/// Lower a vector shuffle using the SHUFPS instruction.
13542///
13543/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13544/// It makes no assumptions about whether this is the *best* lowering; it simply
13545/// uses it.
13546static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13547 ArrayRef<int> Mask, SDValue V1,
13548 SDValue V2, SelectionDAG &DAG) {
13549 SDValue LowV = V1, HighV = V2;
13550 SmallVector<int, 4> NewMask(Mask);
13551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13552
13553 if (NumV2Elements == 1) {
13554 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13555
13556 // Compute the index adjacent to V2Index and in the same half by toggling
13557 // the low bit.
13558 int V2AdjIndex = V2Index ^ 1;
13559
13560 if (Mask[V2AdjIndex] < 0) {
13561 // Handles all the cases where we have a single V2 element and an undef.
13562 // This will only ever happen in the high lanes because we commute the
13563 // vector otherwise.
13564 if (V2Index < 2)
13565 std::swap(LowV, HighV);
13566 NewMask[V2Index] -= 4;
13567 } else {
13568 // Handle the case where the V2 element ends up adjacent to a V1 element.
13569 // To make this work, blend them together as the first step.
13570 int V1Index = V2AdjIndex;
13571 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13572 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13573 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13574
13575 // Now proceed to reconstruct the final blend as we have the necessary
13576 // high or low half formed.
13577 if (V2Index < 2) {
13578 LowV = V2;
13579 HighV = V1;
13580 } else {
13581 HighV = V2;
13582 }
13583 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13584 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13585 }
13586 } else if (NumV2Elements == 2) {
13587 if (Mask[0] < 4 && Mask[1] < 4) {
13588 // Handle the easy case where we have V1 in the low lanes and V2 in the
13589 // high lanes.
13590 NewMask[2] -= 4;
13591 NewMask[3] -= 4;
13592 } else if (Mask[2] < 4 && Mask[3] < 4) {
13593 // We also handle the reversed case because this utility may get called
13594 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13595 // arrange things in the right direction.
13596 NewMask[0] -= 4;
13597 NewMask[1] -= 4;
13598 HighV = V1;
13599 LowV = V2;
13600 } else {
13601 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13602 // trying to place elements directly, just blend them and set up the final
13603 // shuffle to place them.
13604
13605 // The first two blend mask elements are for V1, the second two are for
13606 // V2.
13607 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13608 Mask[2] < 4 ? Mask[2] : Mask[3],
13609 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13610 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13611 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13612 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13613
13614 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13615 // a blend.
13616 LowV = HighV = V1;
13617 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13618 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13619 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13620 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13621 }
13622 } else if (NumV2Elements == 3) {
13623 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13624 // we can get here via other paths (e.g. repeated mask matching) where we
13625 // don't want to do another round of lowerVECTOR_SHUFFLE.
13626 ShuffleVectorSDNode::commuteMask(NewMask);
13627 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13628 }
13629 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13630 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13631}
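// Example (illustrative): if this routine is reached with Mask = {0, 1, 6, 7},
// the easy two-element case applies: NewMask becomes {0, 1, 2, 3} and a single
// SHUFPS with immediate 0xE4 takes lanes 0-1 from V1 and lanes 2-3 from V2.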
13632
13633/// Lower 4-lane 32-bit floating point shuffles.
13634///
13635/// Uses instructions exclusively from the floating point unit to minimize
13636/// domain crossing penalties, as these are sufficient to implement all v4f32
13637/// shuffles.
13638static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13639 const APInt &Zeroable, SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 if (Subtarget.hasSSE41())
13647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13648 Zeroable, Subtarget, DAG))
13649 return Blend;
13650
13651 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13652
13653 if (NumV2Elements == 0) {
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13656 Mask, Subtarget, DAG))
13657 return Broadcast;
13658
13659 // Use even/odd duplicate instructions for masks that match their pattern.
13660 if (Subtarget.hasSSE3()) {
13661 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13662 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13663 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13665 }
13666
13667 if (Subtarget.hasAVX()) {
13668 // If we have AVX, we can use VPERMILPS which will allow folding a load
13669 // into the shuffle.
13670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13671 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13672 }
13673
13674 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13675 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13676 if (!Subtarget.hasSSE2()) {
13677 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13678 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13679 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13680 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13681 }
13682
13683 // Otherwise, use a straight shuffle of a single input vector. We pass the
13684 // input vector to both operands to simulate this with a SHUFPS.
13685 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13687 }
13688
13689 if (Subtarget.hasSSE2())
13690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13691 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13692 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13693 return ZExt;
13694 }
13695
13696 if (Subtarget.hasAVX2())
13697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13698 return Extract;
13699
13700 // There are special ways we can lower some single-element blends. However, we
13701 // have custom ways we can lower more complex single-element blends below that
13702 // we defer to if both this and BLENDPS fail to match, so restrict this to
13703 // when the V2 input is targeting element 0 of the mask -- that is the fast
13704 // case here.
13705 if (NumV2Elements == 1 && Mask[0] >= 4)
13706 if (SDValue V = lowerShuffleAsElementInsertion(
13707 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13708 return V;
13709
13710 if (Subtarget.hasSSE41()) {
13711 // Use INSERTPS if we can complete the shuffle efficiently.
13712 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13713 return V;
13714
13715 if (!isSingleSHUFPSMask(Mask))
13716 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13717 V2, Mask, DAG))
13718 return BlendPerm;
13719 }
13720
13721 // Use low/high mov instructions. These are only valid in SSE1 because
13722 // otherwise they are widened to v2f64 and never get here.
13723 if (!Subtarget.hasSSE2()) {
13724 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13725 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13726 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13727 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13728 }
13729
13730 // Use dedicated unpack instructions for masks that match their pattern.
13731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13732 return V;
13733
13734 // Otherwise fall back to a SHUFPS lowering strategy.
13735 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13736}
13737
13738/// Lower 4-lane i32 vector shuffles.
13739///
13740/// We try to handle these with integer-domain shuffles where we can, but for
13741/// blends we use the floating point domain blend instructions.
13742static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13743 const APInt &Zeroable, SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749
13750 // Whenever we can lower this as a zext, that instruction is strictly faster
13751 // than any alternative. It also allows us to fold memory operands into the
13752 // shuffle in many cases.
13753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13754 Zeroable, Subtarget, DAG))
13755 return ZExt;
13756
13757 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13758
13759 // Try to use shift instructions if fast.
13760 if (Subtarget.preferLowerShuffleAsShift()) {
13761 if (SDValue Shift =
13762 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13763 Subtarget, DAG, /*BitwiseOnly*/ true))
13764 return Shift;
13765 if (NumV2Elements == 0)
13766 if (SDValue Rotate =
13767 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13768 return Rotate;
13769 }
13770
13771 if (NumV2Elements == 0) {
13772 // Try to use broadcast unless the mask only has one non-undef element.
13773 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13774 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13775 Mask, Subtarget, DAG))
13776 return Broadcast;
13777 }
13778
13779 // Straight shuffle of a single input vector. For everything from SSE2
13780 // onward this has a single fast instruction with no scary immediates.
13781 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13782 // but we aren't actually going to use the UNPCK instruction because doing
13783 // so prevents folding a load into this instruction or making a copy.
13784 const int UnpackLoMask[] = {0, 0, 1, 1};
13785 const int UnpackHiMask[] = {2, 2, 3, 3};
13786 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13787 Mask = UnpackLoMask;
13788 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13789 Mask = UnpackHiMask;
13790
13791 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13792 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13793 }
13794
13795 if (Subtarget.hasAVX2())
13796 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13797 return Extract;
13798
13799 // Try to use shift instructions.
13800 if (SDValue Shift =
13801 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13802 DAG, /*BitwiseOnly*/ false))
13803 return Shift;
13804
13805 // There are special ways we can lower some single-element blends.
13806 if (NumV2Elements == 1)
13807 if (SDValue V = lowerShuffleAsElementInsertion(
13808 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13809 return V;
13810
13811 // We have different paths for blend lowering, but they all must use the
13812 // *exact* same predicate.
13813 bool IsBlendSupported = Subtarget.hasSSE41();
13814 if (IsBlendSupported)
13815 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13816 Zeroable, Subtarget, DAG))
13817 return Blend;
13818
13819 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13820 Zeroable, Subtarget, DAG))
13821 return Masked;
13822
13823 // Use dedicated unpack instructions for masks that match their pattern.
13824 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13825 return V;
13826
13827 // Try to use byte rotation instructions.
13828 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13829 if (Subtarget.hasSSSE3()) {
13830 if (Subtarget.hasVLX())
13831 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13832 Zeroable, Subtarget, DAG))
13833 return Rotate;
13834
13835 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13836 Subtarget, DAG))
13837 return Rotate;
13838 }
13839
13840 // Assume that a single SHUFPS is faster than an alternative sequence of
13841 // multiple instructions (even if the CPU has a domain penalty).
13842 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13843 if (!isSingleSHUFPSMask(Mask)) {
13844 // If we have direct support for blends, we should lower by decomposing into
13845 // a permute. That will be faster than the domain cross.
13846 if (IsBlendSupported)
13847 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13848 Zeroable, Subtarget, DAG);
13849
13850 // Try to lower by permuting the inputs into an unpack instruction.
13851 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13852 Mask, Subtarget, DAG))
13853 return Unpack;
13854 }
13855
13856 // We implement this with SHUFPS because it can blend from two vectors.
13857 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13858 // up the inputs, bypassing domain shift penalties that we would incur if we
13859 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13860 // relevant.
13861 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13862 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13863 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13864 return DAG.getBitcast(MVT::v4i32, ShufPS);
13865}
13866
13867/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13868/// shuffle lowering, and the most complex part.
13869///
13870/// The lowering strategy is to try to form pairs of input lanes which are
13871/// targeted at the same half of the final vector, and then use a dword shuffle
13872/// to place them onto the right half, and finally unpack the paired lanes into
13873/// their final position.
13874///
13875/// The exact breakdown of how to form these dword pairs and align them on the
13876/// correct sides is really tricky. See the comments within the function for
13877/// more of the details.
13878///
13879/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13880/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13881/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13882/// vector, form the analogous 128-bit 8-element Mask.
13883static SDValue lowerV8I16GeneralSingleInputShuffle(
13884 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13885 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13886 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13887 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13888
13889 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13890 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13891 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13892
13893 // Attempt to directly match PSHUFLW or PSHUFHW.
13894 if (isUndefOrInRange(LoMask, 0, 4) &&
13895 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13896 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13897 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13898 }
13899 if (isUndefOrInRange(HiMask, 4, 8) &&
13900 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13901 for (int i = 0; i != 4; ++i)
13902 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13903 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13905 }
13906
13907 SmallVector<int, 4> LoInputs;
13908 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13909 array_pod_sort(LoInputs.begin(), LoInputs.end());
13910 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13911 SmallVector<int, 4> HiInputs;
13912 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13913 array_pod_sort(HiInputs.begin(), HiInputs.end());
13914 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13915 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13916 int NumHToL = LoInputs.size() - NumLToL;
13917 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13918 int NumHToH = HiInputs.size() - NumLToH;
13919 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13920 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13921 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13922 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13923
13924 // If we are shuffling values from one half, check how many different DWORD
13925 // pairs we need to create. If only 1 or 2 then we can perform this as a
13926 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13927 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13928 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13929 V = DAG.getNode(ShufWOp, DL, VT, V,
13930 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13931 V = DAG.getBitcast(PSHUFDVT, V);
13932 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13933 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13934 return DAG.getBitcast(VT, V);
13935 };
13936
13937 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13938 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13939 SmallVector<std::pair<int, int>, 4> DWordPairs;
13940 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13941
13942 // Collect the different DWORD pairs.
13943 for (int DWord = 0; DWord != 4; ++DWord) {
13944 int M0 = Mask[2 * DWord + 0];
13945 int M1 = Mask[2 * DWord + 1];
13946 M0 = (M0 >= 0 ? M0 % 4 : M0);
13947 M1 = (M1 >= 0 ? M1 % 4 : M1);
13948 if (M0 < 0 && M1 < 0)
13949 continue;
13950
13951 bool Match = false;
13952 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13953 auto &DWordPair = DWordPairs[j];
13954 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13955 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13956 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13957 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13958 PSHUFDMask[DWord] = DOffset + j;
13959 Match = true;
13960 break;
13961 }
13962 }
13963 if (!Match) {
13964 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13965 DWordPairs.push_back(std::make_pair(M0, M1));
13966 }
13967 }
13968
13969 if (DWordPairs.size() <= 2) {
13970 DWordPairs.resize(2, std::make_pair(-1, -1));
13971 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13972 DWordPairs[1].first, DWordPairs[1].second};
13973 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13974 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13975 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13976 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13977 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13978 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13979 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13980 }
13981 if ((NumHToL + NumHToH) == 0)
13982 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13983 if ((NumLToL + NumLToH) == 0)
13984 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13985 }
13986 }
13987
13988 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13989 // such inputs we can swap two of the dwords across the half mark and end up
13990 // with <=2 inputs to each half in each half. Once there, we can fall through
13991 // to the generic code below. For example:
13992 //
13993 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13994 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13995 //
13996 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13997 // and an existing 2-into-2 on the other half. In this case we may have to
13998 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13999 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14000 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14001 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14002 // half than the one we target for fixing) will be fixed when we re-enter this
14003 // path. We will also combine away any sequence of PSHUFD instructions that
14004 // result into a single instruction. Here is an example of the tricky case:
14005 //
14006 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14007 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14008 //
14009 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14010 //
14011 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14012 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14013 //
14014 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14015 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14016 //
14017 // The result is fine to be handled by the generic logic.
14018 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14019 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14020 int AOffset, int BOffset) {
14021 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14022 "Must call this with A having 3 or 1 inputs from the A half.");
14023 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14024 "Must call this with B having 1 or 3 inputs from the B half.");
14025 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14026 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14027
14028 bool ThreeAInputs = AToAInputs.size() == 3;
14029
14030 // Compute the index of dword with only one word among the three inputs in
14031 // a half by taking the sum of the half with three inputs and subtracting
14032 // the sum of the actual three inputs. The difference is the remaining
14033 // slot.
14034 int ADWord = 0, BDWord = 0;
14035 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14036 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14037 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14038 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14039 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14040 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14041 int TripleNonInputIdx =
14042 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14043 TripleDWord = TripleNonInputIdx / 2;
14044
14045 // We use xor with one to compute the adjacent DWord to whichever one the
14046 // OneInput is in.
14047 OneInputDWord = (OneInput / 2) ^ 1;
14048
14049 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14050 // and BToA inputs. If there is also such a problem with the BToB and AToB
14051 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14052 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14053 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14054 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14055 // Compute how many inputs will be flipped by swapping these DWords. We
14056 // need to balance this to ensure we don't form a 3-1 shuffle in the other
14057 // half.
14059 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14060 llvm::count(AToBInputs, 2 * ADWord + 1);
14061 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14062 llvm::count(BToBInputs, 2 * BDWord + 1);
14063 if ((NumFlippedAToBInputs == 1 &&
14064 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14065 (NumFlippedBToBInputs == 1 &&
14066 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14067 // We choose whether to fix the A half or B half based on whether that
14068 // half has zero flipped inputs. At zero, we may not be able to fix it
14069 // with that half. We also bias towards fixing the B half because that
14070 // will more commonly be the high half, and we have to bias one way.
14071 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14072 ArrayRef<int> Inputs) {
14073 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14074 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14075 // Determine whether the free index is in the flipped dword or the
14076 // unflipped dword based on where the pinned index is. We use this bit
14077 // in an xor to conditionally select the adjacent dword.
14078 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14079 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14080 if (IsFixIdxInput == IsFixFreeIdxInput)
14081 FixFreeIdx += 1;
14082 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14083 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14084 "We need to be changing the number of flipped inputs!");
14085 int PSHUFHalfMask[] = {0, 1, 2, 3};
14086 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14087 V = DAG.getNode(
14088 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14089 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14090 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14091
14092 for (int &M : Mask)
14093 if (M >= 0 && M == FixIdx)
14094 M = FixFreeIdx;
14095 else if (M >= 0 && M == FixFreeIdx)
14096 M = FixIdx;
14097 };
14098 if (NumFlippedBToBInputs != 0) {
14099 int BPinnedIdx =
14100 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14101 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14102 } else {
14103 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14104 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14105 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14106 }
14107 }
14108 }
14109
14110 int PSHUFDMask[] = {0, 1, 2, 3};
14111 PSHUFDMask[ADWord] = BDWord;
14112 PSHUFDMask[BDWord] = ADWord;
14113 V = DAG.getBitcast(
14114 VT,
14115 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14116 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14117
14118 // Adjust the mask to match the new locations of A and B.
14119 for (int &M : Mask)
14120 if (M >= 0 && M/2 == ADWord)
14121 M = 2 * BDWord + M % 2;
14122 else if (M >= 0 && M/2 == BDWord)
14123 M = 2 * ADWord + M % 2;
14124
14125 // Recurse back into this routine to re-compute state now that this isn't
14126 // a 3 and 1 problem.
14127 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14128 };
14129 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14130 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14131 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14132 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14133
14134 // At this point there are at most two inputs to the low and high halves from
14135 // each half. That means the inputs can always be grouped into dwords and
14136 // those dwords can then be moved to the correct half with a dword shuffle.
14137 // We use at most one low and one high word shuffle to collect these paired
14138 // inputs into dwords, and finally a dword shuffle to place them.
14139 int PSHUFLMask[4] = {-1, -1, -1, -1};
14140 int PSHUFHMask[4] = {-1, -1, -1, -1};
14141 int PSHUFDMask[4] = {-1, -1, -1, -1};
14142
14143 // First fix the masks for all the inputs that are staying in their
14144 // original halves. This will then dictate the targets of the cross-half
14145 // shuffles.
14146 auto fixInPlaceInputs =
14147 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14148 MutableArrayRef<int> SourceHalfMask,
14149 MutableArrayRef<int> HalfMask, int HalfOffset) {
14150 if (InPlaceInputs.empty())
14151 return;
14152 if (InPlaceInputs.size() == 1) {
14153 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14154 InPlaceInputs[0] - HalfOffset;
14155 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14156 return;
14157 }
14158 if (IncomingInputs.empty()) {
14159 // Just fix all of the in place inputs.
14160 for (int Input : InPlaceInputs) {
14161 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14162 PSHUFDMask[Input / 2] = Input / 2;
14163 }
14164 return;
14165 }
14166
14167 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14168 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14169 InPlaceInputs[0] - HalfOffset;
14170 // Put the second input next to the first so that they are packed into
14171 // a dword. We find the adjacent index by toggling the low bit.
14172 int AdjIndex = InPlaceInputs[0] ^ 1;
14173 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14174 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14175 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14176 };
14177 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14178 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14179
14180 // Now gather the cross-half inputs and place them into a free dword of
14181 // their target half.
14182 // FIXME: This operation could almost certainly be simplified dramatically to
14183 // look more like the 3-1 fixing operation.
14184 auto moveInputsToRightHalf = [&PSHUFDMask](
14185 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14186 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14187 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14188 int DestOffset) {
14189 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14190 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14191 };
14192 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14193 int Word) {
14194 int LowWord = Word & ~1;
14195 int HighWord = Word | 1;
14196 return isWordClobbered(SourceHalfMask, LowWord) ||
14197 isWordClobbered(SourceHalfMask, HighWord);
14198 };
14199
14200 if (IncomingInputs.empty())
14201 return;
14202
14203 if (ExistingInputs.empty()) {
14204 // Map any dwords with inputs from them into the right half.
14205 for (int Input : IncomingInputs) {
14206 // If the source half mask maps over the inputs, turn those into
14207 // swaps and use the swapped lane.
14208 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14209 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14210 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14211 Input - SourceOffset;
14212 // We have to swap the uses in our half mask in one sweep.
14213 for (int &M : HalfMask)
14214 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14215 M = Input;
14216 else if (M == Input)
14217 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14218 } else {
14219 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14220 Input - SourceOffset &&
14221 "Previous placement doesn't match!");
14222 }
14223 // Note that this correctly re-maps both when we do a swap and when
14224 // we observe the other side of the swap above. We rely on that to
14225 // avoid swapping the members of the input list directly.
14226 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14227 }
14228
14229 // Map the input's dword into the correct half.
14230 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14231 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14232 else
14233 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14234 Input / 2 &&
14235 "Previous placement doesn't match!");
14236 }
14237
14238 // And just directly shift any other-half mask elements to be same-half
14239 // as we will have mirrored the dword containing the element into the
14240 // same position within that half.
14241 for (int &M : HalfMask)
14242 if (M >= SourceOffset && M < SourceOffset + 4) {
14243 M = M - SourceOffset + DestOffset;
14244 assert(M >= 0 && "This should never wrap below zero!");
14245 }
14246 return;
14247 }
14248
14249 // Ensure we have the input in a viable dword of its current half. This
14250 // is particularly tricky because the original position may be clobbered
14251 // by inputs being moved and *staying* in that half.
14252 if (IncomingInputs.size() == 1) {
14253 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14254 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14255 SourceOffset;
14256 SourceHalfMask[InputFixed - SourceOffset] =
14257 IncomingInputs[0] - SourceOffset;
14258 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14259 IncomingInputs[0] = InputFixed;
14260 }
14261 } else if (IncomingInputs.size() == 2) {
14262 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14263 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14264 // We have two non-adjacent or clobbered inputs we need to extract from
14265 // the source half. To do this, we need to map them into some adjacent
14266 // dword slot in the source mask.
14267 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14268 IncomingInputs[1] - SourceOffset};
14269
14270 // If there is a free slot in the source half mask adjacent to one of
14271 // the inputs, place the other input in it. We use (Index XOR 1) to
14272 // compute an adjacent index.
14273 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14274 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14275 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14276 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14277 InputsFixed[1] = InputsFixed[0] ^ 1;
14278 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14279 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14280 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14281 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14282 InputsFixed[0] = InputsFixed[1] ^ 1;
14283 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14284 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14285 // The two inputs are in the same DWord but it is clobbered and the
14286 // adjacent DWord isn't used at all. Move both inputs to the free
14287 // slot.
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14289 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14290 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14291 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14292 } else {
14293 // The only way we hit this point is if there is no clobbering
14294 // (because there are no off-half inputs to this half) and there is no
14295 // free slot adjacent to one of the inputs. In this case, we have to
14296 // swap an input with a non-input.
14297 for (int i = 0; i < 4; ++i)
14298 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14299 "We can't handle any clobbers here!");
14300 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14301 "Cannot have adjacent inputs here!");
14302
14303 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14304 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14305
14306 // We also have to update the final source mask in this case because
14307 // it may need to undo the above swap.
14308 for (int &M : FinalSourceHalfMask)
14309 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14310 M = InputsFixed[1] + SourceOffset;
14311 else if (M == InputsFixed[1] + SourceOffset)
14312 M = (InputsFixed[0] ^ 1) + SourceOffset;
14313
14314 InputsFixed[1] = InputsFixed[0] ^ 1;
14315 }
14316
14317 // Point everything at the fixed inputs.
14318 for (int &M : HalfMask)
14319 if (M == IncomingInputs[0])
14320 M = InputsFixed[0] + SourceOffset;
14321 else if (M == IncomingInputs[1])
14322 M = InputsFixed[1] + SourceOffset;
14323
14324 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14325 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14326 }
14327 } else {
14328 llvm_unreachable("Unhandled input size!");
14329 }
14330
14331 // Now hoist the DWord down to the right half.
14332 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14333 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14334 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14335 for (int &M : HalfMask)
14336 for (int Input : IncomingInputs)
14337 if (M == Input)
14338 M = FreeDWord * 2 + Input % 2;
14339 };
14340 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14341 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14342 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14343 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14344
14345 // Now enact all the shuffles we've computed to move the inputs into their
14346 // target half.
14347 if (!isNoopShuffleMask(PSHUFLMask))
14348 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14349 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14350 if (!isNoopShuffleMask(PSHUFHMask))
14351 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14352 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14353 if (!isNoopShuffleMask(PSHUFDMask))
14354 V = DAG.getBitcast(
14355 VT,
14356 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14357 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14358
14359 // At this point, each half should contain all its inputs, and we can then
14360 // just shuffle them into their final position.
14361 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14362 "Failed to lift all the high half inputs to the low mask!");
14363 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14364 "Failed to lift all the low half inputs to the high mask!");
14365
14366 // Do a half shuffle for the low mask.
14367 if (!isNoopShuffleMask(LoMask))
14368 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14369 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14370
14371 // Do a half shuffle with the high mask after shifting its values down.
14372 for (int &M : HiMask)
14373 if (M >= 0)
14374 M -= 4;
14375 if (!isNoopShuffleMask(HiMask))
14376 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14377 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14378
14379 return V;
14380}
14381
14382/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14383/// blend if only one input is used.
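/// As a rough illustration: for a v8i16 shuffle with Mask = {0, 8, 1, 9, -1,
/// -1, 3, 11}, Size = 8 and Scale = 2, so word element M expands to bytes
/// 2*M and 2*M+1. V1's byte mask starts {0, 1, 0x80, 0x80, 2, 3, 0x80, 0x80}
/// and V2's starts {0x80, 0x80, 0, 1, 0x80, 0x80, ...}; each input is
/// PSHUFB'd with its mask and the two results are OR'd together, with 0x80
/// zeroing the bytes the other input supplies.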
14384static SDValue lowerShuffleAsBlendOfPSHUFBs(
14385 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14386 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14387 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14388 "Lane crossing shuffle masks not supported");
14389
14390 int NumBytes = VT.getSizeInBits() / 8;
14391 int Size = Mask.size();
14392 int Scale = NumBytes / Size;
14393
14394 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14396 V1InUse = false;
14397 V2InUse = false;
14398
14399 for (int i = 0; i < NumBytes; ++i) {
14400 int M = Mask[i / Scale];
14401 if (M < 0)
14402 continue;
14403
14404 const int ZeroMask = 0x80;
14405 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14406 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14407 if (Zeroable[i / Scale])
14408 V1Idx = V2Idx = ZeroMask;
14409
14410 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14411 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14412 V1InUse |= (ZeroMask != V1Idx);
14413 V2InUse |= (ZeroMask != V2Idx);
14414 }
14415
14416 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14417 if (V1InUse)
14418 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14419 DAG.getBuildVector(ShufVT, DL, V1Mask));
14420 if (V2InUse)
14421 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14422 DAG.getBuildVector(ShufVT, DL, V2Mask));
14423
14424 // If we need shuffled inputs from both, blend the two.
14425 SDValue V;
14426 if (V1InUse && V2InUse)
14427 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14428 else
14429 V = V1InUse ? V1 : V2;
14430
14431 // Cast the result back to the correct type.
14432 return DAG.getBitcast(VT, V);
14433}
14434
14435/// Generic lowering of 8-lane i16 shuffles.
14436///
14437/// This handles both single-input shuffles and combined shuffle/blends with
14438/// two inputs. The single input shuffles are immediately delegated to
14439/// a dedicated lowering routine.
14440///
14441/// The blends are lowered in one of three fundamental ways. If there are few
14442/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14443/// of the input is significantly cheaper when lowered as an interleaving of
14444/// the two inputs, try to interleave them. Otherwise, blend the low and high
14445/// halves of the inputs separately (making them have relatively few inputs)
14446/// and then concatenate them.
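/// For example, an interleaving mask such as {0, 8, 1, 9, 2, 10, 3, 11} is
/// normally matched directly to a single PUNPCKLWD of the two inputs by the
/// dedicated UNPCK path, while masks with scattered inputs fall through to
/// the half-blend strategy described above.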
14447static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14448 const APInt &Zeroable, SDValue V1, SDValue V2,
14449 const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14453 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14454
14455 // Whenever we can lower this as a zext, that instruction is strictly faster
14456 // than any alternative.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461 // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14467
14468 if (NumV2Inputs == 0) {
14469 // Try to use shift instructions.
14470 if (SDValue Shift =
14471 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14472 Subtarget, DAG, /*BitwiseOnly*/ false))
14473 return Shift;
14474
14475 // Check for being able to broadcast a single element.
14476 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14477 Mask, Subtarget, DAG))
14478 return Broadcast;
14479
14480 // Try to use bit rotation instructions.
14481 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14482 Subtarget, DAG))
14483 return Rotate;
14484
14485 // Use dedicated unpack instructions for masks that match their pattern.
14486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14487 return V;
14488
14489 // Use dedicated pack instructions for masks that match their pattern.
14490 if (SDValue V =
14491 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14492 return V;
14493
14494 // Try to use byte rotation instructions.
14495 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14496 Subtarget, DAG))
14497 return Rotate;
14498
14499 // Make a copy of the mask so it can be modified.
14500 SmallVector<int, 8> MutableMask(Mask);
14501 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14502 Subtarget, DAG);
14503 }
14504
14505 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14506 "All single-input shuffles should be canonicalized to be V1-input "
14507 "shuffles.");
14508
14509 // Try to use shift instructions.
14510 if (SDValue Shift =
14511 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14512 DAG, /*BitwiseOnly*/ false))
14513 return Shift;
14514
14515 // See if we can use SSE4A Extraction / Insertion.
14516 if (Subtarget.hasSSE4A())
14517 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14518 Zeroable, DAG))
14519 return V;
14520
14521 // There are special ways we can lower some single-element blends.
14522 if (NumV2Inputs == 1)
14523 if (SDValue V = lowerShuffleAsElementInsertion(
14524 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14525 return V;
14526
14527 // We have different paths for blend lowering, but they all must use the
14528 // *exact* same predicate.
14529 bool IsBlendSupported = Subtarget.hasSSE41();
14530 if (IsBlendSupported)
14531 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14532 Zeroable, Subtarget, DAG))
14533 return Blend;
14534
14535 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14536 Zeroable, Subtarget, DAG))
14537 return Masked;
14538
14539 // Use dedicated unpack instructions for masks that match their pattern.
14540 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14541 return V;
14542
14543 // Use dedicated pack instructions for masks that match their pattern.
14544 if (SDValue V =
14545 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14546 return V;
14547
14548 // Try to lower using a truncation.
14549 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14550 Subtarget, DAG))
14551 return V;
14552
14553 // Try to use byte rotation instructions.
14554 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14555 Subtarget, DAG))
14556 return Rotate;
14557
14558 if (SDValue BitBlend =
14559 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14560 return BitBlend;
14561
14562 // Try to use byte shift instructions to mask.
14563 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14564 Zeroable, Subtarget, DAG))
14565 return V;
14566
14567 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14568 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14569 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14570 !Subtarget.hasVLX()) {
14571 // Check if this is part of a 256-bit vector truncation.
14572 unsigned PackOpc = 0;
14573 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14574 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14575 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14576 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14577 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14578 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14579 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14580 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14581 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14582 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14583 PackOpc = X86ISD::PACKUS;
14584 } else if (Subtarget.hasSSE41()) {
14585 SmallVector<SDValue, 4> DWordClearOps(4,
14586 DAG.getConstant(0, DL, MVT::i32));
14587 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14588 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14589 SDValue DWordClearMask =
14590 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14591 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14592 DWordClearMask);
14593 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14594 DWordClearMask);
14595 PackOpc = X86ISD::PACKUS;
14596 } else if (!Subtarget.hasSSSE3()) {
14597 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14598 V1 = DAG.getBitcast(MVT::v4i32, V1);
14599 V2 = DAG.getBitcast(MVT::v4i32, V2);
14600 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14601 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14602 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14603 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14604 PackOpc = X86ISD::PACKSS;
14605 }
14606 if (PackOpc) {
14607 // Now pack things back together.
14608 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14609 if (NumEvenDrops == 2) {
14610 Result = DAG.getBitcast(MVT::v4i32, Result);
14611 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14612 }
14613 return Result;
14614 }
14615 }
14616
14617 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14618 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14619 if (NumOddDrops == 1) {
14620 bool HasSSE41 = Subtarget.hasSSE41();
14621 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14622 DAG.getBitcast(MVT::v4i32, V1),
14623 DAG.getTargetConstant(16, DL, MVT::i8));
14624 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14625 DAG.getBitcast(MVT::v4i32, V2),
14626 DAG.getTargetConstant(16, DL, MVT::i8));
14627 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14628 MVT::v8i16, V1, V2);
14629 }
14630
14631 // Try to lower by permuting the inputs into an unpack instruction.
14632 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14633 Mask, Subtarget, DAG))
14634 return Unpack;
14635
14636 // If we can't directly blend but can use PSHUFB, that will be better as it
14637 // can both shuffle and set up the inefficient blend.
14638 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14639 bool V1InUse, V2InUse;
14640 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14641 Zeroable, DAG, V1InUse, V2InUse);
14642 }
14643
14644 // We can always bit-blend if we have to, so the fallback strategy is to
14645 // decompose into single-input permutes and blends/unpacks.
14646 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14647 Zeroable, Subtarget, DAG);
14648}
14649
14650/// Lower 8-lane 16-bit floating point shuffles.
14651static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14652 const APInt &Zeroable, SDValue V1, SDValue V2,
14653 const X86Subtarget &Subtarget,
14654 SelectionDAG &DAG) {
14655 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14657 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14659
14660 if (Subtarget.hasFP16()) {
14661 if (NumV2Elements == 0) {
14662 // Check for being able to broadcast a single element.
14663 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14664 Mask, Subtarget, DAG))
14665 return Broadcast;
14666 }
14667 if (NumV2Elements == 1 && Mask[0] >= 8)
14668 if (SDValue V = lowerShuffleAsElementInsertion(
14669 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14670 return V;
14671 }
14672
14673 V1 = DAG.getBitcast(MVT::v8i16, V1);
14674 V2 = DAG.getBitcast(MVT::v8i16, V2);
14675 return DAG.getBitcast(MVT::v8f16,
14676 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14677}
14678
14679// Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14680// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14681// the active subvector is extracted.
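// Rough example: a binary v4i64 shuffle with mask {0, 5, 2, 7} on a non-VLX
// target widens both inputs to v8i64 and rebases the V2 indices (>= 4) into
// the concatenated 16-element index space, giving a VPERMV3 mask of
// {0, 9, 2, 11}, after which the low 256 bits of the result are extracted.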
14682static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14683 ArrayRef<int> OriginalMask, SDValue V1,
14684 SDValue V2, const X86Subtarget &Subtarget,
14685 SelectionDAG &DAG) {
14686 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14687 SmallVector<int, 32> Mask(OriginalMask);
14688 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14689 !isShuffleFoldableLoad(V2)) {
14690 ShuffleVectorSDNode::commuteMask(Mask);
14691 std::swap(V1, V2);
14692 }
14693
14694 MVT MaskVT = VT.changeTypeToInteger();
14695 SDValue MaskNode;
14696 MVT ShuffleVT = VT;
14697 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14698 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14699 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14700 ShuffleVT = V1.getSimpleValueType();
14701
14702 // Adjust mask to correct indices for the second input.
14703 int NumElts = VT.getVectorNumElements();
14704 unsigned Scale = 512 / VT.getSizeInBits();
14705 SmallVector<int, 32> AdjustedMask(Mask);
14706 for (int &M : AdjustedMask)
14707 if (NumElts <= M)
14708 M += (Scale - 1) * NumElts;
14709 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14710 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14711 } else {
14712 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14713 }
14714
14715 SDValue Result;
14716 if (V2.isUndef())
14717 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14718 else
14719 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14720
14721 if (VT != ShuffleVT)
14722 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14723
14724 return Result;
14725}
14726
14727/// Generic lowering of v16i8 shuffles.
14728///
14729/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14730/// detect any complexity reducing interleaving. If that doesn't help, it uses
14731/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14732/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14733/// back together.
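/// For example, an exact byte interleave such as {0, 16, 1, 17, ..., 7, 23}
/// is matched by the dedicated UNPCK path, while a binary compaction mask
/// like {0, 2, 4, ..., 28, 30} is typically lowered as a PACKUSWB of the two
/// inputs after masking every word down to its low byte.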
14734static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14735 const APInt &Zeroable, SDValue V1, SDValue V2,
14736 const X86Subtarget &Subtarget,
14737 SelectionDAG &DAG) {
14738 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14740 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14741
14742 // Try to use shift instructions.
14743 if (SDValue Shift =
14744 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14745 DAG, /*BitwiseOnly*/ false))
14746 return Shift;
14747
14748 // Try to use byte rotation instructions.
14749 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14750 Subtarget, DAG))
14751 return Rotate;
14752
14753 // Use dedicated pack instructions for masks that match their pattern.
14754 if (SDValue V =
14755 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14756 return V;
14757
14758 // Try to use a zext lowering.
14759 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14760 Zeroable, Subtarget, DAG))
14761 return ZExt;
14762
14763 // Try to lower using a truncation.
14764 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14765 Subtarget, DAG))
14766 return V;
14767
14768 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14769 Subtarget, DAG))
14770 return V;
14771
14772 // See if we can use SSE4A Extraction / Insertion.
14773 if (Subtarget.hasSSE4A())
14774 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14775 Zeroable, DAG))
14776 return V;
14777
14778 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14779
14780 // For single-input shuffles, there are some nicer lowering tricks we can use.
14781 if (NumV2Elements == 0) {
14782 // Check for being able to broadcast a single element.
14783 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14784 Mask, Subtarget, DAG))
14785 return Broadcast;
14786
14787 // Try to use bit rotation instructions.
14788 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14789 Subtarget, DAG))
14790 return Rotate;
14791
14792 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14793 return V;
14794
14795 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14796 // Notably, this handles splat and partial-splat shuffles more efficiently.
14797 // However, it only makes sense if the pre-duplication shuffle simplifies
14798 // things significantly. Currently, this means we need to be able to
14799 // express the pre-duplication shuffle as an i16 shuffle.
14800 //
14801 // FIXME: We should check for other patterns which can be widened into an
14802 // i16 shuffle as well.
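// E.g. a pair-duplicating mask such as {3, 3, 7, 7, 1, 1, 5, 5, ...} passes
// the check below, since each even/odd byte pair reads a single source byte
// and can therefore be expressed as an i16 shuffle once the bytes have been
// duplicated by an unpack.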
14803 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14804 for (int i = 0; i < 16; i += 2)
14805 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14806 return false;
14807
14808 return true;
14809 };
14810 auto tryToWidenViaDuplication = [&]() -> SDValue {
14811 if (!canWidenViaDuplication(Mask))
14812 return SDValue();
14813 SmallVector<int, 4> LoInputs;
14814 copy_if(Mask, std::back_inserter(LoInputs),
14815 [](int M) { return M >= 0 && M < 8; });
14816 array_pod_sort(LoInputs.begin(), LoInputs.end());
14817 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14818 SmallVector<int, 4> HiInputs;
14819 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14820 array_pod_sort(HiInputs.begin(), HiInputs.end());
14821 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14822
14823 bool TargetLo = LoInputs.size() >= HiInputs.size();
14824 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14825 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14826
14827 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14828 SmallDenseMap<int, int, 8> LaneMap;
14829 for (int I : InPlaceInputs) {
14830 PreDupI16Shuffle[I/2] = I/2;
14831 LaneMap[I] = I;
14832 }
14833 int j = TargetLo ? 0 : 4, je = j + 4;
14834 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14835 // Check if j is already a shuffle of this input. This happens when
14836 // there are two adjacent bytes after we move the low one.
14837 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14838 // If we haven't yet mapped the input, search for a slot into which
14839 // we can map it.
14840 while (j < je && PreDupI16Shuffle[j] >= 0)
14841 ++j;
14842
14843 if (j == je)
14844 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14845 return SDValue();
14846
14847 // Map this input with the i16 shuffle.
14848 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14849 }
14850
14851 // Update the lane map based on the mapping we ended up with.
14852 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14853 }
14854 V1 = DAG.getBitcast(
14855 MVT::v16i8,
14856 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14857 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14858
14859 // Unpack the bytes to form the i16s that will be shuffled into place.
14860 bool EvenInUse = false, OddInUse = false;
14861 for (int i = 0; i < 16; i += 2) {
14862 EvenInUse |= (Mask[i + 0] >= 0);
14863 OddInUse |= (Mask[i + 1] >= 0);
14864 if (EvenInUse && OddInUse)
14865 break;
14866 }
14867 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14868 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14869 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14870
14871 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14872 for (int i = 0; i < 16; ++i)
14873 if (Mask[i] >= 0) {
14874 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14875 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14876 if (PostDupI16Shuffle[i / 2] < 0)
14877 PostDupI16Shuffle[i / 2] = MappedMask;
14878 else
14879 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14880 "Conflicting entries in the original shuffle!");
14881 }
14882 return DAG.getBitcast(
14883 MVT::v16i8,
14884 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14885 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14886 };
14887 if (SDValue V = tryToWidenViaDuplication())
14888 return V;
14889 }
14890
14891 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14892 Zeroable, Subtarget, DAG))
14893 return Masked;
14894
14895 // Use dedicated unpack instructions for masks that match their pattern.
14896 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14897 return V;
14898
14899 // Try to use byte shift instructions to mask.
14900 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14901 Zeroable, Subtarget, DAG))
14902 return V;
14903
14904 // Check for compaction patterns.
14905 bool IsSingleInput = V2.isUndef();
14906 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14907
14908 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14909 // with PSHUFB. It is important to do this before we attempt to generate any
14910 // blends but after all of the single-input lowerings. If the single input
14911 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14912 // want to preserve that and we can DAG combine any longer sequences into
14913 // a PSHUFB in the end. But once we start blending from multiple inputs,
14914 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14915 // and there are *very* few patterns that would actually be faster than the
14916 // PSHUFB approach because of its ability to zero lanes.
14917 //
14918 // If the mask is a binary compaction, we can more efficiently perform this
14919 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14920 //
14921 // FIXME: The only exceptions to the above are blends which are exact
14922 // interleavings with direct instructions supporting them. We currently don't
14923 // handle those well here.
14924 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14925 bool V1InUse = false;
14926 bool V2InUse = false;
14927
14928 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14929 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14930
14931 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14932 // do so. This avoids using them to handle blends-with-zero which is
14933 // important as a single pshufb is significantly faster for that.
14934 if (V1InUse && V2InUse) {
14935 if (Subtarget.hasSSE41())
14936 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14937 Zeroable, Subtarget, DAG))
14938 return Blend;
14939
14940 // We can use an unpack to do the blending rather than an or in some
14941 // cases. Even though the or may be (very slightly) more efficient, we
14942 // prefer this lowering because there are common cases where part of
14943 // the complexity of the shuffles goes away when we do the final blend as
14944 // an unpack.
14945 // FIXME: It might be worth trying to detect if the unpack-feeding
14946 // shuffles will both be pshufb, in which case we shouldn't bother with
14947 // this.
14948 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14949 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14950 return Unpack;
14951
14952 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14953 if (Subtarget.hasVBMI())
14954 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14955 DAG);
14956
14957 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14958 if (Subtarget.hasXOP()) {
14959 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14960 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14961 }
14962
14963 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14964 // PALIGNR will be cheaper than the second PSHUFB+OR.
14965 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14966 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14967 return V;
14968 }
14969
14970 return PSHUFB;
14971 }
14972
14973 // There are special ways we can lower some single-element blends.
14974 if (NumV2Elements == 1)
14975 if (SDValue V = lowerShuffleAsElementInsertion(
14976 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14977 return V;
14978
14979 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14980 return Blend;
14981
14982 // Check whether a compaction lowering can be done. This handles shuffles
14983 // which take every Nth element for some even N. See the helper function for
14984 // details.
14985 //
14986 // We special case these as they can be particularly efficiently handled with
14987 // the PACKUSWB instruction on x86 and they show up in common patterns of
14988 // rearranging bytes to truncate wide elements.
14989 if (NumEvenDrops) {
14990 // NumEvenDrops is the power of two stride of the elements. Another way of
14991 // thinking about it is that we need to drop the even elements this many
14992 // times to get the original input.
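// E.g. NumEvenDrops == 1 corresponds to a mask taking every 2nd byte and
// needs a single AND+PACKUS, while NumEvenDrops == 2 (every 4th byte)
// repeats the PACKUS once more on the packed result, as in the loop below.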
14993
14994 // First we need to zero all the dropped bytes.
14995 assert(NumEvenDrops <= 3 &&
14996 "No support for dropping even elements more than 3 times.");
14997 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14998 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14999 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15000 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15001 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15002 WordClearMask);
15003 if (!IsSingleInput)
15004 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15005 WordClearMask);
15006
15007 // Now pack things back together.
15008 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15009 IsSingleInput ? V1 : V2);
15010 for (int i = 1; i < NumEvenDrops; ++i) {
15011 Result = DAG.getBitcast(MVT::v8i16, Result);
15012 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15013 }
15014 return Result;
15015 }
15016
15017 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15018 if (NumOddDrops == 1) {
15019 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15020 DAG.getBitcast(MVT::v8i16, V1),
15021 DAG.getTargetConstant(8, DL, MVT::i8));
15022 if (!IsSingleInput)
15023 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15024 DAG.getBitcast(MVT::v8i16, V2),
15025 DAG.getTargetConstant(8, DL, MVT::i8));
15026 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15027 IsSingleInput ? V1 : V2);
15028 }
15029
15030 // Handle multi-input cases by blending/unpacking single-input shuffles.
15031 if (NumV2Elements > 0)
15032 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15033 Zeroable, Subtarget, DAG);
15034
15035 // The fallback path for single-input shuffles widens this into two v8i16
15036 // vectors with unpacks, shuffles those, and then pulls them back together
15037 // with a pack.
15038 SDValue V = V1;
15039
15040 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15042 for (int i = 0; i < 16; ++i)
15043 if (Mask[i] >= 0)
15044 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15045
15046 SDValue VLoHalf, VHiHalf;
15047 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15048 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15049 // i16s.
15050 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15051 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15052 // Use a mask to drop the high bytes.
15053 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15054 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15055 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15056
15057 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15058 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15059
15060 // Squash the masks to point directly into VLoHalf.
15061 for (int &M : LoBlendMask)
15062 if (M >= 0)
15063 M /= 2;
15064 for (int &M : HiBlendMask)
15065 if (M >= 0)
15066 M /= 2;
15067 } else {
15068 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15069 // VHiHalf so that we can blend them as i16s.
15070 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15071
15072 VLoHalf = DAG.getBitcast(
15073 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15074 VHiHalf = DAG.getBitcast(
15075 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15076 }
15077
15078 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15079 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15080
15081 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15082}
15083
15084/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15085///
15086/// This routine breaks down the specific type of 128-bit shuffle and
15087/// dispatches to the lowering routines accordingly.
15088static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15089 MVT VT, SDValue V1, SDValue V2,
15090 const APInt &Zeroable,
15091 const X86Subtarget &Subtarget,
15092 SelectionDAG &DAG) {
15093 if (VT == MVT::v8bf16) {
15094 V1 = DAG.getBitcast(MVT::v8i16, V1);
15095 V2 = DAG.getBitcast(MVT::v8i16, V2);
15096 return DAG.getBitcast(VT,
15097 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15098 }
15099
15100 switch (VT.SimpleTy) {
15101 case MVT::v2i64:
15102 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15103 case MVT::v2f64:
15104 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v4i32:
15106 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v4f32:
15108 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v8i16:
15110 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v8f16:
15112 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v16i8:
15114 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115
15116 default:
15117 llvm_unreachable("Unimplemented!");
15118 }
15119}
15120
15121/// Generic routine to split vector shuffle into half-sized shuffles.
15122///
15123/// This routine just extracts two subvectors, shuffles them independently, and
15124/// then concatenates them back together. This should work effectively with all
15125/// AVX vector shuffle types.
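/// For example, a v8f32 mask {0, 8, 1, 9, 6, 14, 7, 15} splits into a low
/// half {0, 8, 1, 9} built from LoV1/LoV2 and a high half {6, 14, 7, 15}
/// built from HiV1/HiV2, and the two v4f32 results are concatenated again.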
15126static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15127 SDValue V2, ArrayRef<int> Mask,
15128 SelectionDAG &DAG, bool SimpleOnly) {
15129 assert(VT.getSizeInBits() >= 256 &&
15130 "Only for 256-bit or wider vector shuffles!");
15131 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15132 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15133
15134 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15135 if (VT == MVT::v8f32) {
15136 SDValue BC1 = peekThroughBitcasts(V1);
15137 SDValue BC2 = peekThroughBitcasts(V2);
15138 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15139 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15140 DAG, SimpleOnly))
15141 return DAG.getBitcast(VT, Split);
15142 }
15143 }
15144
15145 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15146 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15147
15148 int NumElements = VT.getVectorNumElements();
15149 int SplitNumElements = NumElements / 2;
15150 MVT ScalarVT = VT.getVectorElementType();
15151 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15152
15153 // Use splitVector/extractSubVector so that split build-vectors just build two
15154 // narrower build vectors. This helps shuffling with splats and zeros.
15155 auto SplitVector = [&](SDValue V) {
15156 SDValue LoV, HiV;
15157 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15158 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15159 DAG.getBitcast(SplitVT, HiV));
15160 };
15161
15162 SDValue LoV1, HiV1, LoV2, HiV2;
15163 std::tie(LoV1, HiV1) = SplitVector(V1);
15164 std::tie(LoV2, HiV2) = SplitVector(V2);
15165
15166 // Now create two 4-way blends of these half-width vectors.
15167 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15168 bool &UseHiV1, bool &UseLoV2,
15169 bool &UseHiV2) {
15170 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15171 for (int i = 0; i < SplitNumElements; ++i) {
15172 int M = HalfMask[i];
15173 if (M >= NumElements) {
15174 if (M >= NumElements + SplitNumElements)
15175 UseHiV2 = true;
15176 else
15177 UseLoV2 = true;
15178 } else if (M >= 0) {
15179 if (M >= SplitNumElements)
15180 UseHiV1 = true;
15181 else
15182 UseLoV1 = true;
15183 }
15184 }
15185 };
15186
15187 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15188 if (!SimpleOnly)
15189 return true;
15190
15191 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15192 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15193
15194 return !(UseHiV1 || UseHiV2);
15195 };
15196
15197 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15198 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15200 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15201 for (int i = 0; i < SplitNumElements; ++i) {
15202 int M = HalfMask[i];
15203 if (M >= NumElements) {
15204 V2BlendMask[i] = M - NumElements;
15205 BlendMask[i] = SplitNumElements + i;
15206 } else if (M >= 0) {
15207 V1BlendMask[i] = M;
15208 BlendMask[i] = i;
15209 }
15210 }
15211
15212 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15213 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15214
15215 // Because the lowering happens after all combining takes place, we need to
15216 // manually combine these blend masks as much as possible so that we create
15217 // a minimal number of high-level vector shuffle nodes.
15218 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15219
15220 // First try just blending the halves of V1 or V2.
15221 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15222 return DAG.getUNDEF(SplitVT);
15223 if (!UseLoV2 && !UseHiV2)
15224 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15225 if (!UseLoV1 && !UseHiV1)
15226 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15227
15228 SDValue V1Blend, V2Blend;
15229 if (UseLoV1 && UseHiV1) {
15230 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15231 } else {
15232 // We only use half of V1 so map the usage down into the final blend mask.
15233 V1Blend = UseLoV1 ? LoV1 : HiV1;
15234 for (int i = 0; i < SplitNumElements; ++i)
15235 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15236 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15237 }
15238 if (UseLoV2 && UseHiV2) {
15239 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15240 } else {
15241 // We only use half of V2 so map the usage down into the final blend mask.
15242 V2Blend = UseLoV2 ? LoV2 : HiV2;
15243 for (int i = 0; i < SplitNumElements; ++i)
15244 if (BlendMask[i] >= SplitNumElements)
15245 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15246 }
15247 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15248 };
15249
15250 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15251 return SDValue();
15252
15253 SDValue Lo = HalfBlend(LoMask);
15254 SDValue Hi = HalfBlend(HiMask);
15255 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15256}
15257
15258/// Either split a vector in halves or decompose the shuffles and the
15259/// blend/unpack.
15260///
15261/// This is provided as a good fallback for many lowerings of non-single-input
15262/// shuffles with more than one 128-bit lane. In those cases, we want to select
15263/// between splitting the shuffle into 128-bit components and stitching those
15264/// back together vs. extracting the single-input shuffles and blending those
15265/// results.
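/// E.g. a v8i32 mask {0, 8, 0, 8, 0, 8, 0, 8} is treated below as a
/// broadcast of one element from each input followed by a blend, whereas a
/// mask whose inputs all come from a single 128-bit lane of each source is
/// split into two half-width shuffles instead.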
15266static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15267 SDValue V2, ArrayRef<int> Mask,
15268 const APInt &Zeroable,
15269 const X86Subtarget &Subtarget,
15270 SelectionDAG &DAG) {
15271 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15272 "shuffles as it could then recurse on itself.");
15273 int Size = Mask.size();
15274
15275 // If this can be modeled as a broadcast of two elements followed by a blend,
15276 // prefer that lowering. This is especially important because broadcasts can
15277 // often fold with memory operands.
15278 auto DoBothBroadcast = [&] {
15279 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15280 for (int M : Mask)
15281 if (M >= Size) {
15282 if (V2BroadcastIdx < 0)
15283 V2BroadcastIdx = M - Size;
15284 else if ((M - Size) != V2BroadcastIdx &&
15285 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15286 return false;
15287 } else if (M >= 0) {
15288 if (V1BroadcastIdx < 0)
15289 V1BroadcastIdx = M;
15290 else if (M != V1BroadcastIdx &&
15291 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15292 return false;
15293 }
15294 return true;
15295 };
15296 if (DoBothBroadcast())
15297 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15298 Subtarget, DAG);
15299
15300 // If the inputs all stem from a single 128-bit lane of each input, then we
15301 // split them rather than blending because the split will decompose to
15302 // unusually few instructions.
15303 int LaneCount = VT.getSizeInBits() / 128;
15304 int LaneSize = Size / LaneCount;
15305 SmallBitVector LaneInputs[2];
15306 LaneInputs[0].resize(LaneCount, false);
15307 LaneInputs[1].resize(LaneCount, false);
15308 for (int i = 0; i < Size; ++i)
15309 if (Mask[i] >= 0)
15310 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15311 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15312 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15313 /*SimpleOnly*/ false);
15314
15315 // Without AVX2, if we can freely split the subvectors then we're better off
15316 // performing half width shuffles.
15317 if (!Subtarget.hasAVX2()) {
15318 SDValue BC1 = peekThroughBitcasts(V1);
15319 SDValue BC2 = peekThroughBitcasts(V2);
15320 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15321 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15322 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15323 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15324 if (SplatOrSplitV1 && SplatOrSplitV2)
15325 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15326 /*SimpleOnly*/ false);
15327 }
15328
15329 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15330 // requires that the decomposed single-input shuffles don't end up here.
15331 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15332 Subtarget, DAG);
15333}
15334
15335// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15336// TODO: Extend to support v8f32 (+ 512-bit shuffles).
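// Rough example: a v4f64 mask {1, 5, 2, 6} yields LHSMask {-1, 1, 2, -1},
// RHSMask {-1, 5, 6, -1} and SHUFPDMask {1, 1, 0, 0}; each lane then takes
// its even element from the LHS lane permute and its odd element from the
// RHS one.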
15337static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15338 SDValue V1, SDValue V2,
15339 ArrayRef<int> Mask,
15340 SelectionDAG &DAG) {
15341 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15342
15343 int LHSMask[4] = {-1, -1, -1, -1};
15344 int RHSMask[4] = {-1, -1, -1, -1};
15345 int SHUFPDMask[4] = {-1, -1, -1, -1};
15346
15347 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15348 // perform the shuffle once the lanes have been shuffled in place.
15349 for (int i = 0; i != 4; ++i) {
15350 int M = Mask[i];
15351 if (M < 0)
15352 continue;
15353 int LaneBase = i & ~1;
15354 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15355 LaneMask[LaneBase + (M & 1)] = M;
15356 SHUFPDMask[i] = M & 1;
15357 }
15358
15359 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15360 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15361 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15362 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15363}
15364
15365/// Lower a vector shuffle crossing multiple 128-bit lanes as
15366/// a lane permutation followed by a per-lane permutation.
15367///
15368/// This is mainly for cases where we can have non-repeating permutes
15369/// in each lane.
15370///
15371/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15372/// we should investigate merging them.
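/// E.g. a full v8i32 reverse {7, 6, 5, 4, 3, 2, 1, 0} becomes a cross-lane
/// permute with mask {4, 5, 6, 7, 0, 1, 2, 3} (swapping the 128-bit lanes)
/// followed by an in-lane permute with mask {3, 2, 1, 0, 7, 6, 5, 4}.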
15373static SDValue lowerShuffleAsLanePermuteAndPermute(
15374 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15375 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15376 int NumElts = VT.getVectorNumElements();
15377 int NumLanes = VT.getSizeInBits() / 128;
15378 int NumEltsPerLane = NumElts / NumLanes;
15379 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15380
15381 /// Attempts to find a sublane permute with the given size
15382 /// that gets all elements into their target lanes.
15383 ///
15384 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15385 /// If unsuccessful, returns false and may overwrite InLaneMask.
15386 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15387 int NumSublanesPerLane = NumSublanes / NumLanes;
15388 int NumEltsPerSublane = NumElts / NumSublanes;
15389
15390 SmallVector<int, 16> CrossLaneMask;
15391 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15392 // CrossLaneMask but one entry == one sublane.
15393 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15394 APInt DemandedCrossLane = APInt::getZero(NumElts);
15395
15396 for (int i = 0; i != NumElts; ++i) {
15397 int M = Mask[i];
15398 if (M < 0)
15399 continue;
15400
15401 int SrcSublane = M / NumEltsPerSublane;
15402 int DstLane = i / NumEltsPerLane;
15403
15404 // We only need to get the elements into the right lane, not sublane.
15405 // So search all sublanes that make up the destination lane.
15406 bool Found = false;
15407 int DstSubStart = DstLane * NumSublanesPerLane;
15408 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15409 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15410 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15411 continue;
15412
15413 Found = true;
15414 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15415 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15416 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15417 DemandedCrossLane.setBit(InLaneMask[i]);
15418 break;
15419 }
15420 if (!Found)
15421 return SDValue();
15422 }
15423
15424 // Fill CrossLaneMask using CrossLaneMaskLarge.
15425 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15426
15427 if (!CanUseSublanes) {
15428 // If we're only shuffling a single lowest lane and the rest are identity
15429 // then don't bother.
15430 // TODO - isShuffleMaskInputInPlace could be extended to something like
15431 // this.
15432 int NumIdentityLanes = 0;
15433 bool OnlyShuffleLowestLane = true;
15434 for (int i = 0; i != NumLanes; ++i) {
15435 int LaneOffset = i * NumEltsPerLane;
15436 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15437 i * NumEltsPerLane))
15438 NumIdentityLanes++;
15439 else if (CrossLaneMask[LaneOffset] != 0)
15440 OnlyShuffleLowestLane = false;
15441 }
15442 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15443 return SDValue();
15444 }
15445
15446 // Simplify CrossLaneMask based on the actual demanded elements.
15447 if (V1.hasOneUse())
15448 for (int i = 0; i != NumElts; ++i)
15449 if (!DemandedCrossLane[i])
15450 CrossLaneMask[i] = SM_SentinelUndef;
15451
15452 // Avoid returning the same shuffle operation. For example,
15453 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15454 // undef:v16i16
15455 if (CrossLaneMask == Mask || InLaneMask == Mask)
15456 return SDValue();
15457
15458 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15459 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15460 InLaneMask);
15461 };
15462
15463 // First attempt a solution with full lanes.
15464 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15465 return V;
15466
15467 // The rest of the solutions use sublanes.
15468 if (!CanUseSublanes)
15469 return SDValue();
15470
15471 // Then attempt a solution with 64-bit sublanes (vpermq).
15472 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15473 return V;
15474
15475 // If that doesn't work and we have fast variable cross-lane shuffle,
15476 // attempt 32-bit sublanes (vpermd).
15477 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15478 return SDValue();
15479
15480 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15481}
15482
15483/// Helper to compute an in-lane shuffle mask for a complete shuffle mask.
15484static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15485 SmallVector<int> &InLaneMask) {
15486 int Size = Mask.size();
15487 InLaneMask.assign(Mask.begin(), Mask.end());
15488 for (int i = 0; i < Size; ++i) {
15489 int &M = InLaneMask[i];
15490 if (M < 0)
15491 continue;
15492 if (((M % Size) / LaneSize) != (i / LaneSize))
15493 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15494 }
15495}
15496
15497/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15498/// source with a lane permutation.
15499///
15500/// This lowering strategy results in four instructions in the worst case for a
15501/// single-input cross lane shuffle which is lower than any other fully general
15502/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15503/// shuffle pattern should be handled prior to trying this lowering.
15504 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15505 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15506 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15507 // FIXME: This should probably be generalized for 512-bit vectors as well.
15508 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15509 int Size = Mask.size();
15510 int LaneSize = Size / 2;
15511
15512 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15513 // Only do this if the elements aren't all from the lower lane,
15514 // otherwise we're (probably) better off doing a split.
15515 if (VT == MVT::v4f64 &&
15516 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15517 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15518
15519 // If there are only inputs from one 128-bit lane, splitting will in fact be
15520 // less expensive. The flags track whether the given lane contains an element
15521 // that crosses to another lane.
15522 bool AllLanes;
15523 if (!Subtarget.hasAVX2()) {
15524 bool LaneCrossing[2] = {false, false};
15525 for (int i = 0; i < Size; ++i)
15526 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15527 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15528 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15529 } else {
15530 bool LaneUsed[2] = {false, false};
15531 for (int i = 0; i < Size; ++i)
15532 if (Mask[i] >= 0)
15533 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15534 AllLanes = LaneUsed[0] && LaneUsed[1];
15535 }
15536
15537 // TODO - we could support shuffling V2 in the Flipped input.
15538 assert(V2.isUndef() &&
15539 "This last part of this routine only works on single input shuffles");
15540
15541 SmallVector<int> InLaneMask;
15542 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15543
15544 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15545 "In-lane shuffle mask expected");
15546
15547 // If we're not using both lanes in each lane and the inlane mask is not
15548 // repeating, then we're better off splitting.
15549 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15550 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15551 /*SimpleOnly*/ false);
15552
15553 // Flip the lanes, and shuffle the results which should now be in-lane.
15554 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15555 SDValue Flipped = DAG.getBitcast(PVT, V1);
15556 Flipped =
15557 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15558 Flipped = DAG.getBitcast(VT, Flipped);
15559 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15560}
15561
15562/// Handle lowering 2-lane 128-bit shuffles.
15563 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15564 SDValue V2, ArrayRef<int> Mask,
15565 const APInt &Zeroable,
15566 const X86Subtarget &Subtarget,
15567 SelectionDAG &DAG) {
15568 if (V2.isUndef()) {
15569 // Attempt to match VBROADCAST*128 subvector broadcast load.
15570 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15571 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15572 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15574 MVT MemVT = VT.getHalfNumVectorElementsVT();
15575 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15578 VT, MemVT, Ld, Ofs, DAG))
15579 return BcstLd;
15580 }
15581
15582 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15583 if (Subtarget.hasAVX2())
15584 return SDValue();
15585 }
15586
15587 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15588
15589 SmallVector<int, 4> WidenedMask;
15590 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15591 return SDValue();
15592
15593 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15594 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15595
15596 // Try to use an insert into a zero vector.
15597 if (WidenedMask[0] == 0 && IsHighZero) {
15598 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15599 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15600 DAG.getVectorIdxConstant(0, DL));
15601 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15602 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15603 DAG.getVectorIdxConstant(0, DL));
15604 }
15605
15606 // TODO: If minimizing size and one of the inputs is a zero vector and the
15607 // zero vector has only one use, we could use a VPERM2X128 to save the
15608 // instruction bytes needed to explicitly generate the zero vector.
15609
15610 // Blends are faster and handle all the non-lane-crossing cases.
15611 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15612 Subtarget, DAG))
15613 return Blend;
15614
15615 // If either input operand is a zero vector, use VPERM2X128 because its mask
15616 // allows us to replace the zero input with an implicit zero.
15617 if (!IsLowZero && !IsHighZero) {
15618 // Check for patterns which can be matched with a single insert of a 128-bit
15619 // subvector.
15620 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15621 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15622
15623 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15624 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15626 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15627 SDValue SubVec =
15628 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15629 DAG.getVectorIdxConstant(0, DL));
15630 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15631 DAG.getVectorIdxConstant(2, DL));
15632 }
15633 }
15634
15635 // Try to use SHUF128 if possible.
15636 if (Subtarget.hasVLX()) {
15637 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15638 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15639 ((WidenedMask[1] % 2) << 1);
15640 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15641 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15642 }
15643 }
15644 }
15645
15646 // Otherwise form a 128-bit permutation. After accounting for undefs,
15647 // convert the 64-bit shuffle mask selection values into 128-bit
15648 // selection bits by dividing the indexes by 2 and shifting into positions
15649 // defined by a vperm2*128 instruction's immediate control byte.
15650
15651 // The immediate permute control byte looks like this:
15652 // [1:0] - select 128 bits from sources for low half of destination
15653 // [2] - ignore
15654 // [3] - zero low half of destination
15655 // [5:4] - select 128 bits from sources for high half of destination
15656 // [6] - ignore
15657 // [7] - zero high half of destination
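//
// For example, widened mask <1, 3> (original v4f64 mask <2,3,6,7>) encodes as
// 0x31: the high 128 bits of V1 feed the low half and the high 128 bits of V2
// feed the high half. With IsHighZero set and WidenedMask[0] == 0 the byte is
// 0x80, keeping the low half of V1 and zeroing the high half.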
15658
15659 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15660 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15661
15662 unsigned PermMask = 0;
15663 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15664 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15665
15666 // Check the immediate mask and replace unused sources with undef.
15667 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15668 V1 = DAG.getUNDEF(VT);
15669 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15670 V2 = DAG.getUNDEF(VT);
15671
15672 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15673 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15674}
15675
15676/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15677/// shuffling each lane.
15678///
15679/// This attempts to create a repeated lane shuffle where each lane uses one
15680/// or two of the lanes of the inputs. The lanes of the input vectors are
15681/// shuffled in one or two independent shuffles to get the lanes into the
15682/// position needed by the final shuffle.
15683 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15684 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15685 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15686 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15687
15688 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15689 return SDValue();
15690
15691 int NumElts = Mask.size();
15692 int NumLanes = VT.getSizeInBits() / 128;
15693 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15694 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15695 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15696
15697 // First pass will try to fill in the RepeatMask from lanes that need two
15698 // sources.
15699 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15700 int Srcs[2] = {-1, -1};
15701 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15702 for (int i = 0; i != NumLaneElts; ++i) {
15703 int M = Mask[(Lane * NumLaneElts) + i];
15704 if (M < 0)
15705 continue;
15706 // Determine which of the possible input lanes (NumLanes from each source)
15707 // this element comes from. Assign that as one of the sources for this
15708 // lane. We can assign up to 2 sources for this lane. If we run out of
15709 // sources we can't do anything.
15710 int LaneSrc = M / NumLaneElts;
15711 int Src;
15712 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15713 Src = 0;
15714 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15715 Src = 1;
15716 else
15717 return SDValue();
15718
15719 Srcs[Src] = LaneSrc;
15720 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15721 }
15722
15723 // If this lane has two sources, see if it fits with the repeat mask so far.
15724 if (Srcs[1] < 0)
15725 continue;
15726
15727 LaneSrcs[Lane][0] = Srcs[0];
15728 LaneSrcs[Lane][1] = Srcs[1];
15729
15730 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15731 assert(M1.size() == M2.size() && "Unexpected mask size");
15732 for (int i = 0, e = M1.size(); i != e; ++i)
15733 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15734 return false;
15735 return true;
15736 };
15737
15738 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15739 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15740 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15741 int M = Mask[i];
15742 if (M < 0)
15743 continue;
15744 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15745 "Unexpected mask element");
15746 MergedMask[i] = M;
15747 }
15748 };
15749
15750 if (MatchMasks(InLaneMask, RepeatMask)) {
15751 // Merge this lane mask into the final repeat mask.
15752 MergeMasks(InLaneMask, RepeatMask);
15753 continue;
15754 }
15755
15756 // Didn't find a match. Swap the operands and try again.
15757 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15758 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15759
15760 if (MatchMasks(InLaneMask, RepeatMask)) {
15761 // Merge this lane mask into the final repeat mask.
15762 MergeMasks(InLaneMask, RepeatMask);
15763 continue;
15764 }
15765
15766 // Couldn't find a match with the operands in either order.
15767 return SDValue();
15768 }
15769
15770 // Now handle any lanes with only one source.
15771 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15772 // If this lane has already been processed, skip it.
15773 if (LaneSrcs[Lane][0] >= 0)
15774 continue;
15775
15776 for (int i = 0; i != NumLaneElts; ++i) {
15777 int M = Mask[(Lane * NumLaneElts) + i];
15778 if (M < 0)
15779 continue;
15780
15781 // If RepeatMask isn't defined yet we can define it ourselves.
15782 if (RepeatMask[i] < 0)
15783 RepeatMask[i] = M % NumLaneElts;
15784
15785 if (RepeatMask[i] < NumElts) {
15786 if (RepeatMask[i] != M % NumLaneElts)
15787 return SDValue();
15788 LaneSrcs[Lane][0] = M / NumLaneElts;
15789 } else {
15790 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15791 return SDValue();
15792 LaneSrcs[Lane][1] = M / NumLaneElts;
15793 }
15794 }
15795
15796 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15797 return SDValue();
15798 }
15799
15800 SmallVector<int, 16> NewMask(NumElts, -1);
15801 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15802 int Src = LaneSrcs[Lane][0];
15803 for (int i = 0; i != NumLaneElts; ++i) {
15804 int M = -1;
15805 if (Src >= 0)
15806 M = Src * NumLaneElts + i;
15807 NewMask[Lane * NumLaneElts + i] = M;
15808 }
15809 }
15810 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15811 // Ensure we didn't get back the shuffle we started with.
15812 // FIXME: This is a hack to make up for some splat handling code in
15813 // getVectorShuffle.
15814 if (isa<ShuffleVectorSDNode>(NewV1) &&
15815 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15816 return SDValue();
15817
15818 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15819 int Src = LaneSrcs[Lane][1];
15820 for (int i = 0; i != NumLaneElts; ++i) {
15821 int M = -1;
15822 if (Src >= 0)
15823 M = Src * NumLaneElts + i;
15824 NewMask[Lane * NumLaneElts + i] = M;
15825 }
15826 }
15827 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15828 // Ensure we didn't get back the shuffle we started with.
15829 // FIXME: This is a hack to make up for some splat handling code in
15830 // getVectorShuffle.
15831 if (isa<ShuffleVectorSDNode>(NewV2) &&
15832 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15833 return SDValue();
15834
15835 for (int i = 0; i != NumElts; ++i) {
15836 if (Mask[i] < 0) {
15837 NewMask[i] = -1;
15838 continue;
15839 }
15840 NewMask[i] = RepeatMask[i % NumLaneElts];
15841 if (NewMask[i] < 0)
15842 continue;
15843
15844 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15845 }
15846 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15847}
15848
15849/// If the input shuffle mask results in a vector that is undefined in all upper
15850/// or lower half elements and that mask accesses only 2 halves of the
15851/// shuffle's operands, return true. A mask of half the width with mask indexes
15852/// adjusted to access the extracted halves of the original shuffle operands is
15853/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15854/// lower half of each input operand is accessed.
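///
/// For example, the v8f32 mask <u,u,u,u,4,5,12,13> has an undef lower half;
/// its upper half reads the high half of V1 (elements 4,5) and the high half
/// of V2 (elements 12,13), so HalfIdx1 = 1, HalfIdx2 = 3 and
/// HalfMask = <0,1,4,5>.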
15855static bool
15856 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15857 int &HalfIdx1, int &HalfIdx2) {
15858 assert((Mask.size() == HalfMask.size() * 2) &&
15859 "Expected input mask to be twice as long as output");
15860
15861 // Exactly one half of the result must be undef to allow narrowing.
15862 bool UndefLower = isUndefLowerHalf(Mask);
15863 bool UndefUpper = isUndefUpperHalf(Mask);
15864 if (UndefLower == UndefUpper)
15865 return false;
15866
15867 unsigned HalfNumElts = HalfMask.size();
15868 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15869 HalfIdx1 = -1;
15870 HalfIdx2 = -1;
15871 for (unsigned i = 0; i != HalfNumElts; ++i) {
15872 int M = Mask[i + MaskIndexOffset];
15873 if (M < 0) {
15874 HalfMask[i] = M;
15875 continue;
15876 }
15877
15878 // Determine which of the 4 half vectors this element is from.
15879 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15880 int HalfIdx = M / HalfNumElts;
15881
15882 // Determine the element index into its half vector source.
15883 int HalfElt = M % HalfNumElts;
15884
15885 // We can shuffle with up to 2 half vectors, set the new 'half'
15886 // shuffle mask accordingly.
15887 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15888 HalfMask[i] = HalfElt;
15889 HalfIdx1 = HalfIdx;
15890 continue;
15891 }
15892 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15893 HalfMask[i] = HalfElt + HalfNumElts;
15894 HalfIdx2 = HalfIdx;
15895 continue;
15896 }
15897
15898 // Too many half vectors referenced.
15899 return false;
15900 }
15901
15902 return true;
15903}
15904
15905/// Given the output values from getHalfShuffleMask(), create a half width
15906/// shuffle of extracted vectors followed by an insert back to full width.
15907 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15908 ArrayRef<int> HalfMask, int HalfIdx1,
15909 int HalfIdx2, bool UndefLower,
15910 SelectionDAG &DAG, bool UseConcat = false) {
15911 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15912 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15913
15914 MVT VT = V1.getSimpleValueType();
15915 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15916 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15917
15918 auto getHalfVector = [&](int HalfIdx) {
15919 if (HalfIdx < 0)
15920 return DAG.getUNDEF(HalfVT);
15921 SDValue V = (HalfIdx < 2 ? V1 : V2);
15922 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15923 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15924 DAG.getVectorIdxConstant(HalfIdx, DL));
15925 };
15926
15927 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15928 SDValue Half1 = getHalfVector(HalfIdx1);
15929 SDValue Half2 = getHalfVector(HalfIdx2);
15930 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15931 if (UseConcat) {
15932 SDValue Op0 = V;
15933 SDValue Op1 = DAG.getUNDEF(HalfVT);
15934 if (UndefLower)
15935 std::swap(Op0, Op1);
15936 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15937 }
15938
15939 unsigned Offset = UndefLower ? HalfNumElts : 0;
15940 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15941 DAG.getVectorIdxConstant(Offset, DL));
15942}
15943
15944/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15945/// This allows for fast cases such as subvector extraction/insertion
15946/// or shuffling smaller vector types which can lower more efficiently.
15947 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15948 SDValue V2, ArrayRef<int> Mask,
15949 const X86Subtarget &Subtarget,
15950 SelectionDAG &DAG) {
15951 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15952 "Expected 256-bit or 512-bit vector");
15953
15954 bool UndefLower = isUndefLowerHalf(Mask);
15955 if (!UndefLower && !isUndefUpperHalf(Mask))
15956 return SDValue();
15957
15958 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15959 "Completely undef shuffle mask should have been simplified already");
15960
15961 // Upper half is undef and lower half is whole upper subvector.
15962 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15963 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15964 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15965 if (!UndefLower &&
15966 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15967 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15968 DAG.getVectorIdxConstant(HalfNumElts, DL));
15969 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15970 DAG.getVectorIdxConstant(0, DL));
15971 }
15972
15973 // Lower half is undef and upper half is whole lower subvector.
15974 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15975 if (UndefLower &&
15976 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15977 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15978 DAG.getVectorIdxConstant(0, DL));
15979 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15980 DAG.getVectorIdxConstant(HalfNumElts, DL));
15981 }
15982
15983 int HalfIdx1, HalfIdx2;
15984 SmallVector<int, 8> HalfMask(HalfNumElts);
15985 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15986 return SDValue();
15987
15988 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15989
15990 // Only shuffle the halves of the inputs when useful.
15991 unsigned NumLowerHalves =
15992 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15993 unsigned NumUpperHalves =
15994 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15995 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15996
15997 // Determine the larger pattern of undef/halves, then decide if it's worth
15998 // splitting the shuffle based on subtarget capabilities and types.
15999 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16000 if (!UndefLower) {
16001 // XXXXuuuu: no insert is needed.
16002 // Always extract lowers when setting lower - these are all free subreg ops.
16003 if (NumUpperHalves == 0)
16004 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16005 UndefLower, DAG);
16006
16007 if (NumUpperHalves == 1) {
16008 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16009 if (Subtarget.hasAVX2()) {
16010 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16011 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16012 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16013 (!isSingleSHUFPSMask(HalfMask) ||
16014 Subtarget.hasFastVariableCrossLaneShuffle()))
16015 return SDValue();
16016 // If this is a unary shuffle (assume that the 2nd operand is
16017 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16018 // are better off extracting the upper half of 1 operand and using a
16019 // narrow shuffle.
16020 if (EltWidth == 64 && V2.isUndef())
16021 return SDValue();
16022 // If this is a unary vXi8 shuffle with in-place halves, then perform as
16023 // full width pshufb, and then merge.
16024 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16025 return SDValue();
16026 }
16027 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16028 if (Subtarget.hasAVX512() && VT.is512BitVector())
16029 return SDValue();
16030 // Extract + narrow shuffle is better than the wide alternative.
16031 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16032 UndefLower, DAG);
16033 }
16034
16035 // Don't extract both uppers, instead shuffle and then extract.
16036 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16037 return SDValue();
16038 }
16039
16040 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16041 if (NumUpperHalves == 0) {
16042 // AVX2 has efficient 64-bit element cross-lane shuffles.
16043 // TODO: Refine to account for unary shuffle, splat, and other masks?
16044 if (Subtarget.hasAVX2() && EltWidth == 64)
16045 return SDValue();
16046 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16047 if (Subtarget.hasAVX512() && VT.is512BitVector())
16048 return SDValue();
16049 // Narrow shuffle + insert is better than the wide alternative.
16050 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16051 UndefLower, DAG);
16052 }
16053
16054 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16055 return SDValue();
16056}
16057
16058/// Handle case where shuffle sources are coming from the same 128-bit lane and
16059/// every lane can be represented as the same repeating mask - allowing us to
16060/// shuffle the sources with the repeating shuffle and then permute the result
16061/// to the destination lanes.
16062 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16063 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16064 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16065 int NumElts = VT.getVectorNumElements();
16066 int NumLanes = VT.getSizeInBits() / 128;
16067 int NumLaneElts = NumElts / NumLanes;
16068
16069 // On AVX2 we may be able to just shuffle the lowest elements and then
16070 // broadcast the result.
16071 if (Subtarget.hasAVX2()) {
16072 for (unsigned BroadcastSize : {16, 32, 64}) {
16073 if (BroadcastSize <= VT.getScalarSizeInBits())
16074 continue;
16075 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16076
16077 // Attempt to match a repeating pattern every NumBroadcastElts,
16078 // accounting for UNDEFs but only referencing the lowest 128-bit
16079 // lane of the inputs.
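// For example, the v8i32 mask <3,2,3,2,3,2,3,2> repeats every 2 elements and
// only touches the lowest lane: shuffle <3,2> into the low 64 bits first and
// then broadcast that pair across the whole vector.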
16080 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16081 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16082 for (int j = 0; j != NumBroadcastElts; ++j) {
16083 int M = Mask[i + j];
16084 if (M < 0)
16085 continue;
16086 int &R = RepeatMask[j];
16087 if (0 != ((M % NumElts) / NumLaneElts))
16088 return false;
16089 if (0 <= R && R != M)
16090 return false;
16091 R = M;
16092 }
16093 return true;
16094 };
16095
16096 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16097 if (!FindRepeatingBroadcastMask(RepeatMask))
16098 continue;
16099
16100 // Shuffle the (lowest) repeated elements in place for broadcast.
16101 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16102
16103 // Shuffle the actual broadcast.
16104 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16105 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16106 for (int j = 0; j != NumBroadcastElts; ++j)
16107 BroadcastMask[i + j] = j;
16108
16109 // Avoid returning the same shuffle operation. For example,
16110 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16111 if (BroadcastMask == Mask)
16112 return SDValue();
16113
16114 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16115 BroadcastMask);
16116 }
16117 }
16118
16119 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16120 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16121 return SDValue();
16122
16123 // Bail if we already have a repeated lane shuffle mask.
16124 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16125 return SDValue();
16126
16127 // Helper to look for a repeated mask in each split sublane, and check that
16128 // those sublanes can then be permuted into place.
16129 auto ShuffleSubLanes = [&](int SubLaneScale) {
16130 int NumSubLanes = NumLanes * SubLaneScale;
16131 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16132
16133 // Check that all the sources are coming from the same lane and see if we
16134 // can form a repeating shuffle mask (local to each sub-lane). At the same
16135 // time, determine the source sub-lane for each destination sub-lane.
16136 int TopSrcSubLane = -1;
16137 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16138 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16139 SubLaneScale,
16140 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16141
16142 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16143 // Extract the sub-lane mask, check that it all comes from the same lane
16144 // and normalize the mask entries to come from the first lane.
16145 int SrcLane = -1;
16146 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16147 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16148 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16149 if (M < 0)
16150 continue;
16151 int Lane = (M % NumElts) / NumLaneElts;
16152 if ((0 <= SrcLane) && (SrcLane != Lane))
16153 return SDValue();
16154 SrcLane = Lane;
16155 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16156 SubLaneMask[Elt] = LocalM;
16157 }
16158
16159 // Whole sub-lane is UNDEF.
16160 if (SrcLane < 0)
16161 continue;
16162
16163 // Attempt to match against the candidate repeated sub-lane masks.
16164 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16165 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16166 for (int i = 0; i != NumSubLaneElts; ++i) {
16167 if (M1[i] < 0 || M2[i] < 0)
16168 continue;
16169 if (M1[i] != M2[i])
16170 return false;
16171 }
16172 return true;
16173 };
16174
16175 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16176 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16177 continue;
16178
16179 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16180 for (int i = 0; i != NumSubLaneElts; ++i) {
16181 int M = SubLaneMask[i];
16182 if (M < 0)
16183 continue;
16184 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16185 "Unexpected mask element");
16186 RepeatedSubLaneMask[i] = M;
16187 }
16188
16189 // Track the top most source sub-lane - by setting the remaining to
16190 // UNDEF we can greatly simplify shuffle matching.
16191 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16192 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16193 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16194 break;
16195 }
16196
16197 // Bail if we failed to find a matching repeated sub-lane mask.
16198 if (Dst2SrcSubLanes[DstSubLane] < 0)
16199 return SDValue();
16200 }
16201 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16202 "Unexpected source lane");
16203
16204 // Create a repeating shuffle mask for the entire vector.
16205 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16206 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16207 int Lane = SubLane / SubLaneScale;
16208 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16209 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16210 int M = RepeatedSubLaneMask[Elt];
16211 if (M < 0)
16212 continue;
16213 int Idx = (SubLane * NumSubLaneElts) + Elt;
16214 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16215 }
16216 }
16217
16218 // Shuffle each source sub-lane to its destination.
16219 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16220 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16221 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16222 if (SrcSubLane < 0)
16223 continue;
16224 for (int j = 0; j != NumSubLaneElts; ++j)
16225 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16226 }
16227
16228 // Avoid returning the same shuffle operation.
16229 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16230 if (RepeatedMask == Mask || SubLaneMask == Mask)
16231 return SDValue();
16232
16233 SDValue RepeatedShuffle =
16234 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16235
16236 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16237 SubLaneMask);
16238 };
16239
16240 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16241 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16242 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16243 // Otherwise we can only permute whole 128-bit lanes.
16244 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16245 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16246 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16247 MinSubLaneScale = 2;
16248 MaxSubLaneScale =
16249 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16250 }
16251 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16252 MinSubLaneScale = MaxSubLaneScale = 4;
16253
16254 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16255 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16256 return Shuffle;
16257
16258 return SDValue();
16259}
16260
16261 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16262 bool &ForceV1Zero, bool &ForceV2Zero,
16263 unsigned &ShuffleImm, ArrayRef<int> Mask,
16264 const APInt &Zeroable) {
16265 int NumElts = VT.getVectorNumElements();
16266 assert(VT.getScalarSizeInBits() == 64 &&
16267 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16268 "Unexpected data type for VSHUFPD");
16269 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16270 "Illegal shuffle mask");
16271
16272 bool ZeroLane[2] = { true, true };
16273 for (int i = 0; i < NumElts; ++i)
16274 ZeroLane[i & 1] &= Zeroable[i];
16275
16276 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16277 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
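// For example, the v4f64 mask <0,5,2,7> matches directly: SHUFPDMask is
// <0,1,0,1>, giving immediate 0b1010 (0xA), i.e. SHUFPD V1, V2 picking
// V1[0], V2[1], V1[2], V2[3].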
16278 bool IsSHUFPD = true;
16279 bool IsCommutable = true;
16280 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16281 for (int i = 0; i < NumElts; ++i) {
16282 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16283 continue;
16284 if (Mask[i] < 0)
16285 return false;
16286 int Val = (i & 6) + NumElts * (i & 1);
16287 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16288 if (Mask[i] < Val || Mask[i] > Val + 1)
16289 IsSHUFPD = false;
16290 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16291 IsCommutable = false;
16292 SHUFPDMask[i] = Mask[i] % 2;
16293 }
16294
16295 if (!IsSHUFPD && !IsCommutable)
16296 return false;
16297
16298 if (!IsSHUFPD && IsCommutable)
16299 std::swap(V1, V2);
16300
16301 ForceV1Zero = ZeroLane[0];
16302 ForceV2Zero = ZeroLane[1];
16303 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16304 return true;
16305}
16306
16307 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16308 SDValue V2, ArrayRef<int> Mask,
16309 const APInt &Zeroable,
16310 const X86Subtarget &Subtarget,
16311 SelectionDAG &DAG) {
16312 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16313 "Unexpected data type for VSHUFPD");
16314
16315 unsigned Immediate = 0;
16316 bool ForceV1Zero = false, ForceV2Zero = false;
16317 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16318 Mask, Zeroable))
16319 return SDValue();
16320
16321 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16322 if (ForceV1Zero)
16323 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16324 if (ForceV2Zero)
16325 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16326
16327 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16328 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16329}
16330
16331 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements, followed
16332// by zeroable elements in the remaining 24 elements. Turn this into two
16333// vmovqb instructions shuffled together.
16334 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16335 SDValue V1, SDValue V2,
16336 ArrayRef<int> Mask,
16337 const APInt &Zeroable,
16338 SelectionDAG &DAG) {
16339 assert(VT == MVT::v32i8 && "Unexpected type!");
16340
16341 // The first 8 indices should be every 8th element.
16342 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16343 return SDValue();
16344
16345 // Remaining elements need to be zeroable.
16346 if (Zeroable.countl_one() < (Mask.size() - 8))
16347 return SDValue();
16348
16349 V1 = DAG.getBitcast(MVT::v4i64, V1);
16350 V2 = DAG.getBitcast(MVT::v4i64, V2);
16351
16352 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16353 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16354
16355 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16356 // the upper bits of the result using an unpckldq.
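// Illustration: each VTRUNC yields <a0 a1 a2 a3 0..0> / <b0 b1 b2 b3 0..0>,
// and the shuffle below interleaves their low dwords to give
// <a0 a1 a2 a3 b0 b1 b2 b3 0..0>, which is exactly the requested pattern.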
16357 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16358 { 0, 1, 2, 3, 16, 17, 18, 19,
16359 4, 5, 6, 7, 20, 21, 22, 23 });
16360 // Insert the unpckldq into a zero vector to widen to v32i8.
16361 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16362 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16363 DAG.getVectorIdxConstant(0, DL));
16364}
16365
16366// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16367// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16368// =>
16369// ul = unpckl v1, v2
16370// uh = unpckh v1, v2
16371// a = vperm ul, uh
16372// b = vperm ul, uh
16373//
16374// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16375// and permute. We cannot directly match v3 because it is split into two
16376// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16377// pair of 256-bit shuffles and makes sure the masks are consecutive.
16378//
16379// Once unpck and permute nodes are created, the permute corresponding to this
16380// shuffle is returned, while the other permute replaces the other half of the
16381// shuffle in the selection dag.
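//
// For v8i32, the two halves of interleave(v1, v2) are <0,8,1,9,2,10,3,11> and
// <4,12,5,13,6,14,7,15>. unpckl = <0,8,1,9,4,12,5,13> and
// unpckh = <2,10,3,11,6,14,7,15>; vperm2x128 with 0x20 (low lanes) rebuilds
// the first half and with 0x31 (high lanes) rebuilds the second half.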
16382 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16383 SDValue V1, SDValue V2,
16384 ArrayRef<int> Mask,
16385 SelectionDAG &DAG) {
16386 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16387 VT != MVT::v32i8)
16388 return SDValue();
16389 // <B0, B1, B0+1, B1+1, ..., >
16390 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16391 unsigned Begin1) {
16392 size_t Size = Mask.size();
16393 assert(Size % 2 == 0 && "Expected even mask size");
16394 for (unsigned I = 0; I < Size; I += 2) {
16395 if (Mask[I] != (int)(Begin0 + I / 2) ||
16396 Mask[I + 1] != (int)(Begin1 + I / 2))
16397 return false;
16398 }
16399 return true;
16400 };
16401 // Check which half this shuffle node is
16402 int NumElts = VT.getVectorNumElements();
16403 size_t FirstQtr = NumElts / 2;
16404 size_t ThirdQtr = NumElts + NumElts / 2;
16405 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16406 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16407 if (!IsFirstHalf && !IsSecondHalf)
16408 return SDValue();
16409
16410 // Find the intersection between shuffle users of V1 and V2.
16411 SmallVector<SDNode *, 2> Shuffles;
16412 for (SDNode *User : V1->users())
16413 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16414 User->getOperand(1) == V2)
16415 Shuffles.push_back(User);
16416 // Limit user size to two for now.
16417 if (Shuffles.size() != 2)
16418 return SDValue();
16419 // Find out which half of the 512-bit shuffle each smaller shuffle is
16420 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16421 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16422 SDNode *FirstHalf;
16423 SDNode *SecondHalf;
16424 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16425 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16426 FirstHalf = Shuffles[0];
16427 SecondHalf = Shuffles[1];
16428 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16429 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16430 FirstHalf = Shuffles[1];
16431 SecondHalf = Shuffles[0];
16432 } else {
16433 return SDValue();
16434 }
16435 // Lower into unpck and perm. Return the perm of this shuffle and replace
16436 // the other.
16437 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16438 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16439 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16440 DAG.getTargetConstant(0x20, DL, MVT::i8));
16441 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16442 DAG.getTargetConstant(0x31, DL, MVT::i8));
16443 if (IsFirstHalf) {
16444 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16445 return Perm1;
16446 }
16447 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16448 return Perm2;
16449}
16450
16451/// Handle lowering of 4-lane 64-bit floating point shuffles.
16452///
16453/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16454/// isn't available.
16455 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16456 const APInt &Zeroable, SDValue V1, SDValue V2,
16457 const X86Subtarget &Subtarget,
16458 SelectionDAG &DAG) {
16459 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16461 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16462
16463 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG))
16465 return V;
16466
16467 if (V2.isUndef()) {
16468 // Check for being able to broadcast a single element.
16469 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16470 Mask, Subtarget, DAG))
16471 return Broadcast;
16472
16473 // Use low duplicate instructions for masks that match their pattern.
16474 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16475 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16476
16477 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16478 // Non-half-crossing single input shuffles can be lowered with an
16479 // interleaved permutation.
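// For example, mask <1,0,3,2> (swap the doubles within each 128-bit lane)
// produces VPERMILPMask 0b0101.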
16480 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16481 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16482 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16483 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16484 }
16485
16486 // With AVX2 we have direct support for this permutation.
16487 if (Subtarget.hasAVX2())
16488 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16489 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16490
16491 // Try to create an in-lane repeating shuffle mask and then shuffle the
16492 // results into the target lanes.
16493 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16494 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16495 return V;
16496
16497 // Try to permute the lanes and then use a per-lane permute.
16498 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16499 Mask, DAG, Subtarget))
16500 return V;
16501
16502 // Otherwise, fall back.
16503 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16504 DAG, Subtarget);
16505 }
16506
16507 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16508 Zeroable, Subtarget, DAG))
16509 return Blend;
16510
16511 // Use dedicated unpack instructions for masks that match their pattern.
16512 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16513 return V;
16514
16515 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16516 Zeroable, Subtarget, DAG))
16517 return Op;
16518
16519 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16520 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16521 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16522 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16523
16524 // If we have lane crossing shuffles AND they don't all come from the lower
16525 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16526 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16527 // canonicalize to a blend of splat which isn't necessary for this combine.
16528 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16529 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16530 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16532 (!Subtarget.hasAVX2() ||
16533 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16534 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16535
16536 // If we have one input in place, then we can permute the other input and
16537 // blend the result.
16538 if (V1IsInPlace || V2IsInPlace)
16539 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16540 Zeroable, Subtarget, DAG);
16541
16542 // Try to create an in-lane repeating shuffle mask and then shuffle the
16543 // results into the target lanes.
16544 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16545 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16546 return V;
16547
16548 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16549 // shuffle. However, if we have AVX2 and either input is already in place,
16550 // we will be able to shuffle the other input even across lanes in a single
16551 // instruction, so skip this pattern.
16552 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16554 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16555 return V;
16556
16557 // If we have VLX support, we can use VEXPAND.
16558 if (Subtarget.hasVLX())
16559 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16560 Zeroable, Subtarget, DAG))
16561 return V;
16562
16563 // If we have AVX2 then we always want to lower with a blend because at v4 we
16564 // can fully permute the elements.
16565 if (Subtarget.hasAVX2())
16566 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16567 Zeroable, Subtarget, DAG);
16568
16569 // Otherwise fall back on generic lowering.
16570 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16571 Subtarget, DAG);
16572}
16573
16574/// Handle lowering of 4-lane 64-bit integer shuffles.
16575///
16576/// This routine is only called when we have AVX2 and thus a reasonable
16577 /// instruction set for v4i64 shuffling.
16578 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16579 const APInt &Zeroable, SDValue V1, SDValue V2,
16580 const X86Subtarget &Subtarget,
16581 SelectionDAG &DAG) {
16582 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16585 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16586
16587 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16588 Subtarget, DAG))
16589 return V;
16590
16591 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16592 Zeroable, Subtarget, DAG))
16593 return Blend;
16594
16595 // Check for being able to broadcast a single element.
16596 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16597 Subtarget, DAG))
16598 return Broadcast;
16599
16600 // Try to use shift instructions if fast.
16601 if (Subtarget.preferLowerShuffleAsShift())
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16604 Subtarget, DAG, /*BitwiseOnly*/ true))
16605 return Shift;
16606
16607 if (V2.isUndef()) {
16608 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16609 // can use lower latency instructions that will operate on both lanes.
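// For example, the v4i64 mask <1,0,3,2> repeats as <1,0> per lane, which
// narrows to the v8i32 PSHUFD mask <2,3,0,1>.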
16610 SmallVector<int, 2> RepeatedMask;
16611 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16612 SmallVector<int, 4> PSHUFDMask;
16613 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16614 return DAG.getBitcast(
16615 MVT::v4i64,
16616 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16617 DAG.getBitcast(MVT::v8i32, V1),
16618 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16619 }
16620
16621 // AVX2 provides a direct instruction for permuting a single input across
16622 // lanes.
16623 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16624 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16625 }
16626
16627 // Try to use shift instructions.
16628 if (SDValue Shift =
16629 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16630 DAG, /*BitwiseOnly*/ false))
16631 return Shift;
16632
16633 // If we have VLX support, we can use VALIGN or VEXPAND.
16634 if (Subtarget.hasVLX()) {
16635 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16636 Zeroable, Subtarget, DAG))
16637 return Rotate;
16638
16639 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16640 Zeroable, Subtarget, DAG))
16641 return V;
16642 }
16643
16644 // Try to use PALIGNR.
16645 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16646 Subtarget, DAG))
16647 return Rotate;
16648
16649 // Use dedicated unpack instructions for masks that match their pattern.
16650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16651 return V;
16652
16653 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16654 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16655
16656 // If we have one input in place, then we can permute the other input and
16657 // blend the result.
16658 if (V1IsInPlace || V2IsInPlace)
16659 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16660 Zeroable, Subtarget, DAG);
16661
16662 // Try to create an in-lane repeating shuffle mask and then shuffle the
16663 // results into the target lanes.
16664 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16665 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16666 return V;
16667
16668 // Try to lower to PERMQ(BLENDD(V1,V2)).
16669 if (SDValue V =
16670 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16671 return V;
16672
16673 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16674 // shuffle. However, if we have AVX2 and either input is already in place,
16675 // we will be able to shuffle the other input even across lanes in a single
16676 // instruction, so skip this pattern.
16677 if (!V1IsInPlace && !V2IsInPlace)
16678 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16679 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16680 return Result;
16681
16682 // Otherwise fall back on generic blend lowering.
16683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16684 Zeroable, Subtarget, DAG);
16685}
16686
16687/// Handle lowering of 8-lane 32-bit floating point shuffles.
16688///
16689/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16690/// isn't available.
16691 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16692 const APInt &Zeroable, SDValue V1, SDValue V2,
16693 const X86Subtarget &Subtarget,
16694 SelectionDAG &DAG) {
16695 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16697 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16698
16699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16700 Zeroable, Subtarget, DAG))
16701 return Blend;
16702
16703 // Check for being able to broadcast a single element.
16704 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16705 Subtarget, DAG))
16706 return Broadcast;
16707
16708 if (!Subtarget.hasAVX2()) {
16709 SmallVector<int> InLaneMask;
16710 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16711
16712 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16713 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16714 /*SimpleOnly*/ true))
16715 return R;
16716 }
16717 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16718 Zeroable, Subtarget, DAG))
16719 return DAG.getBitcast(MVT::v8f32, ZExt);
16720
16721 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16722 // options to efficiently lower the shuffle.
16723 SmallVector<int, 4> RepeatedMask;
16724 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16725 assert(RepeatedMask.size() == 4 &&
16726 "Repeated masks must be half the mask width!");
16727
16728 // Use even/odd duplicate instructions for masks that match their pattern.
16729 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16730 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16731 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16732 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16733
16734 if (V2.isUndef())
16735 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16736 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16737
16738 // Use dedicated unpack instructions for masks that match their pattern.
16739 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16740 return V;
16741
16742 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16743 // have already handled any direct blends.
16744 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16745 }
16746
16747 // Try to create an in-lane repeating shuffle mask and then shuffle the
16748 // results into the target lanes.
16749 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16750 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return V;
16752
16753 // If we have a single input shuffle with different shuffle patterns in the
16754 // two 128-bit lanes use the variable mask to VPERMILPS.
16755 if (V2.isUndef()) {
16756 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16757 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16758 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16759 }
16760 if (Subtarget.hasAVX2()) {
16761 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16762 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16763 }
16764 // Otherwise, fall back.
16765 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16766 DAG, Subtarget);
16767 }
16768
16769 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16770 // shuffle.
16771 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16772 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16773 return Result;
16774
16775 // If we have VLX support, we can use VEXPAND.
16776 if (Subtarget.hasVLX())
16777 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG))
16779 return V;
16780
16781 // Try to match an interleave of two v8f32s and lower them as unpck and
16782 // permutes using ymms. This needs to go before we try to split the vectors.
16783 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16784 if ((Subtarget.hasAVX2() ||
16787 !Subtarget.hasAVX512())
16788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16789 Mask, DAG))
16790 return V;
16791
16792 // For non-AVX512, if the mask is of 16-bit elements in-lane then try to split,
16793 // since after the split we get more efficient code using vpunpcklwd and
16794 // vpunpckhwd instrs than vblend.
16795 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16796 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16797 Subtarget, DAG);
16798
16799 // If we have AVX2 then we always want to lower with a blend because at v8 we
16800 // can fully permute the elements.
16801 if (Subtarget.hasAVX2())
16802 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG);
16804
16805 // Otherwise fall back on generic lowering.
16806 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16807 Subtarget, DAG);
16808}
16809
16810/// Handle lowering of 8-lane 32-bit integer shuffles.
16811///
16812/// This routine is only called when we have AVX2 and thus a reasonable
16813 /// instruction set for v8i32 shuffling.
16814 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16815 const APInt &Zeroable, SDValue V1, SDValue V2,
16816 const X86Subtarget &Subtarget,
16817 SelectionDAG &DAG) {
16818 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16820 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16821 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16822
16823 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16824
16825 // Whenever we can lower this as a zext, that instruction is strictly faster
16826 // than any alternative. It also allows us to fold memory operands into the
16827 // shuffle in many cases.
16828 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16829 Zeroable, Subtarget, DAG))
16830 return ZExt;
16831
16832 // Try to match an interleave of two v8i32s and lower them as unpck and
16833 // permutes using ymms. This needs to go before we try to split the vectors.
16834 if (!Subtarget.hasAVX512())
16835 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16836 Mask, DAG))
16837 return V;
16838
16839 // For non-AVX512, if the mask is of 16-bit elements in-lane then try to split,
16840 // since after the split we get more efficient code than vblend by using
16841 // vpunpcklwd and vpunpckhwd instrs.
16842 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16843 !Subtarget.hasAVX512())
16844 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16845 Subtarget, DAG);
16846
16847 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16848 Zeroable, Subtarget, DAG))
16849 return Blend;
16850
16851 // Check for being able to broadcast a single element.
16852 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16853 Subtarget, DAG))
16854 return Broadcast;
16855
16856 // Try to use shift instructions if fast.
16857 if (Subtarget.preferLowerShuffleAsShift()) {
16858 if (SDValue Shift =
16859 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG, /*BitwiseOnly*/ true))
16861 return Shift;
16862 if (NumV2Elements == 0)
16863 if (SDValue Rotate =
16864 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16865 return Rotate;
16866 }
16867
16868 // If the shuffle mask is repeated in each 128-bit lane we can use more
16869 // efficient instructions that mirror the shuffles across the two 128-bit
16870 // lanes.
16871 SmallVector<int, 4> RepeatedMask;
16872 bool Is128BitLaneRepeatedShuffle =
16873 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16874 if (Is128BitLaneRepeatedShuffle) {
16875 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16876 if (V2.isUndef())
16877 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16878 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16879
16880 // Use dedicated unpack instructions for masks that match their pattern.
16881 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16882 return V;
16883 }
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16892 if (SDValue Rotate =
16893 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16894 return Rotate;
16895
16896 // If we have VLX support, we can use VALIGN or EXPAND.
16897 if (Subtarget.hasVLX()) {
16898 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16899 Zeroable, Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16903 Zeroable, Subtarget, DAG))
16904 return V;
16905 }
16906
16907 // Try to use byte rotation instructions.
16908 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16909 Subtarget, DAG))
16910 return Rotate;
16911
16912 // Try to create an in-lane repeating shuffle mask and then shuffle the
16913 // results into the target lanes.
16914 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16915 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16916 return V;
16917
16918 if (V2.isUndef()) {
16919 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16920 // because that should be faster than the variable permute alternatives.
16921 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16922 return V;
16923
16924 // If the shuffle patterns aren't repeated but it's a single input, directly
16925 // generate a cross-lane VPERMD instruction.
16926 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16927 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16928 }
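  // For illustration: a cross-lane compaction such as <0,2,4,6,1,3,5,7> pulls
  // elements from both 128-bit lanes into each output lane and has no repeated
  // in-lane pattern, so it typically ends up in this variable-mask VPERMD path.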
16929
16930 // Assume that a single SHUFPS is faster than an alternative sequence of
16931 // multiple instructions (even if the CPU has a domain penalty).
16932 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16933 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16934 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16935 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16936 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16937 CastV1, CastV2, DAG);
16938 return DAG.getBitcast(MVT::v8i32, ShufPS);
16939 }
16940
16941 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16942 // shuffle.
16943 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16944 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16945 return Result;
16946
16947 // Otherwise fall back on generic blend lowering.
16948 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16949 Zeroable, Subtarget, DAG);
16950}
16951
16952/// Handle lowering of 16-lane 16-bit integer shuffles.
16953///
16954/// This routine is only called when we have AVX2 and thus a reasonable
16955/// instruction set for v16i16 shuffling.
16956static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16957 const APInt &Zeroable, SDValue V1, SDValue V2,
16958 const X86Subtarget &Subtarget,
16959 SelectionDAG &DAG) {
16960 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16962 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16963 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16964
16965 // Whenever we can lower this as a zext, that instruction is strictly faster
16966 // than any alternative. It also allows us to fold memory operands into the
16967 // shuffle in many cases.
16968 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16969 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16970 return ZExt;
16971
16972 // Check for being able to broadcast a single element.
16973 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16974 Subtarget, DAG))
16975 return Broadcast;
16976
16977 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16978 Zeroable, Subtarget, DAG))
16979 return Blend;
16980
16981 // Use dedicated unpack instructions for masks that match their pattern.
16982 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16983 return V;
16984
16985 // Use dedicated pack instructions for masks that match their pattern.
16986 if (SDValue V =
16987 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 // Try to lower using a truncation.
16991 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16992 Subtarget, DAG))
16993 return V;
16994
16995 // Try to use shift instructions.
16996 if (SDValue Shift =
16997 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16998 Subtarget, DAG, /*BitwiseOnly*/ false))
16999 return Shift;
17000
17001 // Try to use byte rotation instructions.
17002 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17003 Subtarget, DAG))
17004 return Rotate;
17005
17006 // Try to create an in-lane repeating shuffle mask and then shuffle the
17007 // results into the target lanes.
17008 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17009 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17010 return V;
17011
17012 if (V2.isUndef()) {
17013 // Try to use bit rotation instructions.
17014 if (SDValue Rotate =
17015 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17016 return Rotate;
17017
17018 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17019 // because that should be faster than the variable permute alternatives.
17020 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17021 return V;
17022
17023 // There are no generalized cross-lane shuffle operations available on i16
17024 // element types.
17025 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17026 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17027 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17028 return V;
17029
17030 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17031 DAG, Subtarget);
17032 }
17033
17034 SmallVector<int, 8> RepeatedMask;
17035 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17036 // As this is a single-input shuffle, the repeated mask should be
17037 // a strictly valid v8i16 mask that we can pass through to the v8i16
17038 // lowering to handle even the v16 case.
17039 return lowerV8I16GeneralSingleInputShuffle(
17040 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17041 }
17042 }
17043
17044 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17045 Zeroable, Subtarget, DAG))
17046 return PSHUFB;
17047
17048 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17049 if (Subtarget.hasBWI())
17050 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17051
17052 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17053 // shuffle.
17054 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17055 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17056 return Result;
17057
17058 // Try to permute the lanes and then use a per-lane permute.
17059 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17060 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17061 return V;
17062
17063 // Try to match an interleave of two v16i16s and lower them as unpck and
17064 // permutes using ymms.
17065 if (!Subtarget.hasAVX512())
17066 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17067 Mask, DAG))
17068 return V;
17069
17070 // Otherwise fall back on generic lowering.
17071 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17072 Subtarget, DAG);
17073}
17074
17075/// Handle lowering of 32-lane 8-bit integer shuffles.
17076///
17077/// This routine is only called when we have AVX2 and thus a reasonable
17078/// instruction set for v32i8 shuffling.
17079static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17085 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17086 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17092 Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Check for being able to broadcast a single element.
17096 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17097 Subtarget, DAG))
17098 return Broadcast;
17099
17100 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17101 Zeroable, Subtarget, DAG))
17102 return Blend;
17103
17104 // Use dedicated unpack instructions for masks that match their pattern.
17105 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17106 return V;
17107
17108 // Use dedicated pack instructions for masks that match their pattern.
17109 if (SDValue V =
17110 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17111 return V;
17112
17113 // Try to lower using a truncation.
17114 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17115 Subtarget, DAG))
17116 return V;
17117
17118 // Try to use shift instructions.
17119 if (SDValue Shift =
17120 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17121 DAG, /*BitwiseOnly*/ false))
17122 return Shift;
17123
17124 // Try to use byte rotation instructions.
17125 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17126 Subtarget, DAG))
17127 return Rotate;
17128
17129 // Try to use bit rotation instructions.
17130 if (V2.isUndef())
17131 if (SDValue Rotate =
17132 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17133 return Rotate;
17134
17135 // Try to create an in-lane repeating shuffle mask and then shuffle the
17136 // results into the target lanes.
17137 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17138 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17139 return V;
17140
17141 // There are no generalized cross-lane shuffle operations available on i8
17142 // element types.
17143 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17144 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17145 // because that should be faster than the variable permute alternatives.
17146 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17147 return V;
17148
17149 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17150 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17151 return V;
17152
17153 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17154 DAG, Subtarget);
17155 }
17156
17157 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17158 Zeroable, Subtarget, DAG))
17159 return PSHUFB;
17160
17161 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17162 if (Subtarget.hasVBMI())
17163 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17164
17165 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17166 // shuffle.
17167 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17168 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17169 return Result;
17170
17171 // Try to permute the lanes and then use a per-lane permute.
17172 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17173 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17174 return V;
17175
17176 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17177 // by zeroable elements in the remaining 24 elements. Turn this into two
17178 // vmovqb instructions shuffled together.
17179 if (Subtarget.hasVLX())
17180 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17181 Mask, Zeroable, DAG))
17182 return V;
17183
17184 // Try to match an interleave of two v32i8s and lower them as unpck and
17185 // permutes using ymms.
17186 if (!Subtarget.hasAVX512())
17187 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17188 Mask, DAG))
17189 return V;
17190
17191 // Otherwise fall back on generic lowering.
17192 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17193 Subtarget, DAG);
17194}
17195
17196/// High-level routine to lower various 256-bit x86 vector shuffles.
17197///
17198/// This routine either breaks down the specific type of a 256-bit x86 vector
17199/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17200/// together based on the available instructions.
17201static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17202 SDValue V1, SDValue V2, const APInt &Zeroable,
17203 const X86Subtarget &Subtarget,
17204 SelectionDAG &DAG) {
17205 // If we have a single input to the zero element, insert that into V1 if we
17206 // can do so cheaply.
17207 int NumElts = VT.getVectorNumElements();
17208 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17209
17210 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17211 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17212 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17213 return Insertion;
17214
17215 // Handle special cases where the lower or upper half is UNDEF.
17216 if (SDValue V =
17217 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17218 return V;
17219
17220 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17221 // can check for those subtargets here and avoid much of the subtarget
17222 // querying in the per-vector-type lowering routines. With AVX1 we have
17223 // essentially *zero* ability to manipulate a 256-bit vector with integer
17224 // types. Since we'll use floating point types there eventually, just
17225 // immediately cast everything to a float and operate entirely in that domain.
17226 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17227 int ElementBits = VT.getScalarSizeInBits();
17228 if (ElementBits < 32) {
17229 // No floating point type is available; if we can't use the bit operations
17230 // for masking/blending then decompose into 128-bit vectors.
17231 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17232 Subtarget, DAG))
17233 return V;
17234 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17235 return V;
17236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17237 }
17238
17239 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17241 V1 = DAG.getBitcast(FpVT, V1);
17242 V2 = DAG.getBitcast(FpVT, V2);
17243 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17244 }
17245
17246 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17247 V1 = DAG.getBitcast(MVT::v16i16, V1);
17248 V2 = DAG.getBitcast(MVT::v16i16, V2);
17249 return DAG.getBitcast(VT,
17250 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17251 }
17252
17253 switch (VT.SimpleTy) {
17254 case MVT::v4f64:
17255 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v4i64:
17257 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v8f32:
17259 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v8i32:
17261 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v16i16:
17263 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v32i8:
17265 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266
17267 default:
17268 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17269 }
17270}
17271
17272/// Try to lower a vector shuffle as 128-bit shuffles.
17273static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17274 const APInt &Zeroable, SDValue V1, SDValue V2,
17275 const X86Subtarget &Subtarget,
17276 SelectionDAG &DAG) {
17277 assert(VT.getScalarSizeInBits() == 64 &&
17278 "Unexpected element type size for 128bit shuffle.");
17279
17280 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
17281 // most probably the better solution for that case.
17282 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17283
17284 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17285 SmallVector<int, 4> Widened128Mask;
17286 if (!canWidenShuffleElements(Mask, Widened128Mask))
17287 return SDValue();
17288 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17289
17290 // Try to use an insert into a zero vector.
17291 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17292 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17293 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17294 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17295 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17296 DAG.getVectorIdxConstant(0, DL));
17297 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17298 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17299 DAG.getVectorIdxConstant(0, DL));
17300 }
17301
17302 // Check for patterns which can be matched with a single insert of a 256-bit
17303 // subvector.
17304 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17305 if (OnlyUsesV1 ||
17306 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17307 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17308 SDValue SubVec =
17309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17310 DAG.getVectorIdxConstant(0, DL));
17311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17312 DAG.getVectorIdxConstant(4, DL));
17313 }
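  // For example, the v8i64 mask <0,1,2,3,8,9,10,11> keeps the low 256 bits of
  // V1 and appends the low 256 bits of V2; the INSERT_SUBVECTOR built above is
  // normally selected as a single VINSERTI64x4/VINSERTF64x4.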
17314
17315 // See if this is an insertion of the lower 128-bits of V2 into V1.
17316 bool IsInsert = true;
17317 int V2Index = -1;
17318 for (int i = 0; i < 4; ++i) {
17319 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17320 if (Widened128Mask[i] < 0)
17321 continue;
17322
17323 // Make sure all V1 subvectors are in place.
17324 if (Widened128Mask[i] < 4) {
17325 if (Widened128Mask[i] != i) {
17326 IsInsert = false;
17327 break;
17328 }
17329 } else {
17330 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17331 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17332 IsInsert = false;
17333 break;
17334 }
17335 V2Index = i;
17336 }
17337 }
17338 if (IsInsert && V2Index >= 0) {
17339 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17340 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17341 DAG.getVectorIdxConstant(0, DL));
17342 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17343 }
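  // For example, the v8i64 mask <8,9,2,3,4,5,6,7> widens to <4,1,2,3>: only the
  // lowest 128 bits come from V2, so it is matched here with V2Index == 0 and
  // becomes an insertion of V2's low 128 bits into V1.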
17344
17345 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
17346 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17347 // possible we at least ensure the lanes stay sequential to help later
17348 // combines.
17349 SmallVector<int, 2> Widened256Mask;
17350 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17351 Widened128Mask.clear();
17352 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17353 }
17354
17355 // Try to lower to vshuf64x2/vshuf32x4.
17356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17357 int PermMask[4] = {-1, -1, -1, -1};
17358 // Ensure elements came from the same Op.
17359 for (int i = 0; i < 4; ++i) {
17360 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17361 if (Widened128Mask[i] < 0)
17362 continue;
17363
17364 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17365 unsigned OpIndex = i / 2;
17366 if (Ops[OpIndex].isUndef())
17367 Ops[OpIndex] = Op;
17368 else if (Ops[OpIndex] != Op)
17369 return SDValue();
17370
17371 PermMask[i] = Widened128Mask[i] % 4;
17372 }
17373
17374 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17375 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17376}
17377
17378/// Handle lowering of 8-lane 64-bit floating point shuffles.
17379static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17380 const APInt &Zeroable, SDValue V1, SDValue V2,
17381 const X86Subtarget &Subtarget,
17382 SelectionDAG &DAG) {
17383 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17385 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17386
17387 if (V2.isUndef()) {
17388 // Use low duplicate instructions for masks that match their pattern.
17389 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17390 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17391
17392 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17393 // Non-half-crossing single input shuffles can be lowered with an
17394 // interleaved permutation.
17395 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17396 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17397 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17398 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17399 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17400 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17401 }
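  // Worked example: the in-lane mask <1,1,3,3,5,5,7,7> sets all eight bits of
  // the immediate (0xFF) and takes the high element of every 64-bit pair, i.e.
  // a single VPERMILPD.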
17402
17403 SmallVector<int, 4> RepeatedMask;
17404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17405 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17406 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17407 }
17408
17409 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17410 V2, Subtarget, DAG))
17411 return Shuf128;
17412
17413 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17414 return Unpck;
17415
17416 // Check if the blend happens to exactly fit that of SHUFPD.
17417 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17418 Zeroable, Subtarget, DAG))
17419 return Op;
17420
17421 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17422 Subtarget, DAG))
17423 return V;
17424
17425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17426 Zeroable, Subtarget, DAG))
17427 return Blend;
17428
17429 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17430}
17431
17432/// Handle lowering of 16-lane 32-bit floating point shuffles.
17433static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17434 const APInt &Zeroable, SDValue V1, SDValue V2,
17435 const X86Subtarget &Subtarget,
17436 SelectionDAG &DAG) {
17437 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17439 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17440
17441 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17442 // options to efficiently lower the shuffle.
17443 SmallVector<int, 4> RepeatedMask;
17444 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17445 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17446
17447 // Use even/odd duplicate instructions for masks that match their pattern.
17448 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17449 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17450 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17451 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17452
17453 if (V2.isUndef())
17454 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17455 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17456
17457 // Use dedicated unpack instructions for masks that match their pattern.
17458 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17459 return V;
17460
17461 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17462 Zeroable, Subtarget, DAG))
17463 return Blend;
17464
17465 // Otherwise, fall back to a SHUFPS sequence.
17466 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17467 }
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17474 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17475 return DAG.getBitcast(MVT::v16f32, ZExt);
17476
17477 // Try to create an in-lane repeating shuffle mask and then shuffle the
17478 // results into the target lanes.
17479 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17480 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17481 return V;
17482
17483 // If we have a single input shuffle with different shuffle patterns in the
17484 // 128-bit lanes and doesn't cross lanes, use a variable mask VPERMILPS.
17485 if (V2.isUndef() &&
17486 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17487 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17488 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17489 }
17490
17491 // If we have AVX512F support, we can use VEXPAND.
17492 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17493 Zeroable, Subtarget, DAG))
17494 return V;
17495
17496 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17497}
17498
17499/// Handle lowering of 8-lane 64-bit integer shuffles.
17500static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17506 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17507
17508 // Try to use shift instructions if fast.
17509 if (Subtarget.preferLowerShuffleAsShift())
17510 if (SDValue Shift =
17511 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17512 Subtarget, DAG, /*BitwiseOnly*/ true))
17513 return Shift;
17514
17515 if (V2.isUndef()) {
17516 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17517 // can use lower latency instructions that will operate on all four
17518 // 128-bit lanes.
17519 SmallVector<int, 2> Repeated128Mask;
17520 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17521 SmallVector<int, 4> PSHUFDMask;
17522 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17523 return DAG.getBitcast(
17524 MVT::v8i64,
17525 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17526 DAG.getBitcast(MVT::v16i32, V1),
17527 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17528 }
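  // For example, the v8i64 mask <1,0,3,2,5,4,7,6> repeats <1,0> per 128-bit
  // lane; narrowed to 32-bit elements it becomes <2,3,0,1>, which is a single
  // VPSHUFD with immediate 0x4E on the v16i32 bitcast.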
17529
17530 SmallVector<int, 4> Repeated256Mask;
17531 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17532 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17533 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17534 }
17535
17536 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17537 V2, Subtarget, DAG))
17538 return Shuf128;
17539
17540 // Try to use shift instructions.
17541 if (SDValue Shift =
17542 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17543 DAG, /*BitwiseOnly*/ false))
17544 return Shift;
17545
17546 // Try to use VALIGN.
17547 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17548 Zeroable, Subtarget, DAG))
17549 return Rotate;
17550
17551 // Try to use PALIGNR.
17552 if (Subtarget.hasBWI())
17553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17554 Subtarget, DAG))
17555 return Rotate;
17556
17557 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17558 return Unpck;
17559
17560 // If we have AVX512F support, we can use VEXPAND.
17561 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564
17565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17566 Zeroable, Subtarget, DAG))
17567 return Blend;
17568
17569 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17570}
17571
17572/// Handle lowering of 16-lane 32-bit integer shuffles.
17573static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17574 const APInt &Zeroable, SDValue V1, SDValue V2,
17575 const X86Subtarget &Subtarget,
17576 SelectionDAG &DAG) {
17577 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17579 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17580
17581 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17582
17583 // Whenever we can lower this as a zext, that instruction is strictly faster
17584 // than any alternative. It also allows us to fold memory operands into the
17585 // shuffle in many cases.
17586 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17587 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17588 return ZExt;
17589
17590 // Try to use shift instructions if fast.
17591 if (Subtarget.preferLowerShuffleAsShift()) {
17592 if (SDValue Shift =
17593 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17594 Subtarget, DAG, /*BitwiseOnly*/ true))
17595 return Shift;
17596 if (NumV2Elements == 0)
17597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17598 Subtarget, DAG))
17599 return Rotate;
17600 }
17601
17602 // If the shuffle mask is repeated in each 128-bit lane we can use more
17603 // efficient instructions that mirror the shuffles across the four 128-bit
17604 // lanes.
17605 SmallVector<int, 4> RepeatedMask;
17606 bool Is128BitLaneRepeatedShuffle =
17607 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17608 if (Is128BitLaneRepeatedShuffle) {
17609 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17610 if (V2.isUndef())
17611 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17612 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17613
17614 // Use dedicated unpack instructions for masks that match their pattern.
17615 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17616 return V;
17617 }
17618
17619 // Try to use shift instructions.
17620 if (SDValue Shift =
17621 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17622 Subtarget, DAG, /*BitwiseOnly*/ false))
17623 return Shift;
17624
17625 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17626 if (SDValue Rotate =
17627 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17628 return Rotate;
17629
17630 // Try to use VALIGN.
17631 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17632 Zeroable, Subtarget, DAG))
17633 return Rotate;
17634
17635 // Try to use byte rotation instructions.
17636 if (Subtarget.hasBWI())
17637 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17638 Subtarget, DAG))
17639 return Rotate;
17640
17641 // Assume that a single SHUFPS is faster than using a permv shuffle.
17642 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17643 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17644 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17645 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17646 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17647 CastV1, CastV2, DAG);
17648 return DAG.getBitcast(MVT::v16i32, ShufPS);
17649 }
17650
17651 // Try to create an in-lane repeating shuffle mask and then shuffle the
17652 // results into the target lanes.
17653 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17654 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17655 return V;
17656
17657 // If we have AVX512F support, we can use VEXPAND.
17658 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17659 Zeroable, Subtarget, DAG))
17660 return V;
17661
17662 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17663 Zeroable, Subtarget, DAG))
17664 return Blend;
17665
17666 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17667}
17668
17669/// Handle lowering of 32-lane 16-bit integer shuffles.
17670static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17671 const APInt &Zeroable, SDValue V1, SDValue V2,
17672 const X86Subtarget &Subtarget,
17673 SelectionDAG &DAG) {
17674 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17676 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17677 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17678
17679 // Whenever we can lower this as a zext, that instruction is strictly faster
17680 // than any alternative. It also allows us to fold memory operands into the
17681 // shuffle in many cases.
17682 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17683 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17684 return ZExt;
17685
17686 // Use dedicated unpack instructions for masks that match their pattern.
17687 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17688 return V;
17689
17690 // Use dedicated pack instructions for masks that match their pattern.
17691 if (SDValue V =
17692 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17693 return V;
17694
17695 // Try to use shift instructions.
17696 if (SDValue Shift =
17697 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17698 Subtarget, DAG, /*BitwiseOnly*/ false))
17699 return Shift;
17700
17701 // Try to use byte rotation instructions.
17702 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17703 Subtarget, DAG))
17704 return Rotate;
17705
17706 if (V2.isUndef()) {
17707 // Try to use bit rotation instructions.
17708 if (SDValue Rotate =
17709 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17710 return Rotate;
17711
17712 SmallVector<int, 8> RepeatedMask;
17713 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17714 // As this is a single-input shuffle, the repeated mask should be
17715 // a strictly valid v8i16 mask that we can pass through to the v8i16
17716 // lowering to handle even the v32 case.
17717 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17718 RepeatedMask, Subtarget, DAG);
17719 }
17720 }
17721
17722 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17723 Zeroable, Subtarget, DAG))
17724 return Blend;
17725
17726 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17727 Zeroable, Subtarget, DAG))
17728 return PSHUFB;
17729
17730 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17731 // shuffle.
17732 if (!V2.isUndef())
17733 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17734 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17735 return Result;
17736
17737 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17738}
17739
17740/// Handle lowering of 64-lane 8-bit integer shuffles.
17741static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17742 const APInt &Zeroable, SDValue V1, SDValue V2,
17743 const X86Subtarget &Subtarget,
17744 SelectionDAG &DAG) {
17745 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17747 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17748 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17749
17750 // Whenever we can lower this as a zext, that instruction is strictly faster
17751 // than any alternative. It also allows us to fold memory operands into the
17752 // shuffle in many cases.
17753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17754 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17755 return ZExt;
17756
17757 // Use dedicated unpack instructions for masks that match their pattern.
17758 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17759 return V;
17760
17761 // Use dedicated pack instructions for masks that match their pattern.
17762 if (SDValue V =
17763 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17764 return V;
17765
17766 // Try to use shift instructions.
17767 if (SDValue Shift =
17768 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17769 DAG, /*BitwiseOnly*/ false))
17770 return Shift;
17771
17772 // Try to use byte rotation instructions.
17773 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17774 Subtarget, DAG))
17775 return Rotate;
17776
17777 // Try to use bit rotation instructions.
17778 if (V2.isUndef())
17779 if (SDValue Rotate =
17780 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17781 return Rotate;
17782
17783 // Lower as AND if possible.
17784 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17785 Zeroable, Subtarget, DAG))
17786 return Masked;
17787
17788 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17789 Zeroable, Subtarget, DAG))
17790 return PSHUFB;
17791
17792 // Try to create an in-lane repeating shuffle mask and then shuffle the
17793 // results into the target lanes.
17794 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17795 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17796 return V;
17797
17798 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17799 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17800 return Result;
17801
17802 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17803 Zeroable, Subtarget, DAG))
17804 return Blend;
17805
17806 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17807 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17808 // PALIGNR will be cheaper than the second PSHUFB+OR.
17809 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17810 Mask, Subtarget, DAG))
17811 return V;
17812
17813 // If we can't directly blend but can use PSHUFB, that will be better as it
17814 // can both shuffle and set up the inefficient blend.
17815 bool V1InUse, V2InUse;
17816 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17817 DAG, V1InUse, V2InUse);
17818 }
17819
17820 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17821 // shuffle.
17822 if (!V2.isUndef())
17823 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17824 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17825 return Result;
17826
17827 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17828 if (Subtarget.hasVBMI())
17829 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17830
17831 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17832}
17833
17834/// High-level routine to lower various 512-bit x86 vector shuffles.
17835///
17836/// This routine either breaks down the specific type of a 512-bit x86 vector
17837/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17838/// together based on the available instructions.
17839static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17840 MVT VT, SDValue V1, SDValue V2,
17841 const APInt &Zeroable,
17842 const X86Subtarget &Subtarget,
17843 SelectionDAG &DAG) {
17844 assert(Subtarget.hasAVX512() &&
17845 "Cannot lower 512-bit vectors w/ basic ISA!");
17846
17847 // If we have a single input to the zero element, insert that into V1 if we
17848 // can do so cheaply.
17849 int NumElts = Mask.size();
17850 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17851
17852 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17853 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17854 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17855 return Insertion;
17856
17857 // Handle special cases where the lower or upper half is UNDEF.
17858 if (SDValue V =
17859 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17860 return V;
17861
17862 // Check for being able to broadcast a single element.
17863 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17864 Subtarget, DAG))
17865 return Broadcast;
17866
17867 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17868 // Try using bit ops for masking and blending before falling back to
17869 // splitting.
17870 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17871 Subtarget, DAG))
17872 return V;
17873 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17874 return V;
17875
17876 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17877 }
17878
17879 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17880 if (!Subtarget.hasBWI())
17881 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17882 /*SimpleOnly*/ false);
17883
17884 V1 = DAG.getBitcast(MVT::v32i16, V1);
17885 V2 = DAG.getBitcast(MVT::v32i16, V2);
17886 return DAG.getBitcast(VT,
17887 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17888 }
17889
17890 // Dispatch to each element type for lowering. If we don't have support for
17891 // specific element type shuffles at 512 bits, immediately split them and
17892 // lower them. Each lowering routine of a given type is allowed to assume that
17893 // the requisite ISA extensions for that element type are available.
17894 switch (VT.SimpleTy) {
17895 case MVT::v8f64:
17896 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17897 case MVT::v16f32:
17898 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v8i64:
17900 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v16i32:
17902 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v32i16:
17904 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v64i8:
17906 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907
17908 default:
17909 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17910 }
17911}
17912
17913static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17914 MVT VT, SDValue V1, SDValue V2,
17915 const X86Subtarget &Subtarget,
17916 SelectionDAG &DAG) {
17917 // Shuffle should be unary.
17918 if (!V2.isUndef())
17919 return SDValue();
17920
17921 int ShiftAmt = -1;
17922 int NumElts = Mask.size();
17923 for (int i = 0; i != NumElts; ++i) {
17924 int M = Mask[i];
17925 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17926 "Unexpected mask index.");
17927 if (M < 0)
17928 continue;
17929
17930 // The first non-undef element determines our shift amount.
17931 if (ShiftAmt < 0) {
17932 ShiftAmt = M - i;
17933 // Need to be shifting right.
17934 if (ShiftAmt <= 0)
17935 return SDValue();
17936 }
17937 // All non-undef elements must shift by the same amount.
17938 if (ShiftAmt != M - i)
17939 return SDValue();
17940 }
17941 assert(ShiftAmt >= 0 && "All undef?");
17942
17943 // Great, we found a shift right.
17944 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17945 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17946 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17948 DAG.getVectorIdxConstant(0, DL));
17949}
17950
17951// Determine if this shuffle can be implemented with a KSHIFT instruction.
17952// Returns the shift amount if possible or -1 if not. This is a simplified
17953// version of matchShuffleAsShift.
17954static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17955 int MaskOffset, const APInt &Zeroable) {
17956 int Size = Mask.size();
17957
17958 auto CheckZeros = [&](int Shift, bool Left) {
17959 for (int j = 0; j < Shift; ++j)
17960 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17961 return false;
17962
17963 return true;
17964 };
17965
17966 auto MatchShift = [&](int Shift, bool Left) {
17967 unsigned Pos = Left ? Shift : 0;
17968 unsigned Low = Left ? 0 : Shift;
17969 unsigned Len = Size - Shift;
17970 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17971 };
17972
17973 for (int Shift = 1; Shift != Size; ++Shift)
17974 for (bool Left : {true, false})
17975 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17976 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17977 return Shift;
17978 }
17979
17980 return -1;
17981}
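// For illustration: with MaskOffset == 0, a v8i1 mask of <2,3,4,5,6,7,z,z>
// (where 'z' marks zeroable lanes) satisfies CheckZeros and MatchShift for
// Shift == 2 with Left == false, so a KSHIFTR by 2 is reported.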
17982
17983
17984// Lower vXi1 vector shuffles.
17985// There is no dedicated instruction on AVX-512 that shuffles the masks.
17986// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17987// vector, shuffle it, and then truncate it back.
17988static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17989 MVT VT, SDValue V1, SDValue V2,
17990 const APInt &Zeroable,
17991 const X86Subtarget &Subtarget,
17992 SelectionDAG &DAG) {
17993 assert(Subtarget.hasAVX512() &&
17994 "Cannot lower 512-bit vectors w/o basic ISA!");
17995
17996 int NumElts = Mask.size();
17997 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17998
17999 // Try to recognize shuffles that are just padding a subvector with zeros.
18000 int SubvecElts = 0;
18001 int Src = -1;
18002 for (int i = 0; i != NumElts; ++i) {
18003 if (Mask[i] >= 0) {
18004 // Grab the source from the first valid mask element. All subsequent
18005 // elements must use this same source.
18006 if (Src < 0)
18007 Src = Mask[i] / NumElts;
18008 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18009 break;
18010 }
18011
18012 ++SubvecElts;
18013 }
18014 assert(SubvecElts != NumElts && "Identity shuffle?");
18015
18016 // Clip to a power of 2.
18017 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18018
18019 // Make sure the number of zeroable bits in the top at least covers the bits
18020 // not covered by the subvector.
18021 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18022 assert(Src >= 0 && "Expected a source!");
18023 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18024 SDValue Extract =
18025 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18026 DAG.getVectorIdxConstant(0, DL));
18027 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18028 DAG.getConstant(0, DL, VT), Extract,
18029 DAG.getVectorIdxConstant(0, DL));
18030 }
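  // For example, a v16i1 shuffle that takes elements <0,1,2,3> from V1 while
  // the remaining twelve lanes are zeroable extracts a v4i1 subvector and
  // inserts it into an all-zero v16i1, with no cross-mask shuffling needed.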
18031
18032 // Try a simple shift right with undef elements. Later we'll try with zeros.
18033 if (SDValue Shift =
18034 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18035 return Shift;
18036
18037 // Try to match KSHIFTs.
18038 unsigned Offset = 0;
18039 for (SDValue V : {V1, V2}) {
18040 unsigned Opcode;
18041 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18042 if (ShiftAmt >= 0) {
18043 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18044 MVT WideVT = Res.getSimpleValueType();
18045 // Widened right shifts need two shifts to ensure we shift in zeroes.
18046 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18047 int WideElts = WideVT.getVectorNumElements();
18048 // Shift left to put the original vector in the MSBs of the new size.
18049 Res =
18050 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18051 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18052 // Increase the shift amount to account for the left shift.
18053 ShiftAmt += WideElts - NumElts;
18054 }
18055
18056 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18057 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18059 DAG.getVectorIdxConstant(0, DL));
18060 }
18061 Offset += NumElts; // Increment for next iteration.
18062 }
18063
18064 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18065 // ops instead.
18066 // TODO: What other unary shuffles would benefit from this?
18067 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18068 SDValue Op0 = V1.getOperand(0);
18069 SDValue Op1 = V1.getOperand(1);
18070 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18071 EVT OpVT = Op0.getValueType();
18072 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18073 return DAG.getSetCC(
18074 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18075 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18076 }
18077
18078 MVT ExtVT;
18079 switch (VT.SimpleTy) {
18080 default:
18081 llvm_unreachable("Expected a vector of i1 elements");
18082 case MVT::v2i1:
18083 ExtVT = MVT::v2i64;
18084 break;
18085 case MVT::v4i1:
18086 ExtVT = MVT::v4i32;
18087 break;
18088 case MVT::v8i1:
18089 // Take a 512-bit type (more shuffles on KNL). If we have VLX, use a 256-bit
18090 // shuffle.
18091 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18092 break;
18093 case MVT::v16i1:
18094 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18095 // 256-bit operation available.
18096 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18097 break;
18098 case MVT::v32i1:
18099 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18100 // 256-bit operation available.
18101 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18102 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18103 break;
18104 case MVT::v64i1:
18105 // Fall back to scalarization. FIXME: We can do better if the shuffle
18106 // can be partitioned cleanly.
18107 if (!Subtarget.useBWIRegs())
18108 return SDValue();
18109 ExtVT = MVT::v64i8;
18110 break;
18111 }
18112
18113 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18114 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18115
18116 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18117 // Since i1 was sign extended, we can use X86ISD::CVT2MASK.
18118 int NumElems = VT.getVectorNumElements();
18119 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18120 (Subtarget.hasDQI() && (NumElems < 32)))
18121 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18122 Shuffle, ISD::SETGT);
18123
18124 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18125}
18126
18127/// Helper function that returns true if the shuffle mask should be
18128/// commuted to improve canonicalization.
18129static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18130 int NumElements = Mask.size();
18131
18132 int NumV1Elements = 0, NumV2Elements = 0;
18133 for (int M : Mask)
18134 if (M < 0)
18135 continue;
18136 else if (M < NumElements)
18137 ++NumV1Elements;
18138 else
18139 ++NumV2Elements;
18140
18141 // Commute the shuffle as needed such that more elements come from V1 than
18142 // V2. This allows us to match the shuffle pattern strictly on how many
18143 // elements come from V1 without handling the symmetric cases.
18144 if (NumV2Elements > NumV1Elements)
18145 return true;
18146
18147 assert(NumV1Elements > 0 && "No V1 indices");
18148
18149 if (NumV2Elements == 0)
18150 return false;
18151
18152 // When the number of V1 and V2 elements is the same, try to minimize the
18153 // number of uses of V2 in the low half of the vector. When that is tied,
18154 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18155 // indices for V2. When those are equal, try to ensure that the number of odd
18156 // indices for V1 is lower than the number of odd indices for V2.
18157 if (NumV1Elements == NumV2Elements) {
18158 int LowV1Elements = 0, LowV2Elements = 0;
18159 for (int M : Mask.slice(0, NumElements / 2))
18160 if (M >= NumElements)
18161 ++LowV2Elements;
18162 else if (M >= 0)
18163 ++LowV1Elements;
18164 if (LowV2Elements > LowV1Elements)
18165 return true;
18166 if (LowV2Elements == LowV1Elements) {
18167 int SumV1Indices = 0, SumV2Indices = 0;
18168 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18169 if (Mask[i] >= NumElements)
18170 SumV2Indices += i;
18171 else if (Mask[i] >= 0)
18172 SumV1Indices += i;
18173 if (SumV2Indices < SumV1Indices)
18174 return true;
18175 if (SumV2Indices == SumV1Indices) {
18176 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18177 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18178 if (Mask[i] >= NumElements)
18179 NumV2OddIndices += i % 2;
18180 else if (Mask[i] >= 0)
18181 NumV1OddIndices += i % 2;
18182 if (NumV2OddIndices < NumV1OddIndices)
18183 return true;
18184 }
18185 }
18186 }
18187
18188 return false;
18189}
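// For illustration: the v4i32 mask <4,5,6,3> uses three elements from V2 and
// only one from V1, so this helper returns true and the caller commutes the
// operands, turning the mask into <0,1,2,7>.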
18190
18191static bool canCombineAsMaskOperation(SDValue V,
18192 const X86Subtarget &Subtarget) {
18193 if (!Subtarget.hasAVX512())
18194 return false;
18195
18196 if (!V.getValueType().isSimple())
18197 return false;
18198
18199 MVT VT = V.getSimpleValueType().getScalarType();
18200 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18201 return false;
18202
18203 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18204 // are preferable to blendw/blendvb/masked-mov.
18205 if ((VT == MVT::i16 || VT == MVT::i8) &&
18206 V.getSimpleValueType().getSizeInBits() < 512)
18207 return false;
18208
18209 auto HasMaskOperation = [&](SDValue V) {
18210 // TODO: Currently we only check a limited set of opcodes. We could probably
18211 // extend this to all binary operations by checking TLI.isBinOp().
18212 switch (V->getOpcode()) {
18213 default:
18214 return false;
18215 case ISD::ADD:
18216 case ISD::SUB:
18217 case ISD::AND:
18218 case ISD::XOR:
18219 case ISD::OR:
18220 case ISD::SMAX:
18221 case ISD::SMIN:
18222 case ISD::UMAX:
18223 case ISD::UMIN:
18224 case ISD::ABS:
18225 case ISD::SHL:
18226 case ISD::SRL:
18227 case ISD::SRA:
18228 case ISD::MUL:
18229 break;
18230 }
18231 if (!V->hasOneUse())
18232 return false;
18233
18234 return true;
18235 };
18236
18237 if (HasMaskOperation(V))
18238 return true;
18239
18240 return false;
18241}
18242
18243// Forward declaration.
18244static SDValue canonicalizeShuffleMaskWithHorizOp(
18245 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18246 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18247 const X86Subtarget &Subtarget);
18248
18249/// Top-level lowering for x86 vector shuffles.
18250///
18251/// This handles decomposition, canonicalization, and lowering of all x86
18252/// vector shuffles. Most of the specific lowering strategies are encapsulated
18253/// above in helper routines. The canonicalization attempts to widen shuffles
18254/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18255/// s.t. only one of the two inputs needs to be tested, etc.
18256static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18257 SelectionDAG &DAG) {
18258 auto *SVOp = cast<ShuffleVectorSDNode>(Op);
18259 ArrayRef<int> OrigMask = SVOp->getMask();
18260 SDValue V1 = Op.getOperand(0);
18261 SDValue V2 = Op.getOperand(1);
18262 MVT VT = Op.getSimpleValueType();
18263 int NumElements = VT.getVectorNumElements();
18264 SDLoc DL(Op);
18265 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18266
18267 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18268 "Can't lower MMX shuffles");
18269
18270 bool V1IsUndef = V1.isUndef();
18271 bool V2IsUndef = V2.isUndef();
18272 if (V1IsUndef && V2IsUndef)
18273 return DAG.getUNDEF(VT);
18274
18275 // When we create a shuffle node we put the UNDEF node in the second operand,
18276 // but in some cases the first operand may be transformed to UNDEF.
18277 // In this case we should just commute the node.
18278 if (V1IsUndef)
18279 return DAG.getCommutedVectorShuffle(*SVOp);
18280
18281 // Check for non-undef masks pointing at an undef vector and make the masks
18282 // undef as well. This makes it easier to match the shuffle based solely on
18283 // the mask.
18284 if (V2IsUndef &&
18285 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18286 SmallVector<int, 8> NewMask(OrigMask);
18287 for (int &M : NewMask)
18288 if (M >= NumElements)
18289 M = -1;
18290 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18291 }
18292
18293 // Check for illegal shuffle mask element index values.
18294 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18295 (void)MaskUpperLimit;
18296 assert(llvm::all_of(OrigMask,
18297 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18298 "Out of bounds shuffle index");
18299
18300 // We actually see shuffles that are entirely re-arrangements of a set of
18301 // zero inputs. This mostly happens while decomposing complex shuffles into
18302 // simple ones. Directly lower these as a buildvector of zeros.
18303 APInt KnownUndef, KnownZero;
18304 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18305
18306 APInt Zeroable = KnownUndef | KnownZero;
18307 if (Zeroable.isAllOnes())
18308 return getZeroVector(VT, Subtarget, DAG, DL);
18309
18310 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18311
18312 // Try to collapse shuffles into using a vector type with fewer elements but
18313 // wider element types. We cap this to not form integers or floating point
18314 // elements wider than 64 bits. It does not seem beneficial to form i128
18315 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18316 SmallVector<int, 16> WidenedMask;
18317 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18318 !canCombineAsMaskOperation(V1, Subtarget) &&
18319 !canCombineAsMaskOperation(V2, Subtarget) &&
18320 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18321 // Shuffle mask widening should not interfere with a broadcast opportunity
18322 // by obfuscating the operands with bitcasts.
18323 // TODO: Avoid lowering directly from this top-level function: make this
18324 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18325 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18326 Subtarget, DAG))
18327 return Broadcast;
18328
18329 MVT NewEltVT = VT.isFloatingPoint()
18330 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18331 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18332 int NewNumElts = NumElements / 2;
18333 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18334 // Make sure that the new vector type is legal. For example, v2f64 isn't
18335 // legal on SSE1.
18336 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18337 if (V2IsZero) {
18338 // Modify the new Mask to take all zeros from the all-zero vector.
18339 // Choose indices that are blend-friendly.
18340 bool UsedZeroVector = false;
18341 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18342 "V2's non-undef elements are used?!");
18343 for (int i = 0; i != NewNumElts; ++i)
18344 if (WidenedMask[i] == SM_SentinelZero) {
18345 WidenedMask[i] = i + NewNumElts;
18346 UsedZeroVector = true;
18347 }
18348 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18349 // some elements to be undef.
18350 if (UsedZeroVector)
18351 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18352 }
18353 V1 = DAG.getBitcast(NewVT, V1);
18354 V2 = DAG.getBitcast(NewVT, V2);
18355 return DAG.getBitcast(
18356 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18357 }
18358 }
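  // For example, a v8i16 mask of <0,1,4,5,8,9,12,13> pairs cleanly into the
  // v4i32 mask <0,2,4,6>, so (when the conditions above hold) the shuffle is
  // re-queried at the wider type where more of the lowerings above can apply.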
18359
18360 SmallVector<SDValue> Ops = {V1, V2};
18361 SmallVector<int> Mask(OrigMask);
18362
18363 // Canonicalize the shuffle with any horizontal ops inputs.
18364 // NOTE: This may update Ops and Mask.
18365 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18366 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18367 return DAG.getBitcast(VT, HOp);
18368
18369 V1 = DAG.getBitcast(VT, Ops[0]);
18370 V2 = DAG.getBitcast(VT, Ops[1]);
18371 assert(NumElements == (int)Mask.size() &&
18372 "canonicalizeShuffleMaskWithHorizOp "
18373 "shouldn't alter the shuffle mask size");
18374
18375 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18376 // These will be materialized uniformly anyway, so make splat matching easier.
18377 // TODO: Allow all int constants?
18378 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18379 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18380 BitVector Undefs;
18381 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18382 if (Undefs.any() &&
18385 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18386 }
18387 }
18388 }
18389 return V;
18390 };
18391 V1 = CanonicalizeConstant(V1);
18392 V2 = CanonicalizeConstant(V2);
18393
18394 // Commute the shuffle if it will improve canonicalization.
18395 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18396 ShuffleVectorSDNode::commuteMask(Mask);
18397 std::swap(V1, V2);
18398 }
18399
18400 // For each vector width, delegate to a specialized lowering routine.
18401 if (VT.is128BitVector())
18402 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18403
18404 if (VT.is256BitVector())
18405 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18406
18407 if (VT.is512BitVector())
18408 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18409
18410 if (Is1BitVector)
18411 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18412
18413 llvm_unreachable("Unimplemented!");
18414}
18415
18416// As legal vpcompress instructions depend on various AVX512 extensions, try to
18417// convert illegal vector sizes to legal ones to avoid expansion.
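// For example (illustrative), a v4i32 VECTOR_COMPRESS with only AVX512F
// available can be widened to v16i32 (zeroing the extra mask bits), compressed
// with the 512-bit VPCOMPRESSD, and the original 128 bits extracted back out.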
18418 static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18419 SelectionDAG &DAG) {
18420 assert(Subtarget.hasAVX512() &&
18421 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18422
18423 SDLoc DL(Op);
18424 SDValue Vec = Op.getOperand(0);
18425 SDValue Mask = Op.getOperand(1);
18426 SDValue Passthru = Op.getOperand(2);
18427
18428 EVT VecVT = Vec.getValueType();
18429 EVT ElementVT = VecVT.getVectorElementType();
18430 unsigned NumElements = VecVT.getVectorNumElements();
18431 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18432 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18433
18434 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18435 // compressed as 512-bit vectors in AVX512F.
18436 if (NumVecBits != 128 && NumVecBits != 256)
18437 return SDValue();
18438
18439 if (NumElementBits == 32 || NumElementBits == 64) {
18440 unsigned NumLargeElements = 512 / NumElementBits;
18441 MVT LargeVecVT =
18442 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18443 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18444
18445 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18446 DAG, DL);
18447 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18448 Subtarget, DAG, DL);
18449 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18450 : widenSubVector(LargeVecVT, Passthru,
18451 /*ZeroNewElements=*/false,
18452 Subtarget, DAG, DL);
18453
18454 SDValue Compressed =
18455 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18456 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18457 DAG.getConstant(0, DL, MVT::i64));
18458 }
18459
18460 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18461 VecVT == MVT::v16i16) {
18462 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18463 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18464
18465 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18466 Passthru = Passthru.isUndef()
18467 ? DAG.getUNDEF(LargeVecVT)
18468 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18469
18470 SDValue Compressed =
18471 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18472 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18473 }
18474
18475 return SDValue();
18476}
18477
18478/// Try to lower a VSELECT instruction to a vector shuffle.
18479 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18480 const X86Subtarget &Subtarget,
18481 SelectionDAG &DAG) {
18482 SDValue Cond = Op.getOperand(0);
18483 SDValue LHS = Op.getOperand(1);
18484 SDValue RHS = Op.getOperand(2);
18485 MVT VT = Op.getSimpleValueType();
18486
18487 // Only non-legal VSELECTs reach this lowering, convert those into generic
18488 // shuffles and re-use the shuffle lowering path for blends.
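// Illustrative example: a v4i32 vselect with the constant condition
// <0,-1,0,-1> corresponds to the shuffle mask <4,1,6,3>, where indices >= 4
// pick lanes from RHS and the remaining indices pick lanes from LHS.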
18492 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18493 }
18494
18495 return SDValue();
18496}
18497
18498SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18499 SDValue Cond = Op.getOperand(0);
18500 SDValue LHS = Op.getOperand(1);
18501 SDValue RHS = Op.getOperand(2);
18502
18503 SDLoc dl(Op);
18504 MVT VT = Op.getSimpleValueType();
18505 if (isSoftF16(VT, Subtarget)) {
18506 MVT NVT = VT.changeVectorElementTypeToInteger();
18507 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18508 DAG.getBitcast(NVT, LHS),
18509 DAG.getBitcast(NVT, RHS)));
18510 }
18511
18512 // A vselect where all conditions and data are constants can be optimized into
18513 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18517 return SDValue();
18518
18519 // Try to lower this to a blend-style vector shuffle. This can handle all
18520 // constant condition cases.
18521 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18522 return BlendOp;
18523
18524 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18525 // with patterns on the mask registers on AVX-512.
18526 MVT CondVT = Cond.getSimpleValueType();
18527 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18528 if (CondEltSize == 1)
18529 return Op;
18530
18531 // Variable blends are only legal from SSE4.1 onward.
18532 if (!Subtarget.hasSSE41())
18533 return SDValue();
18534
18535 unsigned EltSize = VT.getScalarSizeInBits();
18536 unsigned NumElts = VT.getVectorNumElements();
18537
18538 // Expand v32i16/v64i8 without BWI.
18539 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18540 return SDValue();
18541
18542 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18543 // into an i1 condition so that we can use the mask-based 512-bit blend
18544 // instructions.
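// E.g. for a v16i32 select with a v16i32 condition (illustrative):
//   %k = setcc ne %cond, zeroinitializer   ; v16i1 mask
//   %r = vselect %k, %lhs, %rhs            ; can match VPBLENDMD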
18545 if (VT.getSizeInBits() == 512) {
18546 // Build a mask by testing the condition against zero.
18547 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18548 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18549 DAG.getConstant(0, dl, CondVT),
18550 ISD::SETNE);
18551 // Now return a new VSELECT using the mask.
18552 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18553 }
18554
18555 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18556 if (CondEltSize != EltSize) {
18557 // If we don't have a sign splat, rely on the expansion.
18558 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18559 return SDValue();
18560
18561 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18562 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18563 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18564 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18565 }
18566
18567 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18568 // are free to split, it is better to split before expanding the
18569 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18570 // TODO: This is very similar to narrowVectorSelect.
18571 // TODO: Add Load splitting to isFreeToSplitVector ?
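// E.g. a v32i8 select on an AVX1-only target can be split into two v16i8
// selects on the 128-bit halves when the condition and one data operand are
// free to split (illustrative example).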
18572 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18573 !Subtarget.hasXOP()) {
18574 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18575 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18576 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18577 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18578 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18579 if (FreeCond && (FreeLHS || FreeRHS))
18580 return splitVectorOp(Op, DAG, dl);
18581 }
18582
18583 // Only some types will be legal on some subtargets. If we can emit a legal
18584 // VSELECT-matching blend, return Op; but if we need to expand, return
18585 // a null value.
18586 switch (VT.SimpleTy) {
18587 default:
18588 // Most of the vector types have blends past SSE4.1.
18589 return Op;
18590
18591 case MVT::v32i8:
18592 // The byte blends for AVX vectors were introduced only in AVX2.
18593 if (Subtarget.hasAVX2())
18594 return Op;
18595
18596 return SDValue();
18597
18598 case MVT::v8i16:
18599 case MVT::v16i16:
18600 case MVT::v8f16:
18601 case MVT::v16f16: {
18602 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18603 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18604 Cond = DAG.getBitcast(CastVT, Cond);
18605 LHS = DAG.getBitcast(CastVT, LHS);
18606 RHS = DAG.getBitcast(CastVT, RHS);
18607 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18608 return DAG.getBitcast(VT, Select);
18609 }
18610 }
18611}
18612
18613 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18614 MVT VT = Op.getSimpleValueType();
18615 SDValue Vec = Op.getOperand(0);
18616 SDValue Idx = Op.getOperand(1);
18617 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18618 SDLoc dl(Op);
18619
18621 return SDValue();
18622
18623 if (VT.getSizeInBits() == 8) {
18624 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18625 // we're going to zero extend the register or fold the store.
18628 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18629 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18631
18632 unsigned IdxVal = Idx->getAsZExtVal();
18633 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18634 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18636 }
18637
18638 if (VT == MVT::f32) {
18639 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18640 // the result back to FR32 register. It's only worth matching if the
18641 // result has a single use which is a store or a bitcast to i32. And in
18642 // the case of a store, it's not worth it if the index is a constant 0,
18643 // because a MOVSSmr can be used instead, which is smaller and faster.
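// A typical profitable case (illustrative):
//   %e = extractelement <4 x float> %v, i32 1
//   store float %e, ptr %p
// which can be selected as "extractps $1, %xmm0, (%rdi)" rather than a
// shuffle plus movss.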
18644 if (!Op.hasOneUse())
18645 return SDValue();
18646 SDNode *User = *Op.getNode()->user_begin();
18647 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18648 (User->getOpcode() != ISD::BITCAST ||
18649 User->getValueType(0) != MVT::i32))
18650 return SDValue();
18651 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18652 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18653 return DAG.getBitcast(MVT::f32, Extract);
18654 }
18655
18656 if (VT == MVT::i32 || VT == MVT::i64)
18657 return Op;
18658
18659 return SDValue();
18660}
18661
18662/// Extract one bit from mask vector, like v16i1 or v8i1.
18663/// AVX-512 feature.
18664 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18665 const X86Subtarget &Subtarget) {
18666 SDValue Vec = Op.getOperand(0);
18667 SDLoc dl(Vec);
18668 MVT VecVT = Vec.getSimpleValueType();
18669 SDValue Idx = Op.getOperand(1);
18670 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18671 MVT EltVT = Op.getSimpleValueType();
18672
18673 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18674 "Unexpected vector type in ExtractBitFromMaskVector");
18675
18676 // A variable index can't be handled in mask registers,
18677 // so extend the vector to VR512/128.
18678 if (!IdxC) {
18679 unsigned NumElts = VecVT.getVectorNumElements();
18680 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18681 // than extending to 128/256-bit.
18682 if (NumElts == 1) {
18683 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18685 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18686 }
18687 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18688 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18689 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18690 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18691 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18692 }
18693
18694 unsigned IdxVal = IdxC->getZExtValue();
18695 if (IdxVal == 0) // the operation is legal
18696 return Op;
18697
18698 // Extend to natively supported kshift.
18699 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18700
18701 // Use kshiftr instruction to move to the lower element.
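// E.g. extracting bit 5 of a v16i1 mask in %k0 becomes roughly (illustrative):
//   kshiftrw $5, %k0, %k0
// followed by extracting bit 0 into a GPR (kmovw plus masking).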
18702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18703 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18704
18705 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18706 DAG.getVectorIdxConstant(0, dl));
18707}
18708
18709// Helper to find all the extracted elements from a vector.
18710 static APInt getExtractedDemandedElts(SDNode *N) {
18711 MVT VT = N->getSimpleValueType(0);
18712 unsigned NumElts = VT.getVectorNumElements();
18713 APInt DemandedElts = APInt::getZero(NumElts);
18714 for (SDNode *User : N->users()) {
18715 switch (User->getOpcode()) {
18716 case X86ISD::PEXTRB:
18717 case X86ISD::PEXTRW:
18720 DemandedElts.setAllBits();
18721 return DemandedElts;
18722 }
18723 DemandedElts.setBit(User->getConstantOperandVal(1));
18724 break;
18725 case ISD::BITCAST: {
18726 if (!User->getValueType(0).isSimple() ||
18727 !User->getValueType(0).isVector()) {
18728 DemandedElts.setAllBits();
18729 return DemandedElts;
18730 }
18731 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18732 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18733 break;
18734 }
18735 default:
18736 DemandedElts.setAllBits();
18737 return DemandedElts;
18738 }
18739 }
18740 return DemandedElts;
18741}
18742
18743SDValue
18744X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18745 SelectionDAG &DAG) const {
18746 SDLoc dl(Op);
18747 SDValue Vec = Op.getOperand(0);
18748 MVT VecVT = Vec.getSimpleValueType();
18749 SDValue Idx = Op.getOperand(1);
18750 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18751
18752 if (VecVT.getVectorElementType() == MVT::i1)
18753 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18754
18755 if (!IdxC) {
18756 // It's more profitable to go through memory (1 cycle throughput)
18757 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18758 // The IACA tool was used to get the performance estimates
18759 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18760 //
18761 // example : extractelement <16 x i8> %a, i32 %i
18762 //
18763 // Block Throughput: 3.00 Cycles
18764 // Throughput Bottleneck: Port5
18765 //
18766 // | Num Of | Ports pressure in cycles | |
18767 // | Uops | 0 - DV | 5 | 6 | 7 | |
18768 // ---------------------------------------------
18769 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18770 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18771 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18772 // Total Num Of Uops: 4
18773 //
18774 //
18775 // Block Throughput: 1.00 Cycles
18776 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18777 //
18778 // | | Ports pressure in cycles | |
18779 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18780 // ---------------------------------------------------------
18781 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18782 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18783 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18784 // Total Num Of Uops: 4
18785
18786 return SDValue();
18787 }
18788
18789 unsigned IdxVal = IdxC->getZExtValue();
18790
18791 // If this is a 256-bit vector result, first extract the 128-bit vector and
18792 // then extract the element from the 128-bit vector.
18793 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18794 // Get the 128-bit vector.
18795 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18796 MVT EltVT = VecVT.getVectorElementType();
18797
18798 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18799 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18800
18801 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18802 // this can be done with a mask.
18803 IdxVal &= ElemsPerChunk - 1;
18804 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18805 DAG.getVectorIdxConstant(IdxVal, dl));
18806 }
18807
18808 assert(VecVT.is128BitVector() && "Unexpected vector length");
18809
18810 MVT VT = Op.getSimpleValueType();
18811
18812 if (VT == MVT::i16) {
18813 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18814 // we're going to zero extend the register or fold the store (SSE41 only).
18815 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18816 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18817 if (Subtarget.hasFP16())
18818 return Op;
18819
18820 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18821 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18822 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18823 }
18824
18825 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18826 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18827 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18828 }
18829
18830 if (Subtarget.hasSSE41())
18831 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18832 return Res;
18833
18834 // Only extract a single element from a v16i8 source - determine the common
18835 // DWORD/WORD that all extractions share, and extract the sub-byte.
18836 // TODO: Add QWORD MOVQ extraction?
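// E.g. if only byte 5 of a v16i8 source is ever extracted, it can be read as
// (illustrative): pextrw $2, %xmm0, %eax; shrl $8, %eax; and then truncated
// to i8.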
18837 if (VT == MVT::i8) {
18838 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18839 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18840
18841 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18842 int DWordIdx = IdxVal / 4;
18843 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18845 DAG.getBitcast(MVT::v4i32, Vec),
18846 DAG.getVectorIdxConstant(DWordIdx, dl));
18847 int ShiftVal = (IdxVal % 4) * 8;
18848 if (ShiftVal != 0)
18849 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18850 DAG.getConstant(ShiftVal, dl, MVT::i8));
18851 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18852 }
18853
18854 int WordIdx = IdxVal / 2;
18855 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18856 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18857 DAG.getBitcast(MVT::v8i16, Vec),
18858 DAG.getVectorIdxConstant(WordIdx, dl));
18859 int ShiftVal = (IdxVal % 2) * 8;
18860 if (ShiftVal != 0)
18861 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18862 DAG.getConstant(ShiftVal, dl, MVT::i8));
18863 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18864 }
18865 }
18866
18867 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18868 if (IdxVal == 0)
18869 return Op;
18870
18871 // Shuffle the element to the lowest element, then movss or movsh.
18872 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18873 Mask[0] = static_cast<int>(IdxVal);
18874 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18876 DAG.getVectorIdxConstant(0, dl));
18877 }
18878
18879 if (VT.getSizeInBits() == 64) {
18880 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18881 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18882 // to match extract_elt for f64.
18883 if (IdxVal == 0)
18884 return Op;
18885
18886 // UNPCKHPD the element to the lowest double word, then movsd.
18887 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18888 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18889 int Mask[2] = { 1, -1 };
18890 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18892 DAG.getVectorIdxConstant(0, dl));
18893 }
18894
18895 return SDValue();
18896}
18897
18898/// Insert one bit to mask vector, like v16i1 or v8i1.
18899/// AVX-512 feature.
18900 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18901 const X86Subtarget &Subtarget) {
18902 SDLoc dl(Op);
18903 SDValue Vec = Op.getOperand(0);
18904 SDValue Elt = Op.getOperand(1);
18905 SDValue Idx = Op.getOperand(2);
18906 MVT VecVT = Vec.getSimpleValueType();
18907
18908 if (!isa<ConstantSDNode>(Idx)) {
18909 // Non-constant index. Extend the source and destination,
18910 // insert the element, and then truncate the result.
18911 unsigned NumElts = VecVT.getVectorNumElements();
18912 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18913 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18914 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18916 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18917 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18918 }
18919
18920 // Copy into a k-register, extract to v1i1 and insert_subvector.
18921 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18922 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18923}
18924
18925SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18926 SelectionDAG &DAG) const {
18927 MVT VT = Op.getSimpleValueType();
18928 MVT EltVT = VT.getVectorElementType();
18929 unsigned NumElts = VT.getVectorNumElements();
18930 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18931
18932 if (EltVT == MVT::i1)
18933 return InsertBitToMaskVector(Op, DAG, Subtarget);
18934
18935 SDLoc dl(Op);
18936 SDValue N0 = Op.getOperand(0);
18937 SDValue N1 = Op.getOperand(1);
18938 SDValue N2 = Op.getOperand(2);
18939 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18940
18941 if (EltVT == MVT::bf16) {
18942 MVT IVT = VT.changeVectorElementTypeToInteger();
18943 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18944 DAG.getBitcast(IVT, N0),
18945 DAG.getBitcast(MVT::i16, N1), N2);
18946 return DAG.getBitcast(VT, Res);
18947 }
18948
18949 if (!N2C) {
18950 // With variable insertion indices we're usually better off spilling to the stack,
18951 // but AVX512 can use a variable compare+select by comparing against all
18952 // possible vector indices, and FP insertion has less gpr->simd traffic.
18953 if (!(Subtarget.hasBWI() ||
18954 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18955 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18956 return SDValue();
18957
18958 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18959 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18960 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18961 return SDValue();
18962
18963 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18964 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18965 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18966
18967 SmallVector<SDValue, 16> RawIndices;
18968 for (unsigned I = 0; I != NumElts; ++I)
18969 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18970 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18971
18972 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
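// E.g. for v4i32 this compares splat(N2) against the constant vector
// <0,1,2,3> and selects splat(N1) in the single matching lane (illustrative).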
18973 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18975 }
18976
18977 if (N2C->getAPIntValue().uge(NumElts))
18978 return SDValue();
18979 uint64_t IdxVal = N2C->getZExtValue();
18980
18981 bool IsZeroElt = X86::isZeroNode(N1);
18982 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18983
18984 if (IsZeroElt || IsAllOnesElt) {
18985 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18986 // We don't deal with i8 0 since it appears to be handled elsewhere.
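// E.g. inserting i8 -1 into lane 3 of a v16i8 without SSE4.1 can be lowered as
// (illustrative): or %vec, <0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0>.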
18987 if (IsAllOnesElt &&
18988 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18989 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18990 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18991 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18992 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18993 CstVectorElts[IdxVal] = OnesCst;
18994 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18995 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18996 }
18997 // See if we can do this more efficiently with a blend shuffle with a
18998 // rematerializable vector.
18999 if (Subtarget.hasSSE41() &&
19000 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19001 SmallVector<int, 8> BlendMask;
19002 for (unsigned i = 0; i != NumElts; ++i)
19003 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19004 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19005 : getOnesVector(VT, DAG, dl);
19006 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19007 }
19008 }
19009
19010 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19011 // into that, and then insert the subvector back into the result.
19012 if (VT.is256BitVector() || VT.is512BitVector()) {
19013 // With a 256-bit vector, we can insert into the zero element efficiently
19014 // using a blend if we have AVX or AVX2 and the right data type.
19015 if (VT.is256BitVector() && IdxVal == 0) {
19016 // TODO: It is worthwhile to cast integer to floating point and back
19017 // and incur a domain crossing penalty if that's what we'll end up
19018 // doing anyway after extracting to a 128-bit vector.
19019 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19020 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19021 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19022 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19023 DAG.getTargetConstant(1, dl, MVT::i8));
19024 }
19025 }
19026
19027 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19028 assert(isPowerOf2_32(NumEltsIn128) &&
19029 "Vectors will always have power-of-two number of elements.");
19030
19031 // If we are not inserting into the low 128-bit vector chunk,
19032 // then prefer the broadcast+blend sequence.
19033 // FIXME: relax the profitability check iff all N1 uses are insertions.
19034 if (IdxVal >= NumEltsIn128 &&
19035 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19036 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19037 X86::mayFoldLoad(N1, Subtarget)))) {
19038 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19039 SmallVector<int, 8> BlendMask;
19040 for (unsigned i = 0; i != NumElts; ++i)
19041 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19042 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19043 }
19044
19045 // Get the desired 128-bit vector chunk.
19046 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19047
19048 // Insert the element into the desired chunk.
19049 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19050 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19051
19052 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19053 DAG.getVectorIdxConstant(IdxIn128, dl));
19054
19055 // Insert the changed part back into the bigger vector
19056 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19057 }
19058 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19059
19060 // This will be just movw/movd/movq/movsh/movss/movsd.
19061 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19062 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19063 EltVT == MVT::f16 || EltVT == MVT::i64) {
19064 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19065 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19066 }
19067
19068 // We can't directly insert an i8 or i16 into a vector, so zero extend
19069 // it to i32 first.
19070 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19071 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19072 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19073 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19074 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19075 return DAG.getBitcast(VT, N1);
19076 }
19077 }
19078
19079 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19080 // argument. SSE4.1 is required for pinsrb.
19081 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19082 unsigned Opc;
19083 if (VT == MVT::v8i16) {
19084 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19085 Opc = X86ISD::PINSRW;
19086 } else {
19087 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19088 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19089 Opc = X86ISD::PINSRB;
19090 }
19091
19092 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19093 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19094 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19095 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19096 }
19097
19098 if (Subtarget.hasSSE41()) {
19099 if (EltVT == MVT::f32) {
19100 // Bits [7:6] of the constant are the source select. This will always be
19101 // zero here. The DAG Combiner may combine an extract_elt index into
19102 // these bits. For example (insert (extract, 3), 2) could be matched by
19103 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19104 // Bits [5:4] of the constant are the destination select. This is the
19105 // value of the incoming immediate.
19106 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19107 // combine either bitwise AND or insert of float 0.0 to set these bits.
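// E.g. "insertps $0x30, %xmm1, %xmm0" copies element 0 of xmm1 (bits [7:6]=0)
// into element 3 of xmm0 (bits [5:4]=3) with an empty zero mask (bits [3:0]=0);
// illustrative encoding only.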
19108
19109 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19110 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19111 // If this is an insertion of 32-bits into the low 32-bits of
19112 // a vector, we prefer to generate a blend with immediate rather
19113 // than an insertps. Blends are simpler operations in hardware and so
19114 // will always have equal or better performance than insertps.
19115 // But if optimizing for size and there's a load folding opportunity,
19116 // generate insertps because blendps does not have a 32-bit memory
19117 // operand form.
19118 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19119 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19120 DAG.getTargetConstant(1, dl, MVT::i8));
19121 }
19122 // Create this as a scalar to vector.
19123 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19124 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19125 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19126 }
19127
19128 // PINSR* works with constant index.
19129 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19130 return Op;
19131 }
19132
19133 return SDValue();
19134}
19135
19137 SelectionDAG &DAG) {
19138 SDLoc dl(Op);
19139 MVT OpVT = Op.getSimpleValueType();
19140
19141 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19142 // further combines.
19143 if (X86::isZeroNode(Op.getOperand(0)))
19144 return getZeroVector(OpVT, Subtarget, DAG, dl);
19145
19146 // If this is a 256-bit vector result, first insert into a 128-bit
19147 // vector and then insert into the 256-bit vector.
19148 if (!OpVT.is128BitVector()) {
19149 // Insert into a 128-bit vector.
19150 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19151 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19152 OpVT.getVectorNumElements() / SizeFactor);
19153
19154 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19155
19156 // Insert the 128-bit vector.
19157 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19158 }
19159 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19160 "Expected an SSE type!");
19161
19162 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19163 // tblgen.
19164 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19165 return Op;
19166
19167 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19168 return DAG.getBitcast(
19169 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19170}
19171
19172// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19173// simple superregister reference or explicit instructions to insert
19174// the upper bits of a vector.
19176 SelectionDAG &DAG) {
19177 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19178
19179 return insert1BitVector(Op, DAG, Subtarget);
19180}
19181
19183 SelectionDAG &DAG) {
19184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19185 "Only vXi1 extract_subvectors need custom lowering");
19186
19187 SDLoc dl(Op);
19188 SDValue Vec = Op.getOperand(0);
19189 uint64_t IdxVal = Op.getConstantOperandVal(1);
19190
19191 if (IdxVal == 0) // the operation is legal
19192 return Op;
19193
19194 // Extend to natively supported kshift.
19195 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19196
19197 // Shift to the LSB.
19198 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19199 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19200
19201 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19202 DAG.getVectorIdxConstant(0, dl));
19203}
19204
19205// Returns the appropriate wrapper opcode for a global reference.
19206unsigned X86TargetLowering::getGlobalWrapperKind(
19207 const GlobalValue *GV, const unsigned char OpFlags) const {
19208 // References to absolute symbols are never PC-relative.
19209 if (GV && GV->isAbsoluteSymbolRef())
19210 return X86ISD::Wrapper;
19211
19212 // The following OpFlags under RIP-rel PIC use RIP.
19213 if (Subtarget.isPICStyleRIPRel() &&
19214 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19215 OpFlags == X86II::MO_DLLIMPORT))
19216 return X86ISD::WrapperRIP;
19217
19218 // GOTPCREL references must always use RIP.
19219 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19220 return X86ISD::WrapperRIP;
19221
19222 return X86ISD::Wrapper;
19223}
19224
19225// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19226// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19227 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19228 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19229 // be used to form addressing modes. These wrapped nodes will be selected
19230// into MOV32ri.
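// E.g. Wrapper(TargetGlobalAddress @x) can be selected as "movl $x, %eax"
// (MOV32ri) or folded into an addressing mode (illustrative, non-PIC
// 32-bit case).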
19231SDValue
19232X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19233 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19234
19235 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19236 // global base reg.
19237 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19238
19239 auto PtrVT = getPointerTy(DAG.getDataLayout());
19241 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19242 SDLoc DL(CP);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245 // With PIC, the address is actually $g + Offset.
19246 if (OpFlag) {
19247 Result =
19248 DAG.getNode(ISD::ADD, DL, PtrVT,
19249 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19250 }
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19256 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19257
19258 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19259 // global base reg.
19260 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19261
19262 EVT PtrVT = Op.getValueType();
19263 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19264 SDLoc DL(JT);
19265 Result =
19266 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19267
19268 // With PIC, the address is actually $g + Offset.
19269 if (OpFlag)
19270 Result =
19271 DAG.getNode(ISD::ADD, DL, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19273
19274 return Result;
19275}
19276
19277SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19278 SelectionDAG &DAG) const {
19279 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19280}
19281
19282SDValue
19283X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19284 // Create the TargetBlockAddressAddress node.
19285 unsigned char OpFlags =
19286 Subtarget.classifyBlockAddressReference();
19287 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19288 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19289 SDLoc dl(Op);
19290 EVT PtrVT = Op.getValueType();
19291 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19292 Result =
19293 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19294
19295 // With PIC, the address is actually $g + Offset.
19296 if (isGlobalRelativeToPICBase(OpFlags)) {
19297 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19298 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19299 }
19300
19301 return Result;
19302}
19303
19304/// Creates target global address or external symbol nodes for calls or
19305/// other uses.
19306SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19307 bool ForCall,
19308 bool *IsImpCall) const {
19309 // Unpack the global address or external symbol.
19310 SDLoc dl(Op);
19311 const GlobalValue *GV = nullptr;
19312 int64_t Offset = 0;
19313 const char *ExternalSym = nullptr;
19314 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19315 GV = G->getGlobal();
19316 Offset = G->getOffset();
19317 } else {
19318 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19319 ExternalSym = ES->getSymbol();
19320 }
19321
19322 // Calculate some flags for address lowering.
19323 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19324 unsigned char OpFlags;
19325 if (ForCall)
19326 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19327 else
19328 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19329 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19330 bool NeedsLoad = isGlobalStubReference(OpFlags);
19331
19333 EVT PtrVT = Op.getValueType();
19335
19336 if (GV) {
19337 // Create a target global address if this is a global. If possible, fold the
19338 // offset into the global address reference. Otherwise, ADD it on later.
19339 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19340 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19341 // relocation will compute to a negative value, which is invalid.
19342 int64_t GlobalOffset = 0;
19343 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19345 std::swap(GlobalOffset, Offset);
19346 }
19347 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19348 } else {
19349 // If this is not a global address, this must be an external symbol.
19350 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19351 }
19352
19353 // If this is a direct call, avoid the wrapper if we don't need to do any
19354 // loads or adds. This allows SDAG ISel to match direct calls.
19355 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19356 return Result;
19357
19358 // If Import Call Optimization is enabled and this is an imported function
19359 // then make a note of it and return the global address without wrapping.
19360 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19361 Mod.getModuleFlag("import-call-optimization")) {
19362 assert(ForCall && "Should only enable import call optimization if we are "
19363 "lowering a call");
19364 *IsImpCall = true;
19365 return Result;
19366 }
19367
19368 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19369
19370 // With PIC, the address is actually $g + Offset.
19371 if (HasPICReg) {
19372 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19373 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19374 }
19375
19376 // For globals that require a load from a stub to get the address, emit the
19377 // load.
19378 if (NeedsLoad)
19379 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19381
19382 // If there was a non-zero offset that we didn't fold, create an explicit
19383 // addition for it.
19384 if (Offset != 0)
19385 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19386 DAG.getSignedConstant(Offset, dl, PtrVT));
19387
19388 return Result;
19389}
19390
19391SDValue
19392X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19393 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19394}
19395
19396 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19397 const EVT PtrVT, unsigned ReturnReg,
19398 unsigned char OperandFlags,
19399 bool LoadGlobalBaseReg = false,
19400 bool LocalDynamic = false) {
19401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19402 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19403 SDLoc dl(GA);
19404 SDValue TGA;
19405 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19406 SDValue Chain = DAG.getEntryNode();
19407 SDValue Ret;
19408 if (LocalDynamic && UseTLSDESC) {
19409 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19410 // Reuse existing GetTLSADDR node if we can find it.
19411 if (TGA->hasOneUse()) {
19412 // TLSDESC uses TGA.
19413 SDNode *TLSDescOp = *TGA->user_begin();
19414 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19415 "Unexpected TLSDESC DAG");
19416 // CALLSEQ_END uses TGA via a chain and glue.
19417 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19418 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19419 "Unexpected TLSDESC DAG");
19420 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19421 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19422 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19423 "Unexpected TLSDESC DAG");
19424 Ret = SDValue(CopyFromRegOp, 0);
19425 }
19426 } else {
19427 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19428 GA->getOffset(), OperandFlags);
19429 }
19430
19431 if (!Ret) {
19432 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19433 : LocalDynamic ? X86ISD::TLSBASEADDR
19435
19436 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19437 if (LoadGlobalBaseReg) {
19438 SDValue InGlue;
19439 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19440 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19441 InGlue);
19442 InGlue = Chain.getValue(1);
19443 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19444 } else {
19445 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19446 }
19447 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19448
19449 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19450 MFI.setHasCalls(true);
19451
19452 SDValue Glue = Chain.getValue(1);
19453 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19454 }
19455
19456 if (!UseTLSDESC)
19457 return Ret;
19458
19459 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19460 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19461
19463 SDValue Offset =
19464 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19466 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19467}
19468
19469// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
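// The emitted 32-bit sequence is roughly (illustrative):
//   leal x@tlsgd(,%ebx,1), %eax
//   call ___tls_get_addr@PLT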
19470static SDValue
19471 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19472 const EVT PtrVT) {
19473 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19474 /*LoadGlobalBaseReg=*/true);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19478static SDValue
19479 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19480 const EVT PtrVT) {
19481 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19482}
19483
19484// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19485static SDValue
19486 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19487 const EVT PtrVT) {
19488 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19489}
19490
19491 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19492 SelectionDAG &DAG, const EVT PtrVT,
19493 bool Is64Bit, bool Is64BitLP64) {
19494 SDLoc dl(GA);
19495
19496 // Get the start address of the TLS block for this module.
19500
19501 SDValue Base;
19502 if (Is64Bit) {
19503 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19504 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19505 /*LoadGlobalBaseReg=*/false,
19506 /*LocalDynamic=*/true);
19507 } else {
19508 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19509 /*LoadGlobalBaseReg=*/true,
19510 /*LocalDynamic=*/true);
19511 }
19512
19513 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19514 // of Base.
19515
19516 // Build x@dtpoff.
19517 unsigned char OperandFlags = X86II::MO_DTPOFF;
19518 unsigned WrapperKind = X86ISD::Wrapper;
19519 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19520 GA->getValueType(0),
19521 GA->getOffset(), OperandFlags);
19522 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19523
19524 // Add x@dtpoff with the base.
19525 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19526}
19527
19528// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19529 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19530 const EVT PtrVT, TLSModel::Model model,
19531 bool is64Bit, bool isPIC) {
19532 SDLoc dl(GA);
19533
19534 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19537
19538 SDValue ThreadPointer =
19539 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19541
19542 unsigned char OperandFlags = 0;
19543 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19544 // initialexec.
19545 unsigned WrapperKind = X86ISD::Wrapper;
19546 if (model == TLSModel::LocalExec) {
19547 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19548 } else if (model == TLSModel::InitialExec) {
19549 if (is64Bit) {
19550 OperandFlags = X86II::MO_GOTTPOFF;
19551 WrapperKind = X86ISD::WrapperRIP;
19552 } else {
19553 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19554 }
19555 } else {
19556 llvm_unreachable("Unexpected model");
19557 }
19558
19559 // emit "addl x@ntpoff,%eax" (local exec)
19560 // or "addl x@indntpoff,%eax" (initial exec)
19561 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19562 SDValue TGA =
19563 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19564 GA->getOffset(), OperandFlags);
19565 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19566
19567 if (model == TLSModel::InitialExec) {
19568 if (isPIC && !is64Bit) {
19569 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19570 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19571 Offset);
19572 }
19573
19574 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19576 }
19577
19578 // The address of the thread local variable is the add of the thread
19579 // pointer with the offset of the variable.
19580 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19581}
19582
19583SDValue
19584X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19585
19586 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19587
19588 if (DAG.getTarget().useEmulatedTLS())
19589 return LowerToTLSEmulatedModel(GA, DAG);
19590
19591 const GlobalValue *GV = GA->getGlobal();
19592 EVT PtrVT = Op.getValueType();
19593 bool PositionIndependent = isPositionIndependent();
19594
19595 if (Subtarget.isTargetELF()) {
19596 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19597 switch (model) {
19599 if (Subtarget.is64Bit()) {
19600 if (Subtarget.isTarget64BitLP64())
19601 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19602 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19603 }
19604 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19606 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19607 Subtarget.isTarget64BitLP64());
19610 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19611 PositionIndependent);
19612 }
19613 llvm_unreachable("Unknown TLS model.");
19614 }
19615
19616 if (Subtarget.isTargetDarwin()) {
19617 // Darwin only has one model of TLS. Lower to that.
19618 unsigned char OpFlag = 0;
19619 unsigned WrapperKind = 0;
19620
19621 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19622 // global base reg.
19623 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19624 if (PIC32) {
19625 OpFlag = X86II::MO_TLVP_PIC_BASE;
19626 WrapperKind = X86ISD::Wrapper;
19627 } else {
19628 OpFlag = X86II::MO_TLVP;
19629 WrapperKind = X86ISD::WrapperRIP;
19630 }
19631 SDLoc DL(Op);
19633 GA->getValueType(0),
19634 GA->getOffset(), OpFlag);
19635 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19636
19637 // With PIC32, the address is actually $g + Offset.
19638 if (PIC32)
19639 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19640 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19641 Offset);
19642
19643 // Lowering the machine isd will make sure everything is in the right
19644 // location.
19645 SDValue Chain = DAG.getEntryNode();
19646 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19648 SDValue Args[] = { Chain, Offset };
19649 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19650 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19651
19652 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19653 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19654 MFI.setAdjustsStack(true);
19655
19656 // And our return value (tls address) is in the standard call return value
19657 // location.
19658 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19659 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19660 }
19661
19662 if (Subtarget.isOSWindows()) {
19663 // Just use the implicit TLS architecture
19664 // Need to generate something similar to:
19665 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19666 // ; from TEB
19667 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19668 // mov rcx, qword [rdx+rcx*8]
19669 // mov eax, .tls$:tlsvar
19670 // [rax+rcx] contains the address
19671 // Windows 64bit: gs:0x58
19672 // Windows 32bit: fs:__tls_array
19673
19674 SDLoc dl(GA);
19675 SDValue Chain = DAG.getEntryNode();
19676
19677 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19678 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19679 // use its literal value of 0x2C.
19681 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19683
19684 SDValue TlsArray = Subtarget.is64Bit()
19685 ? DAG.getIntPtrConstant(0x58, dl)
19686 : (Subtarget.isTargetWindowsGNU()
19687 ? DAG.getIntPtrConstant(0x2C, dl)
19688 : DAG.getExternalSymbol("_tls_array", PtrVT));
19689
19691 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19692
19693 SDValue res;
19695 res = ThreadPointer;
19696 } else {
19697 // Load the _tls_index variable
19698 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19699 if (Subtarget.is64Bit())
19700 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19701 MachinePointerInfo(), MVT::i32);
19702 else
19703 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19704
19705 const DataLayout &DL = DAG.getDataLayout();
19706 SDValue Scale =
19707 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19708 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19709
19710 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19711 }
19712
19713 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19714
19715 // Get the offset of start of .tls section
19716 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19717 GA->getValueType(0),
19719 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19720
19721 // The address of the thread local variable is the add of the thread
19722 // pointer with the offset of the variable.
19723 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19724 }
19725
19726 llvm_unreachable("TLS not implemented for this target.");
19727}
19728
19730 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19731 const TargetMachine &TM = getTargetMachine();
19732 TLSModel::Model Model = TM.getTLSModel(&GV);
19733 switch (Model) {
19736 // We can include the %fs segment register in addressing modes.
19737 return true;
19740 // These models do not result in %fs relative addresses unless
19741 // TLS descriptors are used.
19742 //
19743 // Even in the case of TLS descriptors we currently have no way to model
19744 // the difference between %fs access and the computations needed for the
19745 // offset; returning `true` for TLS-desc currently duplicates both,
19746 // which is detrimental :-/
19747 return false;
19748 }
19749 }
19750 return false;
19751}
19752
19753/// Lower SRA_PARTS and friends, which return two i32 values
19754/// and take a 2 x i32 value to shift plus a shift amount.
19755/// TODO: Can this be moved to general expansion code?
19757 SDValue Lo, Hi;
19758 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19759 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19760}
19761
19762// Try to use a packed vector operation to handle i64 on 32-bit targets when
19763// AVX512DQ is enabled.
19765 SelectionDAG &DAG,
19766 const X86Subtarget &Subtarget) {
19767 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19769 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19770 Op.getOpcode() == ISD::UINT_TO_FP) &&
19771 "Unexpected opcode!");
19772 bool IsStrict = Op->isStrictFPOpcode();
19773 unsigned OpNo = IsStrict ? 1 : 0;
19774 SDValue Src = Op.getOperand(OpNo);
19775 MVT SrcVT = Src.getSimpleValueType();
19776 MVT VT = Op.getSimpleValueType();
19777
19778 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19779 (VT != MVT::f32 && VT != MVT::f64))
19780 return SDValue();
19781
19782 // Pack the i64 into a vector, do the operation and extract.
19783
19784 // Using 256-bit to ensure result is 128-bits for f32 case.
19785 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19786 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19787 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19788
19789 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19790 if (IsStrict) {
19791 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19792 {Op.getOperand(0), InVec});
19793 SDValue Chain = CvtVec.getValue(1);
19794 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19795 DAG.getVectorIdxConstant(0, dl));
19796 return DAG.getMergeValues({Value, Chain}, dl);
19797 }
19798
19799 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19800
19801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19802 DAG.getVectorIdxConstant(0, dl));
19803}
19804
19805// Try to use a packed vector operation to handle i64 on 32-bit targets.
19807 const X86Subtarget &Subtarget) {
19808 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19810 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19811 Op.getOpcode() == ISD::UINT_TO_FP) &&
19812 "Unexpected opcode!");
19813 bool IsStrict = Op->isStrictFPOpcode();
19814 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19815 MVT SrcVT = Src.getSimpleValueType();
19816 MVT VT = Op.getSimpleValueType();
19817
19818 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19819 return SDValue();
19820
19821 // Pack the i64 into a vector, do the operation and extract.
19822
19823 assert(Subtarget.hasFP16() && "Expected FP16");
19824
19825 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19826 if (IsStrict) {
19827 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19828 {Op.getOperand(0), InVec});
19829 SDValue Chain = CvtVec.getValue(1);
19830 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19831 DAG.getVectorIdxConstant(0, dl));
19832 return DAG.getMergeValues({Value, Chain}, dl);
19833 }
19834
19835 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19836
19837 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19838 DAG.getVectorIdxConstant(0, dl));
19839}
19840
19841static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19842 const X86Subtarget &Subtarget) {
19843 switch (Opcode) {
19844 case ISD::SINT_TO_FP:
19845 // TODO: Handle wider types with AVX/AVX512.
19846 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19847 return false;
19848 // CVTDQ2PS or (V)CVTDQ2PD
19849 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19850
19851 case ISD::UINT_TO_FP:
19852 // TODO: Handle wider types and i64 elements.
19853 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19854 return false;
19855 // VCVTUDQ2PS or VCVTUDQ2PD
19856 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19857
19858 default:
19859 return false;
19860 }
19861}
19862
19863/// Given a scalar cast operation that is extracted from a vector, try to
19864/// vectorize the cast op followed by extraction. This will avoid an expensive
19865/// round-trip between XMM and GPR.
19867 SelectionDAG &DAG,
19868 const X86Subtarget &Subtarget) {
19869 // TODO: This could be enhanced to handle smaller integer types by peeking
19870 // through an extend.
19871 SDValue Extract = Cast.getOperand(0);
19872 MVT DestVT = Cast.getSimpleValueType();
19873 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19874 !isa<ConstantSDNode>(Extract.getOperand(1)))
19875 return SDValue();
19876
19877 // See if we have a 128-bit vector cast op for this type of cast.
19878 SDValue VecOp = Extract.getOperand(0);
19879 MVT FromVT = VecOp.getSimpleValueType();
19880 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19881 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19882 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19883 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19884 return SDValue();
19885
19886 // If we are extracting from a non-zero element, first shuffle the source
19887 // vector to allow extracting from element zero.
19888 if (!isNullConstant(Extract.getOperand(1))) {
19889 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19890 Mask[0] = Extract.getConstantOperandVal(1);
19891 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19892 }
19893 // If the source vector is wider than 128-bits, extract the low part. Do not
19894 // create an unnecessarily wide vector cast op.
19895 if (FromVT != Vec128VT)
19896 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19897
19898 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19899 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
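 // Illustrative example: with SSE2, 'sitofp(extractelement <4 x i32> %v, 2)'
 // to float becomes a shuffle of lane 2 into lane 0, a single CVTDQ2PS on the
 // 128-bit vector, and an extract of lane 0, instead of a scalar extract to a
 // GPR followed by a scalar CVTSI2SS back into an XMM register.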
19900 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19902 DAG.getVectorIdxConstant(0, DL));
19903}
19904
19905/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19906/// try to vectorize the cast ops. This will avoid an expensive round-trip
19907/// between XMM and GPR.
19908static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19909 SelectionDAG &DAG,
19910 const X86Subtarget &Subtarget) {
19911 // TODO: Allow FP_TO_UINT.
19912 SDValue CastToInt = CastToFP.getOperand(0);
19913 MVT VT = CastToFP.getSimpleValueType();
19914 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19915 return SDValue();
19916
19917 MVT IntVT = CastToInt.getSimpleValueType();
19918 SDValue X = CastToInt.getOperand(0);
19919 MVT SrcVT = X.getSimpleValueType();
19920 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19921 return SDValue();
19922
19923 // See if we have 128-bit vector cast instructions for this type of cast.
19924 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19925 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19926 IntVT != MVT::i32)
19927 return SDValue();
19928
19929 unsigned SrcSize = SrcVT.getSizeInBits();
19930 unsigned IntSize = IntVT.getSizeInBits();
19931 unsigned VTSize = VT.getSizeInBits();
19932 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19933 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19934 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19935
19936 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19937 unsigned ToIntOpcode =
19938 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19939 unsigned ToFPOpcode =
19940 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19941
19942 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19943 //
19944 // We are not defining the high elements (for example, by zeroing them) because
19945 // that could nullify any performance advantage that we hoped to gain from
19946 // this vector op hack. We do not expect any adverse effects (like denorm
19947 // penalties) with cast ops.
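 // For example (illustrative): 'float y = (float)(int)x' with x already in an
 // XMM register becomes CVTTPS2DQ followed by CVTDQ2PS on lane 0 of a
 // v4f32/v4i32 pair, rather than CVTTSS2SI + CVTSI2SS through a GPR.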
19948 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19949 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19950 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19951 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19953}
19954
19955static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19956 SelectionDAG &DAG,
19957 const X86Subtarget &Subtarget) {
19958 bool IsStrict = Op->isStrictFPOpcode();
19959 MVT VT = Op->getSimpleValueType(0);
19960 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19961
19962 if (Subtarget.hasDQI()) {
19963 assert(!Subtarget.hasVLX() && "Unexpected features");
19964
19965 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19966 Src.getSimpleValueType() == MVT::v4i64) &&
19967 "Unsupported custom type");
19968
19969 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
19970 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19971 "Unexpected VT!");
19972 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19973
19974 // Need to concat with zero vector for strict fp to avoid spurious
19975 // exceptions.
19976 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19977 : DAG.getUNDEF(MVT::v8i64);
19978 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19979 DAG.getVectorIdxConstant(0, DL));
19980 SDValue Res, Chain;
19981 if (IsStrict) {
19982 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19983 {Op->getOperand(0), Src});
19984 Chain = Res.getValue(1);
19985 } else {
19986 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19987 }
19988
19989 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19990 DAG.getVectorIdxConstant(0, DL));
19991
19992 if (IsStrict)
19993 return DAG.getMergeValues({Res, Chain}, DL);
19994 return Res;
19995 }
19996
19997 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19998 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19999 if (VT != MVT::v4f32 || IsSigned)
20000 return SDValue();
20001
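 // For elements with the sign bit set we cannot use a signed conversion
 // directly. Instead convert Src/2 rounded to odd ((Src >> 1) | (Src & 1)) as
 // a signed value and double the result with an FADD; keeping the low bit
 // preserves correct rounding to f32. Non-negative elements take the plain
 // signed conversion selected below.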
20002 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20003 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20004 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20005 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20006 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20007 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20008 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20009 SmallVector<SDValue, 4> SignCvts(4);
20010 SmallVector<SDValue, 4> Chains(4);
20011 for (int i = 0; i != 4; ++i) {
20012 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20013 DAG.getVectorIdxConstant(i, DL));
20014 if (IsStrict) {
20015 SignCvts[i] =
20016 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20017 {Op.getOperand(0), Elt});
20018 Chains[i] = SignCvts[i].getValue(1);
20019 } else {
20020 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20021 }
20022 }
20023 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20024
20025 SDValue Slow, Chain;
20026 if (IsStrict) {
20027 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20028 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20029 {Chain, SignCvt, SignCvt});
20030 Chain = Slow.getValue(1);
20031 } else {
20032 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20033 }
20034
20035 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20036 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20037
20038 if (IsStrict)
20039 return DAG.getMergeValues({Cvt, Chain}, DL);
20040
20041 return Cvt;
20042}
20043
20044static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20045 SelectionDAG &DAG) {
20046 bool IsStrict = Op->isStrictFPOpcode();
20047 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20048 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20049 MVT VT = Op.getSimpleValueType();
20050 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
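 // (The conversion below is performed in the f32-promoted type NVT; the
 // result is then rounded back down to the original soft-f16 type.)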
20051
20052 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20053 if (IsStrict)
20054 return DAG.getNode(
20055 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20056 {Chain,
20057 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20058 Rnd});
20059 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20060 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20061}
20062
20063static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20064 const X86Subtarget &Subtarget) {
20065 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20066 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20067 return true;
20068 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20069 return true;
20070 }
20071 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20072 return true;
20073 if (Subtarget.useAVX512Regs()) {
20074 if (VT == MVT::v16i32)
20075 return true;
20076 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20077 return true;
20078 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20079 return true;
20080 }
20081 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20082 (VT == MVT::v2i64 || VT == MVT::v4i64))
20083 return true;
20084 return false;
20085}
20086
20087SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20088 SelectionDAG &DAG) const {
20089 bool IsStrict = Op->isStrictFPOpcode();
20090 unsigned OpNo = IsStrict ? 1 : 0;
20091 SDValue Src = Op.getOperand(OpNo);
20092 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20093 MVT SrcVT = Src.getSimpleValueType();
20094 MVT VT = Op.getSimpleValueType();
20095 SDLoc dl(Op);
20096
20097 if (isSoftF16(VT, Subtarget))
20098 return promoteXINT_TO_FP(Op, dl, DAG);
20099 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20100 return Op;
20101
20102 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20103 return LowerWin64_INT128_TO_FP(Op, DAG);
20104
20105 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20106 return Extract;
20107
20108 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20109 return R;
20110
20111 if (SrcVT.isVector()) {
20112 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20113 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20114 // source for strict FP.
20115 if (IsStrict)
20116 return DAG.getNode(
20117 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20118 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20119 DAG.getUNDEF(SrcVT))});
20120 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20121 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20122 DAG.getUNDEF(SrcVT)));
20123 }
20124 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20125 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20126
20127 return SDValue();
20128 }
20129
20130 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20131 "Unknown SINT_TO_FP to lower!");
20132
20133 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20134
20135 // These are really Legal; return the operand so the caller accepts it as
20136 // Legal.
20137 if (SrcVT == MVT::i32 && UseSSEReg)
20138 return Op;
20139 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20140 return Op;
20141
20142 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20143 return V;
20144 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20145 return V;
20146
20147 // SSE doesn't have an i16 conversion so we need to promote.
20148 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20149 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20150 if (IsStrict)
20151 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20152 {Chain, Ext});
20153
20154 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20155 }
20156
20157 if (VT == MVT::f128 || !Subtarget.hasX87())
20158 return SDValue();
20159
20160 SDValue ValueToStore = Src;
20161 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20162 // Bitcasting to f64 here allows us to do a single 64-bit store from
20163 // an SSE register, avoiding the store forwarding penalty that would come
20164 // with two 32-bit stores.
20165 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20166
20167 unsigned Size = SrcVT.getStoreSize();
20168 Align Alignment(Size);
20169 MachineFunction &MF = DAG.getMachineFunction();
20170 auto PtrVT = getPointerTy(MF.getDataLayout());
20171 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20172 MachinePointerInfo MPI =
20173 MachinePointerInfo::getFixedStack(MF, SSFI);
20174 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20175 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20176 std::pair<SDValue, SDValue> Tmp =
20177 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20178
20179 if (IsStrict)
20180 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20181
20182 return Tmp.first;
20183}
20184
20185std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20186 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20187 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20188 // Build the FILD
20189 SDVTList Tys;
20190 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20191 if (useSSE)
20192 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20193 else
20194 Tys = DAG.getVTList(DstVT, MVT::Other);
20195
20196 SDValue FILDOps[] = {Chain, Pointer};
20197 SDValue Result =
20198 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20199 Alignment, MachineMemOperand::MOLoad);
20200 Chain = Result.getValue(1);
20201
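 // If the destination is an SSE type, the x87 FILD result (kept as f80 above)
 // cannot be moved to an XMM register directly; spill it to a stack slot as
 // DstVT with an FST and reload it from there.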
20202 if (useSSE) {
20203 MachineFunction &MF = DAG.getMachineFunction();
20204 unsigned SSFISize = DstVT.getStoreSize();
20205 int SSFI =
20206 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20207 auto PtrVT = getPointerTy(MF.getDataLayout());
20208 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20209 Tys = DAG.getVTList(MVT::Other);
20210 SDValue FSTOps[] = {Chain, Result, StackSlot};
20211 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20212 MachinePointerInfo::getFixedStack(MF, SSFI),
20213 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20214
20215 Chain =
20216 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20217 Result = DAG.getLoad(
20218 DstVT, DL, Chain, StackSlot,
20219 MachinePointerInfo::getFixedStack(MF, SSFI));
20220 Chain = Result.getValue(1);
20221 }
20222
20223 return { Result, Chain };
20224}
20225
20226/// Horizontal vector math instructions may be slower than normal math with
20227/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20228/// implementation, and likely shuffle complexity of the alternate sequence.
20229static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 bool IsOptimizingSize = DAG.shouldOptForSize();
20232 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20233 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20234}
20235
20236/// 64-bit unsigned integer to double expansion.
20237static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20238 SelectionDAG &DAG,
20239 const X86Subtarget &Subtarget) {
20240 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20241 // when converting 0 when rounding toward negative infinity. Caller will
20242 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20243 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20244 // This algorithm is not obvious. Here is what we're trying to output:
20245 /*
20246 movq %rax, %xmm0
20247 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20248 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20249 #ifdef __SSE3__
20250 haddpd %xmm0, %xmm0
20251 #else
20252 pshufd $0x4e, %xmm0, %xmm1
20253 addpd %xmm1, %xmm0
20254 #endif
20255 */
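 //
 // A sketch of why the constants work: 0x4330000000000000 is the double 2^52
 // and 0x4530000000000000 is 2^84. After the PUNPCKLDQ, the two doubles are
 // (2^52 + lo32(x)) and (2^84 + hi32(x) * 2^32), since each 32-bit half of x
 // lands in the low mantissa bits of the matching constant. Subtracting
 // { 2^52, 2^84 } leaves { lo32(x), hi32(x) * 2^32 }, and the final
 // (horizontal) add produces lo32(x) + hi32(x) * 2^32 == (double)x.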
20256
20257 LLVMContext *Context = DAG.getContext();
20258
20259 // Build some magic constants.
20260 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20261 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20262 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20263 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20264
20265 SmallVector<Constant*,2> CV1;
20266 CV1.push_back(
20267 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20268 APInt(64, 0x4330000000000000ULL))));
20269 CV1.push_back(
20270 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20271 APInt(64, 0x4530000000000000ULL))));
20272 Constant *C1 = ConstantVector::get(CV1);
20273 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20274
20275 // Load the 64-bit value into an XMM register.
20276 SDValue XR1 =
20277 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20278 SDValue CLod0 = DAG.getLoad(
20279 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20280 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20281 SDValue Unpck1 =
20282 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20283
20284 SDValue CLod1 = DAG.getLoad(
20285 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20286 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20287 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20288 // TODO: Are there any fast-math-flags to propagate here?
20289 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20290 SDValue Result;
20291
20292 if (Subtarget.hasSSE3() &&
20293 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20294 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20295 } else {
20296 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20297 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20298 }
20299 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20300 DAG.getVectorIdxConstant(0, dl));
20301 return Result;
20302}
20303
20304/// 32-bit unsigned integer to float expansion.
20305static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20306 SelectionDAG &DAG,
20307 const X86Subtarget &Subtarget) {
20308 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20309 // FP constant to bias correct the final result.
20310 SDValue Bias = DAG.getConstantFP(
20311 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
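 // (0x4330000000000000 is the bit pattern of the double 2^52; OR-ing the
 // zero-extended i32 into its low mantissa bits below yields exactly 2^52 + x,
 // so subtracting the bias recovers x in double precision.)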
20312
20313 // Load the 32-bit value into an XMM register.
20314 SDValue Load =
20315 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20316
20317 // Zero out the upper parts of the register.
20318 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20319
20320 // Or the load with the bias.
20321 SDValue Or = DAG.getNode(
20322 ISD::OR, dl, MVT::v2i64,
20323 DAG.getBitcast(MVT::v2i64, Load),
20324 DAG.getBitcast(MVT::v2i64,
20325 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20326 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20327 DAG.getBitcast(MVT::v2f64, Or),
20328 DAG.getVectorIdxConstant(0, dl));
20329
20330 if (Op.getNode()->isStrictFPOpcode()) {
20331 // Subtract the bias.
20332 // TODO: Are there any fast-math-flags to propagate here?
20333 SDValue Chain = Op.getOperand(0);
20334 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20335 {Chain, Or, Bias});
20336
20337 if (Op.getValueType() == Sub.getValueType())
20338 return Sub;
20339
20340 // Handle final rounding.
20341 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20342 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20343
20344 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20345 }
20346
20347 // Subtract the bias.
20348 // TODO: Are there any fast-math-flags to propagate here?
20349 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20350
20351 // Handle final rounding.
20352 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20353}
20354
20355static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20356 SelectionDAG &DAG,
20357 const X86Subtarget &Subtarget) {
20358 if (Op.getSimpleValueType() != MVT::v2f64)
20359 return SDValue();
20360
20361 bool IsStrict = Op->isStrictFPOpcode();
20362
20363 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20364 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20365
20366 if (Subtarget.hasAVX512()) {
20367 if (!Subtarget.hasVLX()) {
20368 // Let generic type legalization widen this.
20369 if (!IsStrict)
20370 return SDValue();
20371 // Otherwise pad the integer input with 0s and widen the operation.
20372 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20373 DAG.getConstant(0, DL, MVT::v2i32));
20374 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20375 {Op.getOperand(0), N0});
20376 SDValue Chain = Res.getValue(1);
20377 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20378 DAG.getVectorIdxConstant(0, DL));
20379 return DAG.getMergeValues({Res, Chain}, DL);
20380 }
20381
20382 // Legalize to v4i32 type.
20383 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20384 DAG.getUNDEF(MVT::v2i32));
20385 if (IsStrict)
20386 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20387 {Op.getOperand(0), N0});
20388 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20389 }
20390
20391 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20392 // This gives us the floating point equivalent of 2^52 + the i32 integer
20393 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20394 // point leaving just our i32 integers in double format.
20395 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20396 SDValue VBias = DAG.getConstantFP(
20397 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20398 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20399 DAG.getBitcast(MVT::v2i64, VBias));
20400 Or = DAG.getBitcast(MVT::v2f64, Or);
20401
20402 if (IsStrict)
20403 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20404 {Op.getOperand(0), Or, VBias});
20405 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20406}
20407
20408static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20409 SelectionDAG &DAG,
20410 const X86Subtarget &Subtarget) {
20411 bool IsStrict = Op->isStrictFPOpcode();
20412 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20413 MVT VecIntVT = V.getSimpleValueType();
20414 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20415 "Unsupported custom type");
20416
20417 if (Subtarget.hasAVX512()) {
20418 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20419 assert(!Subtarget.hasVLX() && "Unexpected features");
20420 MVT VT = Op->getSimpleValueType(0);
20421
20422 // v8i32->v8f64 is legal with AVX512 so just return it.
20423 if (VT == MVT::v8f64)
20424 return Op;
20425
20426 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20427 VT == MVT::v8f16) &&
20428 "Unexpected VT!");
20429 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20430 MVT WideIntVT = MVT::v16i32;
20431 if (VT == MVT::v4f64) {
20432 WideVT = MVT::v8f64;
20433 WideIntVT = MVT::v8i32;
20434 }
20435
20436 // Need to concat with zero vector for strict fp to avoid spurious
20437 // exceptions.
20438 SDValue Tmp =
20439 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20440 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20441 DAG.getVectorIdxConstant(0, DL));
20442 SDValue Res, Chain;
20443 if (IsStrict) {
20444 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20445 {Op->getOperand(0), V});
20446 Chain = Res.getValue(1);
20447 } else {
20448 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20449 }
20450
20451 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20452 DAG.getVectorIdxConstant(0, DL));
20453
20454 if (IsStrict)
20455 return DAG.getMergeValues({Res, Chain}, DL);
20456 return Res;
20457 }
20458
20459 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20460 Op->getSimpleValueType(0) == MVT::v4f64) {
20461 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20462 Constant *Bias = ConstantFP::get(
20463 *DAG.getContext(),
20464 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20465 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20466 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20467 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20468 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20469 SDValue VBias = DAG.getMemIntrinsicNode(
20470 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20471 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
20472 /*Alignment*/ Align(8), MachineMemOperand::MOLoad);
20473
20474 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20475 DAG.getBitcast(MVT::v4i64, VBias));
20476 Or = DAG.getBitcast(MVT::v4f64, Or);
20477
20478 if (IsStrict)
20479 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20480 {Op.getOperand(0), Or, VBias});
20481 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20482 }
20483
20484 // The algorithm is the following:
20485 // #ifdef __SSE4_1__
20486 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20487 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20488 // (uint4) 0x53000000, 0xaa);
20489 // #else
20490 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20491 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20492 // #endif
20493 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20494 // return (float4) lo + fhi;
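 // Why the constants work (sketch): as floats, 0x4b000000 is 2^23 and
 // 0x53000000 is 2^39, so 'lo' is exactly 2^23 + (v & 0xffff) and 'hi' is
 // exactly 2^39 + (v >> 16) * 2^16 (one mantissa ulp at 2^39 is 2^16).
 // Subtracting (2^39 + 2^23) from 'hi' is exact, and the final add yields
 // (v & 0xffff) + (v >> 16) * 2^16 == (float)v with a single rounding step.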
20495
20496 bool Is128 = VecIntVT == MVT::v4i32;
20497 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20498 // If we convert to something other than the supported type, e.g., to v4f64,
20499 // abort early.
20500 if (VecFloatVT != Op->getSimpleValueType(0))
20501 return SDValue();
20502
20503 // In the #ifdef/#else code, we have in common:
20504 // - The vector of constants:
20505 // -- 0x4b000000
20506 // -- 0x53000000
20507 // - A shift:
20508 // -- v >> 16
20509
20510 // Create the splat vector for 0x4b000000.
20511 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20512 // Create the splat vector for 0x53000000.
20513 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20514
20515 // Create the right shift.
20516 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20517 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20518
20519 SDValue Low, High;
20520 if (Subtarget.hasSSE41()) {
20521 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20522 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20523 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20524 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20525 // Low will be bitcasted right away, so do not bother bitcasting back to its
20526 // original type.
20527 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20528 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20529 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20530 // (uint4) 0x53000000, 0xaa);
20531 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20532 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20533 // High will be bitcasted right away, so do not bother bitcasting back to
20534 // its original type.
20535 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20536 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20537 } else {
20538 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20539 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20540 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20541 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20542
20543 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20544 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20545 }
20546
20547 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20548 SDValue VecCstFSub = DAG.getConstantFP(
20549 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20550
20551 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20552 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20553 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20554 // enabled. See PR24512.
20555 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20556 // TODO: Are there any fast-math-flags to propagate here?
20557 // (float4) lo;
20558 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20559 // return (float4) lo + fhi;
20560 if (IsStrict) {
20561 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20562 {Op.getOperand(0), HighBitcast, VecCstFSub});
20563 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20564 {FHigh.getValue(1), LowBitcast, FHigh});
20565 }
20566
20567 SDValue FHigh =
20568 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20569 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20570}
20571
20572static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20573 const X86Subtarget &Subtarget) {
20574 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20575 SDValue N0 = Op.getOperand(OpNo);
20576 MVT SrcVT = N0.getSimpleValueType();
20577
20578 switch (SrcVT.SimpleTy) {
20579 default:
20580 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20581 case MVT::v2i32:
20582 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20583 case MVT::v4i32:
20584 case MVT::v8i32:
20585 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20586 case MVT::v2i64:
20587 case MVT::v4i64:
20588 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20589 }
20590}
20591
20592SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20593 SelectionDAG &DAG) const {
20594 bool IsStrict = Op->isStrictFPOpcode();
20595 unsigned OpNo = IsStrict ? 1 : 0;
20596 SDValue Src = Op.getOperand(OpNo);
20597 SDLoc dl(Op);
20598 auto PtrVT = getPointerTy(DAG.getDataLayout());
20599 MVT SrcVT = Src.getSimpleValueType();
20600 MVT DstVT = Op->getSimpleValueType(0);
20601 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20602
20603 // Bail out when we don't have native conversion instructions.
20604 if (DstVT == MVT::f128)
20605 return SDValue();
20606
20607 if (isSoftF16(DstVT, Subtarget))
20608 return promoteXINT_TO_FP(Op, dl, DAG);
20609 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20610 return Op;
20611
20612 if (DstVT.isVector())
20613 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20614
20615 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20616 return LowerWin64_INT128_TO_FP(Op, DAG);
20617
20618 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20619 return Extract;
20620
20621 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20622 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20623 // Conversions from unsigned i32 to f32/f64 are legal,
20624 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20625 return Op;
20626 }
20627
20628 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20629 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20630 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20631 if (IsStrict)
20632 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20633 {Chain, Src});
20634 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20635 }
20636
20637 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20638 return V;
20639 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20640 return V;
20641
20642 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20643 // infinity. It produces -0.0, so disable under strictfp.
20644 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20645 !IsStrict)
20646 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20647 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20648 // negative infinity, so disable it under strictfp and use FILD instead.
20649 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20650 !IsStrict)
20651 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20652 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20653 (DstVT == MVT::f32 || DstVT == MVT::f64))
20654 return SDValue();
20655
20656 // Make a 64-bit buffer, and use it to build an FILD.
20657 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20658 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20659 Align SlotAlign(8);
20660 MachinePointerInfo MPI =
20661 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20662 if (SrcVT == MVT::i32) {
20663 SDValue OffsetSlot =
20664 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20665 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20666 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20667 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20668 std::pair<SDValue, SDValue> Tmp =
20669 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20670 if (IsStrict)
20671 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20672
20673 return Tmp.first;
20674 }
20675
20676 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20677 SDValue ValueToStore = Src;
20678 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20679 // Bitcasting to f64 here allows us to do a single 64-bit store from
20680 // an SSE register, avoiding the store forwarding penalty that would come
20681 // with two 32-bit stores.
20682 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20683 }
20684 SDValue Store =
20685 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20686 // For i64 source, we need to add the appropriate power of 2 if the input
20687 // was negative. We must be careful to do the computation in x87 extended
20688 // precision, not in SSE.
20689 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20690 SDValue Ops[] = {Store, StackSlot};
20691 SDValue Fild =
20692 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20693 SlotAlign, MachineMemOperand::MOLoad);
20694 Chain = Fild.getValue(1);
20695
20696 // Check whether the sign bit is set.
20697 SDValue SignSet = DAG.getSetCC(
20698 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20699 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20700
20701 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
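 // 0x5F800000 is the f32 bit pattern of 2^64. FILD interpreted the stored
 // bits as a signed i64, so when the original unsigned value had its top bit
 // set the loaded result is exactly 2^64 too small; the selected fudge value
 // adds it back (and adds 0.0 otherwise).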
20702 APInt FF(64, 0x5F80000000000000ULL);
20703 SDValue FudgePtr =
20704 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20705 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20706
20707 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20708 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20709 SDValue Four = DAG.getIntPtrConstant(4, dl);
20710 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20711 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20712
20713 // Load the value out, extending it from f32 to f80.
20714 SDValue Fudge = DAG.getExtLoad(
20715 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20716 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20717 CPAlignment);
20718 Chain = Fudge.getValue(1);
20719 // Extend everything to 80 bits to force it to be done on x87.
20720 // TODO: Are there any fast-math-flags to propagate here?
20721 if (IsStrict) {
20722 unsigned Opc = ISD::STRICT_FADD;
20723 // Windows needs the precision control changed to 80bits around this add.
20724 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20725 Opc = X86ISD::STRICT_FP80_ADD;
20726
20727 SDValue Add =
20728 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20729 // STRICT_FP_ROUND can't handle equal types.
20730 if (DstVT == MVT::f80)
20731 return Add;
20732 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20733 {Add.getValue(1), Add,
20734 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20735 }
20736 unsigned Opc = ISD::FADD;
20737 // Windows needs the precision control changed to 80bits around this add.
20738 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20739 Opc = X86ISD::FP80_ADD;
20740
20741 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20742 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20743 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20744}
20745
20746// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20747// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20748// just return an SDValue().
20749// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20750// to i16, i32 or i64, and we lower it to a legal sequence and return the
20751// result.
20752SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20753 bool IsSigned,
20754 SDValue &Chain) const {
20755 bool IsStrict = Op->isStrictFPOpcode();
20756 SDLoc DL(Op);
20757
20758 EVT DstTy = Op.getValueType();
20759 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20760 EVT TheVT = Value.getValueType();
20761 auto PtrVT = getPointerTy(DAG.getDataLayout());
20762
20763 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20764 // f16 must be promoted before using the lowering in this routine.
20765 // fp128 does not use this lowering.
20766 return SDValue();
20767 }
20768
20769 // If using FIST to compute an unsigned i64, we'll need some fixup
20770 // to handle values above the maximum signed i64. A FIST is always
20771 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20772 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20773
20774 // FIXME: This does not generate an invalid exception if the input does not
20775 // fit in i32. PR44019
20776 if (!IsSigned && DstTy != MVT::i64) {
20777 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20778 // The low 32 bits of the fist result will have the correct uint32 result.
20779 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20780 DstTy = MVT::i64;
20781 }
20782
20783 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20784 DstTy.getSimpleVT() >= MVT::i16 &&
20785 "Unknown FP_TO_INT to lower!");
20786
20787 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20788 // stack slot.
20789 MachineFunction &MF = DAG.getMachineFunction();
20790 unsigned MemSize = DstTy.getStoreSize();
20791 int SSFI =
20792 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20793 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20794
20795 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20796
20797 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20798
20799 if (UnsignedFixup) {
20800 //
20801 // Conversion to unsigned i64 is implemented with a select,
20802 // depending on whether the source value fits in the range
20803 // of a signed i64. Let Thresh be the FP equivalent of
20804 // 0x8000000000000000ULL.
20805 //
20806 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20808 // FistSrc = (Value - FltOfs);
20809 // Fist-to-mem64 FistSrc
20810 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20811 // to XOR'ing the high 32 bits with Adjust.
20812 //
20813 // Being a power of 2, Thresh is exactly representable in all FP formats.
20814 // For X87 we'd like to use the smallest FP type for this constant, but
20815 // for DAG type consistency we have to match the FP operand type.
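 // Worked example (f80 input): for Value == 2^63 + 1, FltOfs selects Thresh,
 // the FIST converts Value - 2^63 == 1 and stores 1, and XOR-ing with
 // Adjust == 0x8000000000000000 restores the expected 0x8000000000000001.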
20816
20817 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20818 APFloat::opStatus Status = APFloat::opOK;
20819 bool LosesInfo = false;
20820 if (TheVT == MVT::f64)
20821 // The rounding mode is irrelevant as the conversion should be exact.
20822 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20823 &LosesInfo);
20824 else if (TheVT == MVT::f80)
20825 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20826 APFloat::rmNearestTiesToEven, &LosesInfo);
20827
20828 assert(Status == APFloat::opOK && !LosesInfo &&
20829 "FP conversion should have been exact");
20830
20831 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20832
20833 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20834 *DAG.getContext(), TheVT);
20835 SDValue Cmp;
20836 if (IsStrict) {
20837 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20838 /*IsSignaling*/ true);
20839 Chain = Cmp.getValue(1);
20840 } else {
20841 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20842 }
20843
20844 // Our preferred lowering of
20845 //
20846 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20847 //
20848 // is
20849 //
20850 // (Value >= Thresh) << 63
20851 //
20852 // but since we can get here after LegalOperations, DAGCombine might do the
20853 // wrong thing if we create a select. So, directly create the preferred
20854 // version.
20855 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20856 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20857 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20858
20859 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20860 DAG.getConstantFP(0.0, DL, TheVT));
20861
20862 if (IsStrict) {
20863 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20864 { Chain, Value, FltOfs });
20865 Chain = Value.getValue(1);
20866 } else
20867 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20868 }
20869
20870 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20871
20872 // FIXME This causes a redundant load/store if the SSE-class value is already
20873 // in memory, such as if it is on the callstack.
20874 if (isScalarFPTypeInSSEReg(TheVT)) {
20875 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20876 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20877 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20878 SDValue Ops[] = { Chain, StackSlot };
20879
20880 unsigned FLDSize = TheVT.getStoreSize();
20881 assert(FLDSize <= MemSize && "Stack slot not big enough");
20882 MachineMemOperand *MMO = MF.getMachineMemOperand(
20883 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20884 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20885 Chain = Value.getValue(1);
20886 }
20887
20888 // Build the FP_TO_INT*_IN_MEM
20889 MachineMemOperand *MMO = MF.getMachineMemOperand(
20890 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20891 SDValue Ops[] = { Chain, Value, StackSlot };
20892 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20893 DAG.getVTList(MVT::Other),
20894 Ops, DstTy, MMO);
20895
20896 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20897 Chain = Res.getValue(1);
20898
20899 // If we need an unsigned fixup, XOR the result with adjust.
20900 if (UnsignedFixup)
20901 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20902
20903 return Res;
20904}
20905
20907 const X86Subtarget &Subtarget) {
20908 MVT VT = Op.getSimpleValueType();
20909 SDValue In = Op.getOperand(0);
20910 MVT InVT = In.getSimpleValueType();
20911 unsigned Opc = Op.getOpcode();
20912
20913 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20914 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20915 "Unexpected extension opcode");
20917 "Expected same number of elements");
20918 assert((VT.getVectorElementType() == MVT::i16 ||
20919 VT.getVectorElementType() == MVT::i32 ||
20920 VT.getVectorElementType() == MVT::i64) &&
20921 "Unexpected element type");
20922 assert((InVT.getVectorElementType() == MVT::i8 ||
20923 InVT.getVectorElementType() == MVT::i16 ||
20924 InVT.getVectorElementType() == MVT::i32) &&
20925 "Unexpected element type");
20926
20927 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20928
20929 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20930 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20931 return splitVectorIntUnary(Op, DAG, dl);
20932 }
20933
20934 if (Subtarget.hasInt256())
20935 return Op;
20936
20937 // Optimize vectors in AVX mode:
20938 //
20939 // v8i16 -> v8i32
20940 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
20941 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20942 // Concat upper and lower parts.
20943 //
20944 // v4i32 -> v4i64
20945 // Use vpmovzxdq for 4 lower elements v4i32 -> v2i64.
20946 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20947 // Concat upper and lower parts.
20948 //
20949 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20950 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20951
20952 // Short-circuit if we can determine that each 128-bit half is the same value.
20953 // Otherwise, this is difficult to match and optimize.
20954 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20955 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20956 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20957
20958 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20959 SDValue Undef = DAG.getUNDEF(InVT);
20960 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20961 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20962 OpHi = DAG.getBitcast(HalfVT, OpHi);
20963
20964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20965}
20966
20967// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20968static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20969 const SDLoc &dl, SelectionDAG &DAG) {
20970 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20971 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20972 DAG.getVectorIdxConstant(0, dl));
20973 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20974 DAG.getVectorIdxConstant(8, dl));
20975 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20976 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20977 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20978 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20979}
20980
20981static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20982 const X86Subtarget &Subtarget,
20983 SelectionDAG &DAG) {
20984 MVT VT = Op->getSimpleValueType(0);
20985 SDValue In = Op->getOperand(0);
20986 MVT InVT = In.getSimpleValueType();
20987 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20988 unsigned NumElts = VT.getVectorNumElements();
20989
20990 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20991 // avoids a constant pool load.
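 // (A sign-extended i1 element is all-ones or all-zeros; the logical shift
 // right by BitWidth-1 then leaves just 1 or 0 in each element.)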
20992 if (VT.getVectorElementType() != MVT::i8) {
20993 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20994 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20995 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20996 }
20997
20998 // Extend VT if BWI is not supported.
20999 MVT ExtVT = VT;
21000 if (!Subtarget.hasBWI()) {
21001 // If v16i32 is to be avoided, we'll need to split and concatenate.
21002 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21003 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21004
21005 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21006 }
21007
21008 // Widen to 512-bits if VLX is not supported.
21009 MVT WideVT = ExtVT;
21010 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21011 NumElts *= 512 / ExtVT.getSizeInBits();
21012 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21013 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21014 DAG.getVectorIdxConstant(0, DL));
21015 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21016 }
21017
21018 SDValue One = DAG.getConstant(1, DL, WideVT);
21019 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21020
21021 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21022
21023 // Truncate if we had to extend above.
21024 if (VT != ExtVT) {
21025 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21026 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21027 }
21028
21029 // Extract back to 128/256-bit if we widened.
21030 if (WideVT != VT)
21031 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21032 DAG.getVectorIdxConstant(0, DL));
21033
21034 return SelectedVal;
21035}
21036
21037static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21038 SelectionDAG &DAG) {
21039 SDValue In = Op.getOperand(0);
21040 MVT SVT = In.getSimpleValueType();
21041 SDLoc DL(Op);
21042
21043 if (SVT.getVectorElementType() == MVT::i1)
21044 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21045
21046 assert(Subtarget.hasAVX() && "Expected AVX support");
21047 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21048}
21049
21050/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21051/// It makes use of the fact that vectors with enough leading sign/zero bits
21052/// prevent the PACKSS/PACKUS from saturating the results.
21053/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21054/// within each 128-bit lane.
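/// For example, a v8i32 -> v8i16 truncation of a value with sufficient sign
/// bits becomes a single PACKSSDW of the two 128-bit halves of the source.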
21055static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21056 const SDLoc &DL, SelectionDAG &DAG,
21057 const X86Subtarget &Subtarget) {
21058 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21059 "Unexpected PACK opcode");
21060 assert(DstVT.isVector() && "VT not a vector?");
21061
21062 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21063 if (!Subtarget.hasSSE2())
21064 return SDValue();
21065
21066 EVT SrcVT = In.getValueType();
21067
21068 // No truncation required, we might get here due to recursive calls.
21069 if (SrcVT == DstVT)
21070 return In;
21071
21072 unsigned NumElems = SrcVT.getVectorNumElements();
21073 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21074 return SDValue();
21075
21076 unsigned DstSizeInBits = DstVT.getSizeInBits();
21077 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21078 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21079 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21080
21081 LLVMContext &Ctx = *DAG.getContext();
21082 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21083 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21084
21085 // Pack to the largest type possible:
21086 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21087 EVT InVT = MVT::i16, OutVT = MVT::i8;
21088 if (SrcVT.getScalarSizeInBits() > 16 &&
21089 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21090 InVT = MVT::i32;
21091 OutVT = MVT::i16;
21092 }
21093
21094 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21095 // On pre-AVX512, pack the src in both halves to help value tracking.
21096 if (SrcSizeInBits <= 128) {
21097 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21098 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21099 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21100 SDValue LHS = DAG.getBitcast(InVT, In);
21101 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21103 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21104 Res = DAG.getBitcast(PackedVT, Res);
21105 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21106 }
21107
21108 // Split lower/upper subvectors.
21109 SDValue Lo, Hi;
21110 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21111
21112 // If Hi is undef, then don't bother packing it and widen the result instead.
21113 if (Hi.isUndef()) {
21114 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21115 if (SDValue Res =
21116 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21117 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21118 }
21119
21120 unsigned SubSizeInBits = SrcSizeInBits / 2;
21121 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21122 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21123
21124 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21125 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21126 Lo = DAG.getBitcast(InVT, Lo);
21127 Hi = DAG.getBitcast(InVT, Hi);
21128 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21129 return DAG.getBitcast(DstVT, Res);
21130 }
21131
21132 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21133 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21134 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21135 Lo = DAG.getBitcast(InVT, Lo);
21136 Hi = DAG.getBitcast(InVT, Hi);
21137 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21138
21139 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21140 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21141 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21142 SmallVector<int, 64> Mask;
21143 int Scale = 64 / OutVT.getScalarSizeInBits();
21144 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21145 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21146
21147 if (DstVT.is256BitVector())
21148 return DAG.getBitcast(DstVT, Res);
21149
21150 // If 512bit -> 128bit truncate another stage.
21151 Res = DAG.getBitcast(PackedVT, Res);
21152 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21153 }
21154
21155 // Recursively pack lower/upper subvectors, concat result and pack again.
21156 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21157
21158 if (PackedVT.is128BitVector()) {
21159 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21160 // type legalization.
21161 SDValue Res =
21162 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21163 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21164 }
21165
21166 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21167 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21168 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21169 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21170 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21171}
21172
21173/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21174/// e.g. trunc <8 x i32> X to <8 x i16> -->
21175/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21176/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21177static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21178 const X86Subtarget &Subtarget,
21179 SelectionDAG &DAG) {
21180 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21181 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21182}
21183
21184/// Truncate using inreg sign extension and X86ISD::PACKSS.
21185static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21186 const X86Subtarget &Subtarget,
21187 SelectionDAG &DAG) {
21188 EVT SrcVT = In.getValueType();
21189 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21190 DAG.getValueType(DstVT));
21191 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21192}
21193
21194/// Helper to determine if \p In truncated to \p DstVT has the necessary
21195/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21196/// possibly by converting a SRL node to SRA for sign extension.
21197static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21198 SDValue In, const SDLoc &DL,
21199 SelectionDAG &DAG,
21200 const X86Subtarget &Subtarget,
21201 const SDNodeFlags Flags = SDNodeFlags()) {
21202 // Requires SSE2.
21203 if (!Subtarget.hasSSE2())
21204 return SDValue();
21205
21206 EVT SrcVT = In.getValueType();
21207 EVT DstSVT = DstVT.getVectorElementType();
21208 EVT SrcSVT = SrcVT.getVectorElementType();
21209 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21210 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21211
21212 // Check we have a truncation suited for PACKSS/PACKUS.
21213 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21214 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21215 return SDValue();
21216
21217 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21218 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21219
21220 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21221 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21222 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21223 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21224 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21225 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21226 return SDValue();
21227
21228 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21229 // split this for packing.
21230 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21231 !isFreeToSplitVector(In, DAG) &&
21232 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21233 return SDValue();
21234
21235 // Don't lower the truncation as multiple PACK node stages on AVX512 targets.
21236 if (Subtarget.hasAVX512() && NumStages > 1)
21237 return SDValue();
21238
21239 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21240 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21241
21242 // Truncate with PACKUS if we are truncating a vector with leading zero
21243 // bits that extend all the way to the packed/truncated value.
21244 // e.g. Masks, zext_in_reg, etc.
21245 // Pre-SSE41 we can only use PACKUSWB.
21246 KnownBits Known = DAG.computeKnownBits(In);
21247 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21248 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21249 PackOpcode = X86ISD::PACKUS;
21250 return In;
21251 }
21252
21253 // Truncate with PACKSS if we are truncating a vector with sign-bits
21254 // that extend all the way to the packed/truncated value.
21255 // e.g. Comparison result, sext_in_reg, etc.
21256 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21257
21258 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21259 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21260 // see through BITCASTs later on and combines/simplifications can't then use
21261 // it.
21262 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21263 !Subtarget.hasAVX512())
21264 return SDValue();
21265
21266 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21267 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21268 MinSignBits < NumSignBits) {
21269 PackOpcode = X86ISD::PACKSS;
21270 return In;
21271 }
21272
21273 // If we have a srl that only generates signbits that we will discard in
21274 // the truncation then we can use PACKSS by converting the srl to a sra.
21275 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
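 // For example, for a vXi32 -> vXi16 truncate of (srl X, 16): the low 16 bits
 // of (srl X, 16) and (sra X, 16) are identical, and the sra form provides the
 // 17 sign bits that PACKSSDW requires.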
21276 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21277 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21278 if (*ShAmt == MinSignBits) {
21279 PackOpcode = X86ISD::PACKSS;
21280 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21281 }
21282 }
21283
21284 return SDValue();
21285}
21286
21287/// This function lowers a vector truncation of 'extended sign-bits' or
21288/// 'extended zero-bits' values.
21289/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 truncations are lowered into X86ISD::PACKSS/PACKUS operations.
21290static SDValue LowerTruncateVecPackWithSignBits(
21291 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21292 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21293 MVT SrcVT = In.getSimpleValueType();
21294 MVT DstSVT = DstVT.getVectorElementType();
21295 MVT SrcSVT = SrcVT.getVectorElementType();
21296 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21297 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21298 return SDValue();
21299
21300 // If the upper half of the source is undef, then attempt to split and
21301 // only truncate the lower half.
21302 if (DstVT.getSizeInBits() >= 128) {
21303 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21304 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21305 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21306 Subtarget, DAG))
21307 return widenSubVector(Res, false, Subtarget, DAG, DL,
21308 DstVT.getSizeInBits());
21309 }
21310 }
21311
21312 unsigned PackOpcode;
21313 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21314 Subtarget, Flags))
21315 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21316
21317 return SDValue();
21318}
21319
21320/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21321/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21322static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21323 const X86Subtarget &Subtarget,
21324 SelectionDAG &DAG) {
21325 MVT SrcVT = In.getSimpleValueType();
21326 MVT DstSVT = DstVT.getVectorElementType();
21327 MVT SrcSVT = SrcVT.getVectorElementType();
21328 unsigned NumElems = DstVT.getVectorNumElements();
21329 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21330 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21331 NumElems >= 8))
21332 return SDValue();
21333
21334 // SSSE3's pshufb results in fewer instructions in the cases below.
21335 if (Subtarget.hasSSSE3() && NumElems == 8) {
21336 if (SrcSVT == MVT::i16)
21337 return SDValue();
21338 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21339 return SDValue();
21340 }
21341
21342 // If the upper half of the source is undef, then attempt to split and
21343 // only truncate the lower half.
21344 if (DstVT.getSizeInBits() >= 128) {
21345 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21346 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21347 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21348 return widenSubVector(Res, false, Subtarget, DAG, DL,
21349 DstVT.getSizeInBits());
21350 }
21351 }
21352
21353 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21354 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21355 // truncate 2 x v4i32 to v8i16.
21356 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21357 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21358
21359 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21360 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21361
21362 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21363 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21364 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21365 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21366 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21367 }
21368
21369 return SDValue();
21370}
21371
21372static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21373 SelectionDAG &DAG,
21374 const X86Subtarget &Subtarget) {
21375 MVT VT = Op.getSimpleValueType();
21376 SDValue In = Op.getOperand(0);
21377 MVT InVT = In.getSimpleValueType();
21378 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21379
21380 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
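 // For example, for v16i8 each byte's low bit is shifted left by 7 into the
 // sign position, and the "0 > x" signed compare below (iseled to VPMOVB2M)
 // turns the sign bits into the v16i1 mask.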
21381 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21382 if (InVT.getScalarSizeInBits() <= 16) {
21383 if (Subtarget.hasBWI()) {
21384 // legal, will go to VPMOVB2M, VPMOVW2M
21385 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21386 // We need to shift to get the lsb into sign position.
21387 // Shift packed bytes not supported natively, bitcast to word
21388 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21389 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21390 DAG.getBitcast(ExtVT, In),
21391 DAG.getConstant(ShiftInx, DL, ExtVT));
21392 In = DAG.getBitcast(InVT, In);
21393 }
21394 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21395 In, ISD::SETGT);
21396 }
21397 // Use TESTD/Q after extending the vector to packed dword/qword.
21398 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21399 "Unexpected vector type.");
21400 unsigned NumElts = InVT.getVectorNumElements();
21401 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21402 // We need to change to a wider element type that we have support for.
21403 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21404 // For 16 element vectors we extend to v16i32 unless we are explicitly
21405 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21406 // we need to split into two 8 element vectors which we can extend to v8i32,
21407 // truncate and concat the results. There's an additional complication if
21408 // the original type is v16i8. In that case we can't split the v16i8
21409 // directly, so we need to shuffle high elements to low and use
21410 // sign_extend_vector_inreg.
21411 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21412 SDValue Lo, Hi;
21413 if (InVT == MVT::v16i8) {
21414 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21415 Hi = DAG.getVectorShuffle(
21416 InVT, DL, In, In,
21417 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21418 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21419 } else {
21420 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21421 Lo = extract128BitVector(In, 0, DAG, DL);
21422 Hi = extract128BitVector(In, 8, DAG, DL);
21423 }
21424 // We're split now, just emit two truncates and a concat. The two
21425 // truncates will trigger legalization to come back to this function.
21426 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21427 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21428 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21429 }
21430 // We either have 8 elements or we're allowed to use 512-bit vectors.
21431 // If we have VLX, we want to use the narrowest vector that can get the
21432 // job done so we use vXi32.
21433 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21434 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21435 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21436 InVT = ExtVT;
21437 ShiftInx = InVT.getScalarSizeInBits() - 1;
21438 }
21439
21440 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21441 // We need to shift to get the lsb into sign position.
21442 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21443 DAG.getConstant(ShiftInx, DL, InVT));
21444 }
21445 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21446 if (Subtarget.hasDQI())
21447 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21448 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21449}
21450
21451SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21452 SDLoc DL(Op);
21453 MVT VT = Op.getSimpleValueType();
21454 SDValue In = Op.getOperand(0);
21455 MVT InVT = In.getSimpleValueType();
21457 "Invalid TRUNCATE operation");
21458
21459 // If we're called by the type legalizer, handle a few cases.
21460 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21461 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21462 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21463 VT.is128BitVector() && Subtarget.hasAVX512()) {
21464 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21465 "Unexpected subtarget!");
21466 // The default behavior is to truncate one step, concatenate, and then
21467 // truncate the remainder. We'd rather produce two 64-bit results and
21468 // concatenate those.
21469 SDValue Lo, Hi;
21470 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21471
21472 EVT LoVT, HiVT;
21473 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21474
21475 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21476 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21478 }
21479
21480 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21481 if (!Subtarget.hasAVX512() ||
21482 (InVT.is512BitVector() && VT.is256BitVector()))
21483 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21484 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21485 return SignPack;
21486
21487 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21488 if (!Subtarget.hasAVX512())
21489 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21490
21491 // Otherwise let default legalization handle it.
21492 return SDValue();
21493 }
21494
21495 if (VT.getVectorElementType() == MVT::i1)
21496 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21497
21498 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21499 // concat from subvectors to use VPTRUNC etc.
21500 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21501 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21502 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21503 return SignPack;
21504
21505 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21506 if (Subtarget.hasAVX512()) {
21507 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21508 assert(VT == MVT::v32i8 && "Unexpected VT!");
21509 return splitVectorIntUnary(Op, DAG, DL);
21510 }
21511
21512 // word to byte only under BWI. Otherwise we have to promote to v16i32
21513 // and then truncate that. But we should only do that if we haven't been
21514 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21515 // handled by isel patterns.
21516 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21517 Subtarget.canExtendTo512DQ())
21518 return Op;
21519 }
21520
21521 // Handle truncation of V256 to V128 using shuffles.
21522 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21523
21524 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21525 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21526 if (Subtarget.hasInt256()) {
21527 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21528 In = DAG.getBitcast(MVT::v8i32, In);
21529 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21530 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21531 DAG.getVectorIdxConstant(0, DL));
21532 }
21533
21534 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21535 DAG.getVectorIdxConstant(0, DL));
21536 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21537 DAG.getVectorIdxConstant(2, DL));
21538 static const int ShufMask[] = {0, 2, 4, 6};
21539 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21540 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21541 }
21542
21543 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21544 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21545 if (Subtarget.hasInt256()) {
21546 // The PSHUFB mask:
21547 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21548 -1, -1, -1, -1, -1, -1, -1, -1,
21549 16, 17, 20, 21, 24, 25, 28, 29,
21550 -1, -1, -1, -1, -1, -1, -1, -1 };
21551 In = DAG.getBitcast(MVT::v32i8, In);
21552 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21553 In = DAG.getBitcast(MVT::v4i64, In);
21554
21555 static const int ShufMask2[] = {0, 2, -1, -1};
21556 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21557 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21558 DAG.getVectorIdxConstant(0, DL));
21559 return DAG.getBitcast(MVT::v8i16, In);
21560 }
21561
21562 return Subtarget.hasSSE41()
21563 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21564 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21565 }
21566
21567 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21568 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21569
21570 llvm_unreachable("All 256->128 cases should have been handled above!");
21571}
21572
21573// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21574// behaves on out of range inputs to generate optimized conversions.
21575static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21576 SelectionDAG &DAG,
21577 const X86Subtarget &Subtarget) {
21578 MVT SrcVT = Src.getSimpleValueType();
21579 unsigned DstBits = VT.getScalarSizeInBits();
21580 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21581
21582 // Calculate the converted result for values in the range 0 to
21583 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21584 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21585 SDValue Big =
21586 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21587 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21588 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21589
21590 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21591 // and only if the value was out of range. So we can use that
21592 // as our indicator that we should use "Big" instead of "Small".
21593 //
21594 // Use "Small" if "IsOverflown" has all bits cleared
21595 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21596
21597 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21598 // use the slightly slower blendv select instead.
21599 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21600 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21601 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21602 }
21603
21604 SDValue IsOverflown =
21605 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21606 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21607 return DAG.getNode(ISD::OR, dl, VT, Small,
21608 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21609}
21610
21611SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21612 bool IsStrict = Op->isStrictFPOpcode();
21613 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21614 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21615 bool HasVLX = Subtarget.hasVLX();
21616 MVT VT = Op->getSimpleValueType(0);
21617 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21618 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21619 MVT SrcVT = Src.getSimpleValueType();
21620 SDLoc dl(Op);
21621
21622 SDValue Res;
21623 if (isSoftF16(SrcVT, Subtarget)) {
21624 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21625 if (IsStrict)
21626 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21628 {NVT, MVT::Other}, {Chain, Src})});
21629 return DAG.getNode(Op.getOpcode(), dl, VT,
21630 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21631 } else if (isTypeLegal(SrcVT) &&
21632 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21633 return Op;
21634 }
21635
21636 if (VT.isVector()) {
21637 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21638 MVT ResVT = MVT::v4i32;
21639 MVT TruncVT = MVT::v4i1;
21640 unsigned Opc;
21641 if (IsStrict)
21642 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21643 else
21644 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21645
21646 if (!IsSigned && !HasVLX) {
21647 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21648 // Widen to 512-bits.
21649 ResVT = MVT::v8i32;
21650 TruncVT = MVT::v8i1;
21651 Opc = Op.getOpcode();
21652 // Need to concat with zero vector for strict fp to avoid spurious
21653 // exceptions.
21654 // TODO: Should we just do this for non-strict as well?
21655 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21656 : DAG.getUNDEF(MVT::v8f64);
21657 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21658 DAG.getVectorIdxConstant(0, dl));
21659 }
21660 if (IsStrict) {
21661 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21662 Chain = Res.getValue(1);
21663 } else {
21664 Res = DAG.getNode(Opc, dl, ResVT, Src);
21665 }
21666
21667 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21669 DAG.getVectorIdxConstant(0, dl));
21670 if (IsStrict)
21671 return DAG.getMergeValues({Res, Chain}, dl);
21672 return Res;
21673 }
21674
21675 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21676 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21677 VT == MVT::v32i16)
21678 return Op;
21679
21680 MVT ResVT = VT;
21681 MVT EleVT = VT.getVectorElementType();
21682 if (EleVT != MVT::i64)
21683 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21684
21685 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21686 SDValue Tmp =
21687 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21688 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21689 Ops[0] = Src;
21690 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21691 }
21692
21693 if (!HasVLX) {
21694 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21695 // Widen to 512-bits.
21696 unsigned IntSize = EleVT.getSizeInBits();
21697 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21698 ResVT = MVT::getVectorVT(EleVT, Num);
21699 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21700 Subtarget, DAG, dl);
21701 }
21702
21703 if (IsStrict) {
21704 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21705 : X86ISD::STRICT_CVTTP2UI,
21706 dl, {ResVT, MVT::Other}, {Chain, Src});
21707 Chain = Res.getValue(1);
21708 } else {
21709 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21710 ResVT, Src);
21711 }
21712
21713 // TODO: Need to add exception check code for strict FP.
21714 if (EleVT.getSizeInBits() < 16) {
21715 if (HasVLX)
21716 ResVT = MVT::getVectorVT(EleVT, 8);
21717 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21718 }
21719
21720 if (ResVT != VT)
21721 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21722 DAG.getVectorIdxConstant(0, dl));
21723
21724 if (IsStrict)
21725 return DAG.getMergeValues({Res, Chain}, dl);
21726 return Res;
21727 }
21728
21729 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21730 if (VT.getVectorElementType() == MVT::i16) {
21731 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21732 SrcVT.getVectorElementType() == MVT::f64) &&
21733 "Expected f32/f64 vector!");
21734 MVT NVT = VT.changeVectorElementType(MVT::i32);
21735 if (IsStrict) {
21736 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21737 : ISD::STRICT_FP_TO_UINT,
21738 dl, {NVT, MVT::Other}, {Chain, Src});
21739 Chain = Res.getValue(1);
21740 } else {
21741 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21742 NVT, Src);
21743 }
21744
21745 // TODO: Need to add exception check code for strict FP.
21746 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21747
21748 if (IsStrict)
21749 return DAG.getMergeValues({Res, Chain}, dl);
21750 return Res;
21751 }
21752
21753 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21754 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21755 assert(!IsSigned && "Expected unsigned conversion!");
21756 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21757 return Op;
21758 }
21759
21760 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21761 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21762 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21763 Subtarget.useAVX512Regs()) {
21764 assert(!IsSigned && "Expected unsigned conversion!");
21765 assert(!Subtarget.hasVLX() && "Unexpected features!");
21766 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21767 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21768 // Need to concat with zero vector for strict fp to avoid spurious
21769 // exceptions.
21770 // TODO: Should we just do this for non-strict as well?
21771 SDValue Tmp =
21772 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21773 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21774 DAG.getVectorIdxConstant(0, dl));
21775
21776 if (IsStrict) {
21777 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21778 {Chain, Src});
21779 Chain = Res.getValue(1);
21780 } else {
21781 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21782 }
21783
21784 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21785 DAG.getVectorIdxConstant(0, dl));
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Res, Chain}, dl);
21789 return Res;
21790 }
21791
21792 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21793 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21794 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21795 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21796 assert(!Subtarget.hasVLX() && "Unexpected features!");
21797 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21798 // Need to concat with zero vector for strict fp to avoid spurious
21799 // exceptions.
21800 // TODO: Should we just do this for non-strict as well?
21801 SDValue Tmp =
21802 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21803 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21804 DAG.getVectorIdxConstant(0, dl));
21805
21806 if (IsStrict) {
21807 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21808 {Chain, Src});
21809 Chain = Res.getValue(1);
21810 } else {
21811 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21812 }
21813
21814 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21815 DAG.getVectorIdxConstant(0, dl));
21816
21817 if (IsStrict)
21818 return DAG.getMergeValues({Res, Chain}, dl);
21819 return Res;
21820 }
21821
21822 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21823 if (!Subtarget.hasVLX()) {
21824 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21825 // legalizer and then widened again by vector op legalization.
21826 if (!IsStrict)
21827 return SDValue();
21828
21829 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21830 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21831 {Src, Zero, Zero, Zero});
21832 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21833 {Chain, Tmp});
21834 SDValue Chain = Tmp.getValue(1);
21835 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21836 DAG.getVectorIdxConstant(0, dl));
21837 return DAG.getMergeValues({Tmp, Chain}, dl);
21838 }
21839
21840 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21841 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21842 DAG.getUNDEF(MVT::v2f32));
21843 if (IsStrict) {
21844 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21845 : X86ISD::STRICT_CVTTP2UI;
21846 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21847 }
21848 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21849 return DAG.getNode(Opc, dl, VT, Tmp);
21850 }
21851
21852 // Generate optimized instructions for pre AVX512 unsigned conversions from
21853 // vXf32 to vXi32.
21854 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21855 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21856 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21857 assert(!IsSigned && "Expected unsigned conversion!");
21858 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21859 }
21860
21861 return SDValue();
21862 }
21863
21864 assert(!VT.isVector());
21865
21866 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21867
21868 if (!IsSigned && UseSSEReg) {
21869 // Conversions from f32/f64 with AVX512 should be legal.
21870 if (Subtarget.hasAVX512())
21871 return Op;
21872
21873 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21874 // behaves on out of range inputs to generate optimized conversions.
21875 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21876 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21877 unsigned DstBits = VT.getScalarSizeInBits();
21878 APInt UIntLimit = APInt::getSignMask(DstBits);
21879 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21880 DAG.getConstant(UIntLimit, dl, VT));
21881 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21882
21883 // Calculate the converted result for values in the range:
21884 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21885 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21886 SDValue Small =
21887 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21888 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21889 SDValue Big = DAG.getNode(
21890 X86ISD::CVTTS2SI, dl, VT,
21891 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21892 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21893
21894 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21895 // and only if the value was out of range. So we can use that
21896 // as our indicator that we should use "Big" instead of "Small".
21897 //
21898 // Use "Small" if "IsOverflown" has all bits cleared
21899 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21900 SDValue IsOverflown = DAG.getNode(
21901 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21902 return DAG.getNode(ISD::OR, dl, VT, Small,
21903 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21904 }
21905
21906 // Use default expansion for i64.
21907 if (VT == MVT::i64)
21908 return SDValue();
21909
21910 assert(VT == MVT::i32 && "Unexpected VT!");
21911
21912 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21913 // FIXME: This does not generate an invalid exception if the input does not
21914 // fit in i32. PR44019
21915 if (Subtarget.is64Bit()) {
21916 if (IsStrict) {
21917 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21918 {Chain, Src});
21919 Chain = Res.getValue(1);
21920 } else
21921 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21922
21923 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21924 if (IsStrict)
21925 return DAG.getMergeValues({Res, Chain}, dl);
21926 return Res;
21927 }
21928
21929 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21930 // use fisttp which will be handled later.
21931 if (!Subtarget.hasSSE3())
21932 return SDValue();
21933 }
21934
21935 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21936 // FIXME: This does not generate an invalid exception if the input does not
21937 // fit in i16. PR44019
21938 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21939 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21940 if (IsStrict) {
21941 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21942 {Chain, Src});
21943 Chain = Res.getValue(1);
21944 } else
21945 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21946
21947 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21948 if (IsStrict)
21949 return DAG.getMergeValues({Res, Chain}, dl);
21950 return Res;
21951 }
21952
21953 // If this is a FP_TO_SINT using SSEReg we're done.
21954 if (UseSSEReg && IsSigned)
21955 return Op;
21956
21957 // fp128 needs to use a libcall.
21958 if (SrcVT == MVT::f128) {
21959 RTLIB::Libcall LC;
21960 if (IsSigned)
21961 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21962 else
21963 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21964
21965 MakeLibCallOptions CallOptions;
21966 std::pair<SDValue, SDValue> Tmp =
21967 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21968
21969 if (IsStrict)
21970 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21971
21972 return Tmp.first;
21973 }
21974
21975 // Fall back to X87.
21976 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21977 if (IsStrict)
21978 return DAG.getMergeValues({V, Chain}, dl);
21979 return V;
21980 }
21981
21982 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21983}
21984
21985SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21986 SelectionDAG &DAG) const {
21987 SDValue Src = Op.getOperand(0);
21988 EVT DstVT = Op.getSimpleValueType();
21989 MVT SrcVT = Src.getSimpleValueType();
21990
21991 if (SrcVT.isVector())
21992 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21993
21994 if (SrcVT == MVT::f16)
21995 return SDValue();
21996
21997 // If the source is in an SSE register, the node is Legal.
21998 if (isScalarFPTypeInSSEReg(SrcVT))
21999 return Op;
22000
22001 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22002}
22003
22004SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22005 SelectionDAG &DAG) const {
22006 EVT DstVT = N->getValueType(0);
22007 SDValue Src = N->getOperand(0);
22008 EVT SrcVT = Src.getValueType();
22009
22010 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22011 // f16 must be promoted before using the lowering in this routine.
22012 // fp128 does not use this lowering.
22013 return SDValue();
22014 }
22015
22016 SDLoc DL(N);
22017 SDValue Chain = DAG.getEntryNode();
22018
22019 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22020
22021 // If we're converting from SSE, the stack slot needs to hold both types.
22022 // Otherwise it only needs to hold the DstVT.
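 // For example, an i64 lrint of an f64 held in an SSE register stores the
 // double to the slot, reloads it into x87 with FLD, and has FIST write the
 // i64 result back to the same slot for the final integer load.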
22023 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22024 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22026 MachinePointerInfo MPI =
22027 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22028
22029 if (UseSSE) {
22030 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22031 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22032 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22033 SDValue Ops[] = { Chain, StackPtr };
22034
22035 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22036 /*Align*/ std::nullopt,
22037 MachineMemOperand::MOLoad);
22038 Chain = Src.getValue(1);
22039 }
22040
22041 SDValue StoreOps[] = { Chain, Src, StackPtr };
22042 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22043 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22044 MachineMemOperand::MOStore);
22045
22046 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22047}
22048
22049SDValue
22050X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22051 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22052 // but making use of X86 specifics to produce better instruction sequences.
22053 SDNode *Node = Op.getNode();
22054 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22055 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22056 SDLoc dl(SDValue(Node, 0));
22057 SDValue Src = Node->getOperand(0);
22058
22059 // There are three types involved here: SrcVT is the source floating point
22060 // type, DstVT is the type of the result, and TmpVT is the result of the
22061 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22062 // DstVT).
22063 EVT SrcVT = Src.getValueType();
22064 EVT DstVT = Node->getValueType(0);
22065 EVT TmpVT = DstVT;
22066
22067 // This code is only for floats and doubles. Fall back to generic code for
22068 // anything else.
22069 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22070 return SDValue();
22071
22072 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22073 unsigned SatWidth = SatVT.getScalarSizeInBits();
22074 unsigned DstWidth = DstVT.getScalarSizeInBits();
22075 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22076 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22077 "Expected saturation width smaller than result width");
22078
22079 // Promote result of FP_TO_*INT to at least 32 bits.
22080 if (TmpWidth < 32) {
22081 TmpVT = MVT::i32;
22082 TmpWidth = 32;
22083 }
22084
22085 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22086 // us to use a native signed conversion instead.
22087 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22088 TmpVT = MVT::i64;
22089 TmpWidth = 64;
22090 }
22091
22092 // If the saturation width is smaller than the size of the temporary result,
22093 // we can always use signed conversion, which is native.
22094 if (SatWidth < TmpWidth)
22095 FpToIntOpcode = ISD::FP_TO_SINT;
22096
22097 // Determine minimum and maximum integer values and their corresponding
22098 // floating-point values.
22099 APInt MinInt, MaxInt;
22100 if (IsSigned) {
22101 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22102 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22103 } else {
22104 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22105 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22106 }
22107
22108 const fltSemantics &Sem = SrcVT.getFltSemantics();
22109 APFloat MinFloat(Sem);
22110 APFloat MaxFloat(Sem);
22111
22112 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22113 MinInt, IsSigned, APFloat::rmTowardZero);
22114 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22115 MaxInt, IsSigned, APFloat::rmTowardZero);
22116 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22117 && !(MaxStatus & APFloat::opStatus::opInexact);
22118
22119 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22120 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22121
22122 // If the integer bounds are exactly representable as floats, emit a
22123 // min+max+fptoi sequence. Otherwise use comparisons and selects.
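 // For example, for f64 -> i32 both -2^31 and 2^31-1 are exactly representable,
 // so a FMAX/FMIN clamp plus one signed conversion (and a NaN -> 0 select)
 // suffices; for f32 -> i32 the upper bound 2^31-1 is not exactly representable
 // in f32, so the compare/select path further below is used instead.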
22124 if (AreExactFloatBounds) {
22125 if (DstVT != TmpVT) {
22126 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22127 SDValue MinClamped = DAG.getNode(
22128 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22129 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22130 SDValue BothClamped = DAG.getNode(
22131 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22132 // Convert clamped value to integer.
22133 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22134
22135 // NaN will become INDVAL, with the top bit set and the rest zero.
22136 // Truncation will discard the top bit, resulting in zero.
22137 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22138 }
22139
22140 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22141 SDValue MinClamped = DAG.getNode(
22142 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22143 // Clamp by MaxFloat from above. NaN cannot occur.
22144 SDValue BothClamped = DAG.getNode(
22145 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22146 // Convert clamped value to integer.
22147 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22148
22149 if (!IsSigned) {
22150 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22151 // which is zero.
22152 return FpToInt;
22153 }
22154
22155 // Otherwise, select zero if Src is NaN.
22156 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22157 return DAG.getSelectCC(
22158 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22159 }
22160
22161 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22162 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22163
22164 // Result of direct conversion, which may be selected away.
22165 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22166
22167 if (DstVT != TmpVT) {
22168 // NaN will become INDVAL, with the top bit set and the rest zero.
22169 // Truncation will discard the top bit, resulting in zero.
22170 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22171 }
22172
22173 SDValue Select = FpToInt;
22174 // For signed conversions where we saturate to the same size as the
22175 // result type of the fptoi instructions, INDVAL coincides with integer
22176 // minimum, so we don't need to explicitly check it.
22177 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22178 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22179 // MinInt if Src is NaN.
22180 Select = DAG.getSelectCC(
22181 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22182 }
22183
22184 // If Src OGT MaxFloat, select MaxInt.
22185 Select = DAG.getSelectCC(
22186 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22187
22188 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22189 // is already zero. The promoted case was already handled above.
22190 if (!IsSigned || DstVT != TmpVT) {
22191 return Select;
22192 }
22193
22194 // Otherwise, select 0 if Src is NaN.
22195 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22196 return DAG.getSelectCC(
22197 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22198}
22199
22200SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22201 bool IsStrict = Op->isStrictFPOpcode();
22202
22203 SDLoc DL(Op);
22204 MVT VT = Op.getSimpleValueType();
22205 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22206 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22207 MVT SVT = In.getSimpleValueType();
22208
22209 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22210 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22211 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22212 !Subtarget.getTargetTriple().isOSDarwin()))
22213 return SDValue();
22214
22215 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22216 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22217 return Op;
22218
22219 if (SVT == MVT::f16) {
22220 if (Subtarget.hasFP16())
22221 return Op;
22222
22223 if (VT != MVT::f32) {
22224 if (IsStrict)
22225 return DAG.getNode(
22226 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22227 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22228 {MVT::f32, MVT::Other}, {Chain, In})});
22229
22230 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22231 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22232 }
22233
22234 if (!Subtarget.hasF16C()) {
22235 if (!Subtarget.getTargetTriple().isOSDarwin())
22236 return SDValue();
22237
22238 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22239
22240 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22241 TargetLowering::CallLoweringInfo CLI(DAG);
22242 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22243
22244 In = DAG.getBitcast(MVT::i16, In);
22245 TargetLowering::ArgListTy Args;
22246 TargetLowering::ArgListEntry Entry(
22247 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22248 Entry.IsSExt = false;
22249 Entry.IsZExt = true;
22250 Args.push_back(Entry);
22251
22252 SDValue Callee = DAG.getExternalSymbol(
22253 getLibcallName(RTLIB::FPEXT_F16_F32),
22254 getPointerTy(DAG.getDataLayout()));
22255 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22256 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22257 std::move(Args));
22258
22259 SDValue Res;
22260 std::tie(Res,Chain) = LowerCallTo(CLI);
22261 if (IsStrict)
22262 Res = DAG.getMergeValues({Res, Chain}, DL);
22263
22264 return Res;
22265 }
22266
22267 In = DAG.getBitcast(MVT::i16, In);
22268 SDValue Res;
22269 if (IsStrict) {
22270 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22271 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22272 DAG.getVectorIdxConstant(0, DL));
22273 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22274 {Chain, In});
22275 Chain = Res.getValue(1);
22276 } else {
22277 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22278 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22279 DAG.getUNDEF(MVT::v4i32), In,
22280 DAG.getVectorIdxConstant(0, DL));
22281 In = DAG.getBitcast(MVT::v8i16, In);
22282 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22283 DAG.getTargetConstant(4, DL, MVT::i32));
22284 }
22285 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22286 DAG.getVectorIdxConstant(0, DL));
22287 if (IsStrict)
22288 return DAG.getMergeValues({Res, Chain}, DL);
22289 return Res;
22290 }
22291
22292 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22293 return Op;
22294
22295 if (SVT.getVectorElementType() == MVT::f16) {
22296 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22297 return Op;
22298 assert(Subtarget.hasF16C() && "Unexpected features!");
22299 if (SVT == MVT::v2f16)
22300 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22301 DAG.getUNDEF(MVT::v2f16));
22302 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22303 DAG.getUNDEF(MVT::v4f16));
22304 if (IsStrict)
22305 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22306 {Op->getOperand(0), Res});
22307 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22308 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22309 return Op;
22310 }
22311
22312 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22313
22314 SDValue Res =
22315 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22316 if (IsStrict)
22317 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22318 {Op->getOperand(0), Res});
22319 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22320}
22321
22322SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22323 bool IsStrict = Op->isStrictFPOpcode();
22324
22325 SDLoc DL(Op);
22326 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22327 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22328 MVT VT = Op.getSimpleValueType();
22329 MVT SVT = In.getSimpleValueType();
22330
22331 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22332 return SDValue();
22333
22334 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22335 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22336 if (!Subtarget.getTargetTriple().isOSDarwin())
22337 return SDValue();
22338
22339 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22340 TargetLowering::CallLoweringInfo CLI(DAG);
22341 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22342
22343 TargetLowering::ArgListTy Args;
22344 TargetLowering::ArgListEntry Entry(
22345 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22346 Entry.IsSExt = false;
22347 Entry.IsZExt = true;
22348 Args.push_back(Entry);
22349
22350 SDValue Callee = DAG.getExternalSymbol(
22351 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22352 : RTLIB::FPROUND_F32_F16),
22353 getPointerTy(DAG.getDataLayout()));
22354 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22355 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22356 std::move(Args));
22357
22358 SDValue Res;
22359 std::tie(Res, Chain) = LowerCallTo(CLI);
22360
22361 Res = DAG.getBitcast(MVT::f16, Res);
22362
22363 if (IsStrict)
22364 Res = DAG.getMergeValues({Res, Chain}, DL);
22365
22366 return Res;
22367 }
22368
22369 if (VT.getScalarType() == MVT::bf16) {
22370 if (SVT.getScalarType() == MVT::f32 &&
22371 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22372 Subtarget.hasAVXNECONVERT()))
22373 return Op;
22374 return SDValue();
22375 }
22376
22377 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22378 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22379 return SDValue();
22380
22381 if (VT.isVector())
22382 return Op;
22383
22384 SDValue Res;
22385 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22386 MVT::i32);
22387 if (IsStrict) {
22388 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22389 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22390 DAG.getVectorIdxConstant(0, DL));
22391 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22392 {Chain, Res, Rnd});
22393 Chain = Res.getValue(1);
22394 } else {
22395 // FIXME: Should we use zeros for upper elements for non-strict?
22396 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22397 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22398 }
22399
22400 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22401 DAG.getVectorIdxConstant(0, DL));
22402 Res = DAG.getBitcast(MVT::f16, Res);
22403
22404 if (IsStrict)
22405 return DAG.getMergeValues({Res, Chain}, DL);
22406
22407 return Res;
22408 }
22409
22410 return Op;
22411}
22412
22413static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22414 bool IsStrict = Op->isStrictFPOpcode();
22415 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22416 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22417 "Unexpected VT!");
22418
22419 SDLoc dl(Op);
22420 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22421 DAG.getConstant(0, dl, MVT::v8i16), Src,
22422 DAG.getVectorIdxConstant(0, dl));
22423
22424 SDValue Chain;
22425 if (IsStrict) {
22426 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22427 {Op.getOperand(0), Res});
22428 Chain = Res.getValue(1);
22429 } else {
22430 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22431 }
22432
22433 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22434 DAG.getVectorIdxConstant(0, dl));
22435
22436 if (IsStrict)
22437 return DAG.getMergeValues({Res, Chain}, dl);
22438
22439 return Res;
22440}
22441
22442static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22443 bool IsStrict = Op->isStrictFPOpcode();
22444 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22445 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22446 "Unexpected VT!");
22447
22448 SDLoc dl(Op);
22449 SDValue Res, Chain;
22450 if (IsStrict) {
22451 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22452 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22453 DAG.getVectorIdxConstant(0, dl));
22454 Res = DAG.getNode(
22455 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22456 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22457 Chain = Res.getValue(1);
22458 } else {
22459 // FIXME: Should we use zeros for upper elements for non-strict?
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22461 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22462 DAG.getTargetConstant(4, dl, MVT::i32));
22463 }
22464
22465 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22466 DAG.getVectorIdxConstant(0, dl));
22467
22468 if (IsStrict)
22469 return DAG.getMergeValues({Res, Chain}, dl);
22470
22471 return Res;
22472}
22473
22474SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22475 SelectionDAG &DAG) const {
22476 SDLoc DL(Op);
22477
22478 MVT SVT = Op.getOperand(0).getSimpleValueType();
22479 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22480 Subtarget.hasAVXNECONVERT())) {
22481 SDValue Res;
22482 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22483 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22484 Res = DAG.getBitcast(MVT::v8i16, Res);
22485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22486 DAG.getVectorIdxConstant(0, DL));
22487 }
22488
22489 MakeLibCallOptions CallOptions;
22490 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22491 SDValue Res =
22492 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22493 return DAG.getBitcast(MVT::i16, Res);
22494}
22495
22496/// Depending on uarch and/or optimizing for size, we might prefer to use a
22497/// vector operation in place of the typical scalar operation.
22498static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22499 SelectionDAG &DAG,
22500 const X86Subtarget &Subtarget) {
22501 // If both operands have other uses, this is probably not profitable.
22502 SDValue LHS = Op.getOperand(0);
22503 SDValue RHS = Op.getOperand(1);
22504 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22505 return Op;
22506
22507 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22508 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22509 if (IsFP && !Subtarget.hasSSE3())
22510 return Op;
22511 if (!IsFP && !Subtarget.hasSSSE3())
22512 return Op;
22513
22514 // Extract from a common vector.
22515 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22517 LHS.getOperand(0) != RHS.getOperand(0) ||
22518 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22519 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22520 !shouldUseHorizontalOp(true, DAG, Subtarget))
22521 return Op;
22522
22523 // Allow commuted 'hadd' ops.
22524 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22525 unsigned HOpcode;
22526 switch (Op.getOpcode()) {
22527 // clang-format off
22528 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22529 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22530 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22531 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22532 default:
22533 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22534 // clang-format on
22535 }
22536 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22537 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22538 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22539 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22540 std::swap(LExtIndex, RExtIndex);
22541
22542 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22543 return Op;
22544
22545 SDValue X = LHS.getOperand(0);
22546 EVT VecVT = X.getValueType();
22547 unsigned BitWidth = VecVT.getSizeInBits();
22548 unsigned NumLanes = BitWidth / 128;
22549 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22550 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22551 "Not expecting illegal vector widths here");
22552
22553 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22554 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22555 if (BitWidth == 256 || BitWidth == 512) {
22556 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22557 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22558 LExtIndex %= NumEltsPerLane;
22559 }
22560
22561 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22563 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22564 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22565 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22567 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22568}
22569
22570/// Depending on uarch and/or optimizing for size, we might prefer to use a
22571/// vector operation in place of the typical scalar operation.
22572SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22573 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22574 "Only expecting float/double");
22575 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22576}
22577
22578/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22579/// This mode isn't supported in hardware on X86. But as long as we aren't
22580/// compiling with trapping math, we can emulate this with
22581/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22582static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22583 SDValue N0 = Op.getOperand(0);
22584 SDLoc dl(Op);
22585 MVT VT = Op.getSimpleValueType();
22586
22587 // N0 += copysign(nextafter(0.5, 0.0), N0)
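 // Using 0.5 itself would round inputs just below 0.5 the wrong way: for the
 // largest double below 0.5, x + 0.5 rounds up to 1.0 and truncates to 1,
 // whereas x + nextafter(0.5, 0.0) stays below 1.0 and truncates to 0.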
22588 const fltSemantics &Sem = VT.getFltSemantics();
22589 bool Ignored;
22590 APFloat Point5Pred = APFloat(0.5f);
22591 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22592 Point5Pred.next(/*nextDown*/true);
22593
22594 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22595 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22596 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22597
22598 // Truncate the result to remove fraction.
22599 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22600}
22601
22602/// The only differences between FABS and FNEG are the mask and the logic op.
22603/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22604static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22605 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22606 "Wrong opcode for lowering FABS or FNEG.");
22607
22608 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22609
22610 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22611 // into an FNABS. We'll lower the FABS after that if it is still in use.
22612 if (IsFABS)
22613 for (SDNode *User : Op->users())
22614 if (User->getOpcode() == ISD::FNEG)
22615 return Op;
22616
22617 SDLoc dl(Op);
22618 MVT VT = Op.getSimpleValueType();
22619
22620 bool IsF128 = (VT == MVT::f128);
22621 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22623 "Unexpected type in LowerFABSorFNEG");
22624
22625 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22626 // decide if we should generate a 16-byte constant mask when we only need 4 or
22627 // 8 bytes for the scalar case.
22628
22629 // There are no scalar bitwise logical SSE/AVX instructions, so we
22630 // generate a 16-byte vector constant and logic op even for the scalar case.
22631 // Using a 16-byte mask allows folding the load of the mask with
22632 // the logic op, so it can save (~4 bytes) on code size.
22633 bool IsFakeVector = !VT.isVector() && !IsF128;
22634 MVT LogicVT = VT;
22635 if (IsFakeVector)
22636 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22637 : (VT == MVT::f32) ? MVT::v4f32
22638 : MVT::v8f16;
22639
22640 unsigned EltBits = VT.getScalarSizeInBits();
22641 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
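 // e.g. for f32, FABS becomes an AND with 0x7FFFFFFF, FNEG an XOR with
 // 0x80000000, and FNEG(FABS(x)) folds to an OR with 0x80000000 (FNABS).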
22642 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22643 APInt::getSignMask(EltBits);
22644 const fltSemantics &Sem = VT.getFltSemantics();
22645 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22646
22647 SDValue Op0 = Op.getOperand(0);
22648 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22649 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22650 IsFNABS ? X86ISD::FOR :
22651 X86ISD::FXOR;
22652 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22653
22654 if (VT.isVector() || IsF128)
22655 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22656
22657 // For the scalar case extend to a 128-bit vector, perform the logic op,
22658 // and extract the scalar result back out.
22659 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22660 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22662 DAG.getVectorIdxConstant(0, dl));
22663}
22664
22665static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22666 SDValue Mag = Op.getOperand(0);
22667 SDValue Sign = Op.getOperand(1);
22668 SDLoc dl(Op);
22669
22670 // If the sign operand is smaller, extend it first.
22671 MVT VT = Op.getSimpleValueType();
22672 if (Sign.getSimpleValueType().bitsLT(VT))
22673 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22674
22675 // And if it is bigger, shrink it first.
22676 if (Sign.getSimpleValueType().bitsGT(VT))
22677 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22678 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22679
22680 // At this point the operands and the result should have the same
22681 // type, and that won't be f80 since that is not custom lowered.
22682 bool IsF128 = (VT == MVT::f128);
22683 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22685 "Unexpected type in LowerFCOPYSIGN");
22686
22687 const fltSemantics &Sem = VT.getFltSemantics();
22688
22689 // Perform all scalar logic operations as 16-byte vectors because there are no
22690 // scalar FP logic instructions in SSE.
22691 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22692 // unnecessary splats, but we might miss load folding opportunities. Should
22693 // this decision be based on OptimizeForSize?
22694 bool IsFakeVector = !VT.isVector() && !IsF128;
22695 MVT LogicVT = VT;
22696 if (IsFakeVector)
22697 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22698 : (VT == MVT::f32) ? MVT::v4f32
22699 : MVT::v8f16;
22700
22701 // The mask constants are automatically splatted for vector types.
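 // e.g. for f64, SignMask is 0x8000000000000000 and MagMask is
 // 0x7FFFFFFFFFFFFFFF, so the result is (Mag & MagMask) | (Sign & SignMask).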
22702 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22703 SDValue SignMask = DAG.getConstantFP(
22704 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22705 SDValue MagMask = DAG.getConstantFP(
22706 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22707
22708 // First, clear all bits but the sign bit from the second operand (sign).
22709 if (IsFakeVector)
22710 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22711 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22712
22713 // Next, clear the sign bit from the first operand (magnitude).
22714 // TODO: If we had general constant folding for FP logic ops, this check
22715 // wouldn't be necessary.
22716 SDValue MagBits;
22717 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22718 APFloat APF = Op0CN->getValueAPF();
22719 APF.clearSign();
22720 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22721 } else {
22722 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22723 if (IsFakeVector)
22724 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22725 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22726 }
22727
22728 // OR the magnitude value with the sign bit.
22729 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22730 return !IsFakeVector ? Or
22731 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22732 DAG.getVectorIdxConstant(0, dl));
22733}
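// For example, a scalar f64 copysign(Mag, Sign) roughly becomes, on v2f64:
//   SignBit = FAND(Sign, 0x8000000000000000)   // keep only the sign bit
//   MagBits = FAND(Mag,  0x7FFFFFFFFFFFFFFF)   // clear the sign bit
//   Result  = FOR(MagBits, SignBit)
// so that plain packed andpd/orpd instructions can be used.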
22734
22735static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22736 SDValue N0 = Op.getOperand(0);
22737 SDLoc dl(Op);
22738 MVT VT = Op.getSimpleValueType();
22739
22740 MVT OpVT = N0.getSimpleValueType();
22741 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22742 "Unexpected type for FGETSIGN");
22743
22744 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22745 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22746 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22747 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22748 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22749 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22750 return Res;
22751}
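// E.g. for an f32 input this amounts to something like
//   movmskps %xmm0, %eax
//   andl     $1, %eax
// since bit 0 of the MOVMSK result is the sign bit of element 0.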
22752
22753/// Helper for attempting to create an X86ISD::BT node.
22754static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22755 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22756 // instruction. Since the shift amount is in-range-or-undefined, we know
22757 // that doing a bittest on the i32 value is ok. We extend to i32 because
22758 // the encoding for the i16 version is larger than the i32 version.
22759 // Also promote i16 to i32 for performance / code size reasons.
22760 if (Src.getValueType().getScalarSizeInBits() < 32)
22761 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22762
22763 // No legal type found, give up.
22764 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22765 return SDValue();
22766
22767 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22768 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22769 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22770 // known to be zero.
22771 if (Src.getValueType() == MVT::i64 &&
22772 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22773 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22774
22775 // If the operand types disagree, extend the shift amount to match. Since
22776 // BT ignores high bits (like shifts) we can use anyextend.
22777 if (Src.getValueType() != BitNo.getValueType()) {
22778 // Peek through a mask/modulo operation.
22779 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22780 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22781 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22782 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22783 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22784 BitNo.getOperand(0)),
22785 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22786 BitNo.getOperand(1)));
22787 else
22788 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22789 }
22790
22791 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22792}
22793
22794/// Helper for creating an X86ISD::SETCC node.
22795static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22796 SelectionDAG &DAG) {
22797 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22798 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22799}
22800
22801/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22802/// recognizable memcmp expansion.
22803static bool isOrXorXorTree(SDValue X, bool Root = true) {
22804 if (X.getOpcode() == ISD::OR)
22805 return isOrXorXorTree(X.getOperand(0), false) &&
22806 isOrXorXorTree(X.getOperand(1), false);
22807 if (Root)
22808 return false;
22809 return X.getOpcode() == ISD::XOR;
22810}
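// The tree shape accepted here corresponds to a memcmp-style expansion such as
//   or (xor A, B), (or (xor C, D), (xor E, F))
// where each XOR compares one vector-sized chunk; the whole expression is zero
// exactly when every chunk pair is equal.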
22811
22812/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22813/// expansion.
22814template <typename F>
22815static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22816 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22817 SDValue Op0 = X.getOperand(0);
22818 SDValue Op1 = X.getOperand(1);
22819 if (X.getOpcode() == ISD::OR) {
22820 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22822 if (VecVT != CmpVT)
22823 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22824 if (HasPT)
22825 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22826 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22827 }
22828 if (X.getOpcode() == ISD::XOR) {
22829 SDValue A = SToV(Op0);
22830 SDValue B = SToV(Op1);
22831 if (VecVT != CmpVT)
22832 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22833 if (HasPT)
22834 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22835 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22836 }
22837 llvm_unreachable("Impossible");
22838}
22839
22840/// Try to map a 128-bit or larger integer comparison to vector instructions
22841/// before type legalization splits it up into chunks.
22842static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22843 ISD::CondCode CC,
22844 const SDLoc &DL,
22845 SelectionDAG &DAG,
22846 const X86Subtarget &Subtarget) {
22847 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22848
22849 // We're looking for an oversized integer equality comparison.
22850 EVT OpVT = X.getValueType();
22851 unsigned OpSize = OpVT.getSizeInBits();
22852 if (!OpVT.isScalarInteger() || OpSize < 128)
22853 return SDValue();
22854
22855 // Ignore a comparison with zero because that gets special treatment in
22856 // EmitTest(). But make an exception for the special case of a pair of
22857 // logically-combined vector-sized operands compared to zero. This pattern may
22858 // be generated by the memcmp expansion pass with oversized integer compares
22859 // (see PR33325).
22860 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22861 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22862 return SDValue();
22863
22864 // Don't perform this combine if constructing the vector will be expensive.
22865 auto IsVectorBitCastCheap = [](SDValue X) {
22866 X = peekThroughBitcasts(X);
22867 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22868 X.getOpcode() == ISD::LOAD;
22869 };
22870 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22871 !IsOrXorXorTreeCCZero)
22872 return SDValue();
22873
22874 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22875 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22876 // Otherwise use PCMPEQ (plus AND) and mask testing.
22877 bool NoImplicitFloatOps =
22878 DAG.getMachineFunction().getFunction().hasFnAttribute(
22879 Attribute::NoImplicitFloat);
22880 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22881 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22882 (OpSize == 256 && Subtarget.hasAVX()) ||
22883 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22884 bool HasPT = Subtarget.hasSSE41();
22885
22886 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22887 // vector registers are essentially free. (Technically, widening registers
22888 // prevents load folding, but the tradeoff is worth it.)
22889 bool PreferKOT = Subtarget.preferMaskRegisters();
22890 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22891
22892 EVT VecVT = MVT::v16i8;
22893 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22894 if (OpSize == 256) {
22895 VecVT = MVT::v32i8;
22896 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22897 }
22898 EVT CastVT = VecVT;
22899 bool NeedsAVX512FCast = false;
22900 if (OpSize == 512 || NeedZExt) {
22901 if (Subtarget.hasBWI()) {
22902 VecVT = MVT::v64i8;
22903 CmpVT = MVT::v64i1;
22904 if (OpSize == 512)
22905 CastVT = VecVT;
22906 } else {
22907 VecVT = MVT::v16i32;
22908 CmpVT = MVT::v16i1;
22909 CastVT = OpSize == 512 ? VecVT
22910 : OpSize == 256 ? MVT::v8i32
22911 : MVT::v4i32;
22912 NeedsAVX512FCast = true;
22913 }
22914 }
22915
22916 auto ScalarToVector = [&](SDValue X) -> SDValue {
22917 bool TmpZext = false;
22918 EVT TmpCastVT = CastVT;
22919 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22920 SDValue OrigX = X.getOperand(0);
22921 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22922 if (OrigSize < OpSize) {
22923 if (OrigSize == 128) {
22924 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22925 X = OrigX;
22926 TmpZext = true;
22927 } else if (OrigSize == 256) {
22928 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22929 X = OrigX;
22930 TmpZext = true;
22931 }
22932 }
22933 }
22934 X = DAG.getBitcast(TmpCastVT, X);
22935 if (!NeedZExt && !TmpZext)
22936 return X;
22937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22938 DAG.getConstant(0, DL, VecVT), X,
22939 DAG.getVectorIdxConstant(0, DL));
22940 };
22941
22942 SDValue Cmp;
22943 if (IsOrXorXorTreeCCZero) {
22944 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22945 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22946 // Use 2 vector equality compares and 'and' the results before doing a
22947 // MOVMSK.
22948 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22949 } else {
22950 SDValue VecX = ScalarToVector(X);
22951 SDValue VecY = ScalarToVector(Y);
22952 if (VecVT != CmpVT) {
22953 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22954 } else if (HasPT) {
22955 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22956 } else {
22957 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22958 }
22959 }
22960 // AVX512 should emit a setcc that will lower to kortest.
22961 if (VecVT != CmpVT) {
22962 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22963 : CmpVT == MVT::v32i1 ? MVT::i32
22964 : MVT::i16;
22965 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22966 DAG.getConstant(0, DL, KRegVT), CC);
22967 }
22968 if (HasPT) {
22969 SDValue BCCmp =
22970 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22971 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22972 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22973 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22974 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22975 }
22976 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22977 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22978 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22979 assert(Cmp.getValueType() == MVT::v16i8 &&
22980 "Non 128-bit vector on pre-SSE41 target");
22981 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22982 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22983 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22984 }
22985
22986 return SDValue();
22987}
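// Depending on the subtarget, the 128-bit case above typically ends up as one
// of these sequences (rough sketches, not exact output):
//   SSE2:     pcmpeqb + pmovmskb, then compare the mask against 0xFFFF
//   SSE4.1+:  pxor the operands and ptest the result against itself
//   AVX-512:  a vpcmpneq into a mask register followed by kortest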
22988
22989/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22990/// style scalarized (associative) reduction patterns. Partial reductions
22991/// are supported when the pointer SrcMask is non-null.
22992/// TODO - move this to SelectionDAG?
22993static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22994 SmallVectorImpl<SDValue> &SrcOps,
22995 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22996 SmallVector<SDValue, 8> Opnds;
22997 DenseMap<SDValue, APInt> SrcOpMap;
22998 EVT VT = MVT::Other;
22999
23000 // Recognize a special case where a vector is casted into wide integer to
23001 // test all 0s.
23002 assert(Op.getOpcode() == unsigned(BinOp) &&
23003 "Unexpected bit reduction opcode");
23004 Opnds.push_back(Op.getOperand(0));
23005 Opnds.push_back(Op.getOperand(1));
23006
23007 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23008 SDValue I = Opnds[Slot];
23009 // BFS traverse all BinOp operands.
23010 if (I->getOpcode() == unsigned(BinOp)) {
23011 Opnds.push_back(I->getOperand(0));
23012 Opnds.push_back(I->getOperand(1));
23013 // Re-evaluate the number of nodes to be traversed.
23014 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23015 continue;
23016 }
23017
23018 // Quit if a non-EXTRACT_VECTOR_ELT
23019 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23020 return false;
23021
23022 // Quit if without a constant index.
23023 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23024 if (!Idx)
23025 return false;
23026
23027 SDValue Src = I->getOperand(0);
23028 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23029 if (M == SrcOpMap.end()) {
23030 VT = Src.getValueType();
23031 // Quit if not the same type.
23032 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23033 return false;
23034 unsigned NumElts = VT.getVectorNumElements();
23035 APInt EltCount = APInt::getZero(NumElts);
23036 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23037 SrcOps.push_back(Src);
23038 }
23039
23040 // Quit if element already used.
23041 unsigned CIdx = Idx->getZExtValue();
23042 if (M->second[CIdx])
23043 return false;
23044 M->second.setBit(CIdx);
23045 }
23046
23047 if (SrcMask) {
23048 // Collect the source partial masks.
23049 for (SDValue &SrcOp : SrcOps)
23050 SrcMask->push_back(SrcOpMap[SrcOp]);
23051 } else {
23052 // Quit if not all elements are used.
23053 for (const auto &I : SrcOpMap)
23054 if (!I.second.isAllOnes())
23055 return false;
23056 }
23057
23058 return true;
23059}
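// For example, with BinOp == ISD::OR this recognizes the scalarized any-of tree
//   or (or (extractelt %v, 0), (extractelt %v, 1)),
//      (or (extractelt %v, 2), (extractelt %v, 3))
// and returns SrcOps = { %v } with all four elements marked as used.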
23060
23061// Helper function for comparing all bits of two vectors.
23062static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23063 ISD::CondCode CC, const APInt &OriginalMask,
23064 const X86Subtarget &Subtarget,
23065 SelectionDAG &DAG, X86::CondCode &X86CC) {
23066 EVT VT = LHS.getValueType();
23067 unsigned ScalarSize = VT.getScalarSizeInBits();
23068 if (OriginalMask.getBitWidth() != ScalarSize) {
23069 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23070 return SDValue();
23071 }
23072
23073 // Quit if not convertible to legal scalar or 128/256-bit vector.
23075 return SDValue();
23076
23077 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23078 if (VT.isFloatingPoint())
23079 return SDValue();
23080
23081 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23082 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23083
23084 APInt Mask = OriginalMask;
23085
23086 auto MaskBits = [&](SDValue Src) {
23087 if (Mask.isAllOnes())
23088 return Src;
23089 EVT SrcVT = Src.getValueType();
23090 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23091 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23092 };
23093
23094 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23095 if (VT.getSizeInBits() < 128) {
23096 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23097 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23098 if (IntVT != MVT::i64)
23099 return SDValue();
23100 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23101 MVT::i32, MVT::i32);
23102 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23103 MVT::i32, MVT::i32);
23104 SDValue Lo =
23105 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23106 SDValue Hi =
23107 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23108 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23109 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23110 DAG.getConstant(0, DL, MVT::i32));
23111 }
23112 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23113 DAG.getBitcast(IntVT, MaskBits(LHS)),
23114 DAG.getBitcast(IntVT, MaskBits(RHS)));
23115 }
23116
23117 // Without PTEST, a masked v2i64 or-reduction is not faster than
23118 // scalarization.
23119 bool UseKORTEST = Subtarget.useAVX512Regs();
23120 bool UsePTEST = Subtarget.hasSSE41();
23121 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23122 return SDValue();
23123
23124 // Split down to 128/256/512-bit vector.
23125 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23126
23127 // If the input vector has vector elements wider than the target test size,
23128 // then cast to <X x i64> so it will safely split.
23129 if (ScalarSize > TestSize) {
23130 if (!Mask.isAllOnes())
23131 return SDValue();
23132 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23133 LHS = DAG.getBitcast(VT, LHS);
23134 RHS = DAG.getBitcast(VT, RHS);
23135 Mask = APInt::getAllOnes(64);
23136 }
23137
23138 if (VT.getSizeInBits() > TestSize) {
23139 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23140 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23141 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23142 while (VT.getSizeInBits() > TestSize) {
23143 auto Split = DAG.SplitVector(LHS, DL);
23144 VT = Split.first.getValueType();
23145 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23146 }
23147 RHS = DAG.getAllOnesConstant(DL, VT);
23148 } else if (!UsePTEST && !KnownRHS.isZero()) {
23149 // MOVMSK Special Case:
23150 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23151 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23152 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23153 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23154 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23155 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23156 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23157 V = DAG.getSExtOrTrunc(V, DL, VT);
23158 while (VT.getSizeInBits() > TestSize) {
23159 auto Split = DAG.SplitVector(V, DL);
23160 VT = Split.first.getValueType();
23161 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23162 }
23163 V = DAG.getNOT(DL, V, VT);
23164 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23165 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23166 DAG.getConstant(0, DL, MVT::i32));
23167 } else {
23168 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23169 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23170 while (VT.getSizeInBits() > TestSize) {
23171 auto Split = DAG.SplitVector(V, DL);
23172 VT = Split.first.getValueType();
23173 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23174 }
23175 LHS = V;
23176 RHS = DAG.getConstant(0, DL, VT);
23177 }
23178 }
23179
23180 if (UseKORTEST && VT.is512BitVector()) {
23181 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23182 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23183 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23184 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23185 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23186 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23187 }
23188
23189 if (UsePTEST) {
23190 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23191 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23192 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23193 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23194 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23195 }
23196
23197 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23198 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23199 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23200 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23201 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23202 V = DAG.getNOT(DL, V, MaskVT);
23203 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23204 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23205 DAG.getConstant(0, DL, MVT::i32));
23206}
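// As an example of the sub-128-bit path above, an all-equal test of two v4i8
// values is simply bitcast to i32 (after applying any element mask) and
// finished with a single 32-bit CMP against the other side.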
23207
23208// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23209// to CMP(MOVMSK(PCMPEQB(X,Y))).
23210static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23211 ISD::CondCode CC, const SDLoc &DL,
23212 const X86Subtarget &Subtarget,
23213 SelectionDAG &DAG,
23214 X86::CondCode &X86CC) {
23215 SDValue Op = OrigLHS;
23216
23217 bool CmpNull;
23218 APInt Mask;
23219 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23220 CmpNull = isNullConstant(OrigRHS);
23221 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23222 return SDValue();
23223
23224 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23225 return SDValue();
23226
23227 // Check whether we're masking/truncating an OR-reduction result, in which
23228 // case track the masked bits.
23229 // TODO: Add CmpAllOnes support.
23230 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23231 if (CmpNull) {
23232 switch (Op.getOpcode()) {
23233 case ISD::TRUNCATE: {
23234 SDValue Src = Op.getOperand(0);
23235 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23236 Op.getScalarValueSizeInBits());
23237 Op = Src;
23238 break;
23239 }
23240 case ISD::AND: {
23241 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23242 Mask = Cst->getAPIntValue();
23243 Op = Op.getOperand(0);
23244 }
23245 break;
23246 }
23247 }
23248 }
23249 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23250 CC = ISD::SETEQ;
23251 CmpNull = true;
23252 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23253 } else {
23254 return SDValue();
23255 }
23256
23257 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23258
23259 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23260 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23261 SmallVector<SDValue, 8> VecIns;
23262 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23263 EVT VT = VecIns[0].getValueType();
23264 assert(llvm::all_of(VecIns,
23265 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23266 "Reduction source vector mismatch");
23267
23268 // Quit if not splittable to scalar/128/256/512-bit vector.
23270 return SDValue();
23271
23272 // If more than one full vector is evaluated, AND/OR them first before
23273 // PTEST.
23274 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23275 Slot += 2, e += 1) {
23276 // Each iteration will AND/OR 2 nodes and append the result until there is
23277 // only 1 node left, i.e. the final value of all vectors.
23278 SDValue LHS = VecIns[Slot];
23279 SDValue RHS = VecIns[Slot + 1];
23280 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23281 }
23282
23283 return LowerVectorAllEqual(DL, VecIns.back(),
23284 CmpNull ? DAG.getConstant(0, DL, VT)
23285 : DAG.getAllOnesConstant(DL, VT),
23286 CC, Mask, Subtarget, DAG, X86CC);
23287 }
23288
23289 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23290 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23291 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23292 ISD::NodeType BinOp;
23293 if (SDValue Match =
23294 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23295 EVT MatchVT = Match.getValueType();
23296 return LowerVectorAllEqual(DL, Match,
23297 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23298 : DAG.getAllOnesConstant(DL, MatchVT),
23299 CC, Mask, Subtarget, DAG, X86CC);
23300 }
23301 }
23302
23303 if (Mask.isAllOnes()) {
23304 assert(!Op.getValueType().isVector() &&
23305 "Illegal vector type for reduction pattern");
23306 SDValue Src = peekThroughBitcasts(Op);
23307 if (Src.getValueType().isFixedLengthVector() &&
23308 Src.getValueType().getScalarType() == MVT::i1) {
23309 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23310 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23311 if (Src.getOpcode() == ISD::SETCC) {
23312 SDValue LHS = Src.getOperand(0);
23313 SDValue RHS = Src.getOperand(1);
23314 EVT LHSVT = LHS.getValueType();
23315 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23316 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23318 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23319 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23320 X86CC);
23321 }
23322 }
23323 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23324 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23325 // Peek through truncation, mask the LSB and compare against zero/LSB.
23326 if (Src.getOpcode() == ISD::TRUNCATE) {
23327 SDValue Inner = Src.getOperand(0);
23328 EVT InnerVT = Inner.getValueType();
23330 unsigned BW = InnerVT.getScalarSizeInBits();
23331 APInt SrcMask = APInt(BW, 1);
23332 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23333 return LowerVectorAllEqual(DL, Inner,
23334 DAG.getConstant(Cmp, DL, InnerVT), CC,
23335 SrcMask, Subtarget, DAG, X86CC);
23336 }
23337 }
23338 }
23339 }
23340
23341 return SDValue();
23342}
23343
23344/// return true if \c Op has a use that doesn't just read flags.
23345static bool hasNonFlagsUse(SDValue Op) {
23346 for (SDUse &Use : Op->uses()) {
23347 SDNode *User = Use.getUser();
23348 unsigned UOpNo = Use.getOperandNo();
23349 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23350 // Look past truncate.
23351 UOpNo = User->use_begin()->getOperandNo();
23352 User = User->use_begin()->getUser();
23353 }
23354
23355 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23356 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23357 return true;
23358 }
23359 return false;
23360}
23361
23362// Transform to an x86-specific ALU node with flags if there is a chance of
23363// using an RMW op or only the flags are used. Otherwise, leave
23364// the node alone and emit a 'cmp' or 'test' instruction.
23365static bool isProfitableToUseFlagOp(SDValue Op) {
23366 for (SDNode *U : Op->users())
23367 if (U->getOpcode() != ISD::CopyToReg &&
23368 U->getOpcode() != ISD::SETCC &&
23369 U->getOpcode() != ISD::STORE)
23370 return false;
23371
23372 return true;
23373}
23374
23375/// Emit nodes that will be selected as "test Op0,Op0", or something
23376/// equivalent.
23377static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23378 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23379 // CF and OF aren't always set the way we want. Determine which
23380 // of these we need.
23381 bool NeedCF = false;
23382 bool NeedOF = false;
23383 switch (X86CC) {
23384 default: break;
23385 case X86::COND_A: case X86::COND_AE:
23386 case X86::COND_B: case X86::COND_BE:
23387 NeedCF = true;
23388 break;
23389 case X86::COND_G: case X86::COND_GE:
23390 case X86::COND_L: case X86::COND_LE:
23391 case X86::COND_O: case X86::COND_NO: {
23392 // Check if we really need to set the
23393 // Overflow flag. If NoSignedWrap is present
23394 // that is not actually needed.
23395 switch (Op->getOpcode()) {
23396 case ISD::ADD:
23397 case ISD::SUB:
23398 case ISD::MUL:
23399 case ISD::SHL:
23400 if (Op.getNode()->getFlags().hasNoSignedWrap())
23401 break;
23402 [[fallthrough]];
23403 default:
23404 NeedOF = true;
23405 break;
23406 }
23407 break;
23408 }
23409 }
23410 // See if we can use the EFLAGS value from the operand instead of
23411 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23412 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23413 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23414 // Emit a CMP with 0, which is the TEST pattern.
23415 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23416 DAG.getConstant(0, dl, Op.getValueType()));
23417 }
23418 unsigned Opcode = 0;
23419 unsigned NumOperands = 0;
23420
23421 SDValue ArithOp = Op;
23422
23423 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23424 // which may be the result of a CAST. We use the variable 'Op', which is the
23425 // non-casted variable when we check for possible users.
23426 switch (ArithOp.getOpcode()) {
23427 case ISD::AND:
23428 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23429 // because a TEST instruction will be better.
23430 if (!hasNonFlagsUse(Op))
23431 break;
23432
23433 [[fallthrough]];
23434 case ISD::ADD:
23435 case ISD::SUB:
23436 case ISD::OR:
23437 case ISD::XOR:
23438 if (!isProfitableToUseFlagOp(Op))
23439 break;
23440
23441 // Otherwise use a regular EFLAGS-setting instruction.
23442 switch (ArithOp.getOpcode()) {
23443 // clang-format off
23444 default: llvm_unreachable("unexpected operator!");
23445 case ISD::ADD: Opcode = X86ISD::ADD; break;
23446 case ISD::SUB: Opcode = X86ISD::SUB; break;
23447 case ISD::XOR: Opcode = X86ISD::XOR; break;
23448 case ISD::AND: Opcode = X86ISD::AND; break;
23449 case ISD::OR: Opcode = X86ISD::OR; break;
23450 // clang-format on
23451 }
23452
23453 NumOperands = 2;
23454 break;
23455 case X86ISD::ADD:
23456 case X86ISD::SUB:
23457 case X86ISD::OR:
23458 case X86ISD::XOR:
23459 case X86ISD::AND:
23460 return SDValue(Op.getNode(), 1);
23461 case ISD::SSUBO:
23462 case ISD::USUBO: {
23463 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23464 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23465 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23466 Op->getOperand(1)).getValue(1);
23467 }
23468 default:
23469 break;
23470 }
23471
23472 if (Opcode == 0) {
23473 // Emit a CMP with 0, which is the TEST pattern.
23474 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23475 DAG.getConstant(0, dl, Op.getValueType()));
23476 }
23477 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23478 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23479
23480 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23481 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23482 return SDValue(New.getNode(), 1);
23483}
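// For example, when the value of (x & y) is needed in addition to the
// "(x & y) == 0" test, the AND above can be rewritten to the flag-producing
// X86ISD::AND so one instruction both produces the value and sets ZF,
// avoiding a separate 'test'.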
23484
23485/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23486/// equivalent.
23487static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23488 const SDLoc &dl, SelectionDAG &DAG,
23489 const X86Subtarget &Subtarget) {
23490 if (isNullConstant(Op1))
23491 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23492
23493 EVT CmpVT = Op0.getValueType();
23494
23495 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23496 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23497
23498 // Only promote the compare up to I32 if it is a 16 bit operation
23499 // with an immediate. 16 bit immediates are to be avoided unless the target
23500 // isn't slowed down by length changing prefixes, we're optimizing for
23501 // codesize or the comparison is with a folded load.
23502 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23503 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23504 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23505 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23506 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23507 // Don't do this if the immediate can fit in 8-bits.
23508 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23509 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23510 unsigned ExtendOp =
23511 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23512 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23513 // For equality comparisons try to use SIGN_EXTEND if the input was
23514 // truncate from something with enough sign bits.
23515 if (Op0.getOpcode() == ISD::TRUNCATE) {
23516 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23517 ExtendOp = ISD::SIGN_EXTEND;
23518 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23519 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23520 ExtendOp = ISD::SIGN_EXTEND;
23521 }
23522 }
23523
23524 CmpVT = MVT::i32;
23525 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23526 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23527 }
23528 }
23529
23530 // Try to shrink i64 compares if the input has enough zero bits.
23531 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23532 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23533 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23534 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23535 CmpVT = MVT::i32;
23536 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23537 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23538 }
23539
23540 // Try to shrink all i64 compares if the inputs are representable as signed
23541 // i32.
23542 if (CmpVT == MVT::i64 &&
23543 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23544 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23545 CmpVT = MVT::i32;
23546 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23547 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23548 }
23549
23550 // 0-x == y --> x+y == 0
23551 // 0-x != y --> x+y != 0
23552 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23553 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23554 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23556 return Add.getValue(1);
23557 }
23558
23559 // x == 0-y --> x+y == 0
23560 // x != 0-y --> x+y != 0
23561 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23562 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23563 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23564 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23565 return Add.getValue(1);
23566 }
23567
23568 // If we already have an XOR of the ops, use that to check for equality.
23569 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23570 unsigned X86Opc = X86ISD::SUB;
23571 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23572 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23573 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23574 X86Opc = X86ISD::XOR;
23575
23576 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23577 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23578 return CmpOp.getValue(1);
23579}
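// As an example of the shrinking logic above, an i64 equality compare whose
// operands are known to be zero- or sign-extended from 32 bits is narrowed to
// a 32-bit compare, which is a smaller encoding.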
23580
23585
23586bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23587 SDNode *N, SDValue, SDValue IntPow2) const {
23588 if (N->getOpcode() == ISD::FDIV)
23589 return true;
23590
23591 EVT FPVT = N->getValueType(0);
23592 EVT IntVT = IntPow2.getValueType();
23593
23594 // This indicates a non-free bitcast.
23595 // TODO: This is probably overly conservative as we will need to scale the
23596 // integer vector anyways for the int->fp cast.
23597 if (FPVT.isVector() &&
23598 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23599 return false;
23600
23601 return true;
23602}
23603
23604/// Check if replacement of SQRT with RSQRT should be disabled.
23605bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23606 EVT VT = Op.getValueType();
23607
23608 // We don't need to replace SQRT with RSQRT for half type.
23609 if (VT.getScalarType() == MVT::f16)
23610 return true;
23611
23612 // We never want to use both SQRT and RSQRT instructions for the same input.
23613 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23614 return false;
23615
23616 if (VT.isVector())
23617 return Subtarget.hasFastVectorFSQRT();
23618 return Subtarget.hasFastScalarFSQRT();
23619}
23620
23621/// The minimum architected relative accuracy is 2^-12. We need one
23622/// Newton-Raphson step to have a good float result (24 bits of precision).
23623SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23624 SelectionDAG &DAG, int Enabled,
23625 int &RefinementSteps,
23626 bool &UseOneConstNR,
23627 bool Reciprocal) const {
23628 SDLoc DL(Op);
23629 EVT VT = Op.getValueType();
23630
23631 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23632 // It is likely not profitable to do this for f64 because a double-precision
23633 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23634 // instructions: convert to single, rsqrtss, convert back to double, refine
23635 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23636 // along with FMA, this could be a throughput win.
23637 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23638 // after legalize types.
23639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23641 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23642 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23643 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23644 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23645 RefinementSteps = 1;
23646
23647 UseOneConstNR = false;
23648 // There is no FSQRT for 512-bits, but there is RSQRT14.
23649 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23650 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23651 if (RefinementSteps == 0 && !Reciprocal)
23652 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23653 return Estimate;
23654 }
23655
23656 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23657 Subtarget.hasFP16()) {
23658 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23659 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23660 RefinementSteps = 0;
23661
23662 if (VT == MVT::f16) {
23663 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
23664 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23665 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23666 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23667 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23668 }
23669
23670 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23671 }
23672 return SDValue();
23673}
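// The single refinement step requested above is the usual Newton-Raphson
// iteration for 1/sqrt(a), applied by the generic estimate machinery:
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
// which roughly doubles the ~12 bits of accuracy of rsqrtss/rsqrtps.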
23674
23675/// The minimum architected relative accuracy is 2^-12. We need one
23676/// Newton-Raphson step to have a good float result (24 bits of precision).
23677SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23678 int Enabled,
23679 int &RefinementSteps) const {
23680 SDLoc DL(Op);
23681 EVT VT = Op.getValueType();
23682
23683 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23684 // It is likely not profitable to do this for f64 because a double-precision
23685 // reciprocal estimate with refinement on x86 prior to FMA requires
23686 // 15 instructions: convert to single, rcpss, convert back to double, refine
23687 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23688 // along with FMA, this could be a throughput win.
23689
23690 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23692 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23693 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23694 // Enable estimate codegen with 1 refinement step for vector division.
23695 // Scalar division estimates are disabled because they break too much
23696 // real-world code. These defaults are intended to match GCC behavior.
23697 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23698 return SDValue();
23699
23700 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23701 RefinementSteps = 1;
23702
23703 // There is no FSQRT for 512-bits, but there is RCP14.
23704 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23705 return DAG.getNode(Opcode, DL, VT, Op);
23706 }
23707
23708 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23709 Subtarget.hasFP16()) {
23710 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23711 RefinementSteps = 0;
23712
23713 if (VT == MVT::f16) {
23714 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
23715 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23716 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23717 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23718 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23719 }
23720
23721 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23722 }
23723 return SDValue();
23724}
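// Likewise for the reciprocal: the refinement step applied by the generic code
// is the Newton-Raphson iteration x1 = x0 * (2.0 - d * x0), seeded with the
// rcpss/rcpps (or RCP14) estimate produced above.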
23725
23726/// If we have at least two divisions that use the same divisor, convert to
23727/// multiplication by a reciprocal. This may need to be adjusted for a given
23728/// CPU if a division's cost is not at least twice the cost of a multiplication.
23729/// This is because we still need one division to calculate the reciprocal and
23730/// then we need two multiplies by that reciprocal as replacements for the
23731/// original divisions.
23732unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23733 return 2;
23734}
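// For example, under the appropriate fast-math flags:
//   x = a / d; y = b / d;
// is rewritten by the generic combine as
//   r = 1.0 / d; x = a * r; y = b * r;
// which only pays off once at least two divisions share the divisor, hence 2.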
23735
23736SDValue
23737X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23738 SelectionDAG &DAG,
23739 SmallVectorImpl<SDNode *> &Created) const {
23740 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23741 if (isIntDivCheap(N->getValueType(0), Attr))
23742 return SDValue(N,0); // Lower SDIV as SDIV
23743
23744 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23745 "Unexpected divisor!");
23746
23747 // Only perform this transform if CMOV is supported otherwise the select
23748 // below will become a branch.
23749 if (!Subtarget.canUseCMOV())
23750 return SDValue();
23751
23752 // fold (sdiv X, pow2)
23753 EVT VT = N->getValueType(0);
23754 // FIXME: Support i8.
23755 if (VT != MVT::i16 && VT != MVT::i32 &&
23756 !(Subtarget.is64Bit() && VT == MVT::i64))
23757 return SDValue();
23758
23759 // If the divisor is 2 or -2, the default expansion is better.
23760 if (Divisor == 2 ||
23761 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23762 return SDValue();
23763
23764 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23765}
23766
23767/// Result of 'and' is compared against zero. Change to a BT node if possible.
23768/// Returns the BT node and the condition code needed to use it.
23769static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23770 SelectionDAG &DAG, X86::CondCode &X86CC) {
23771 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23772 SDValue Op0 = And.getOperand(0);
23773 SDValue Op1 = And.getOperand(1);
23774 if (Op0.getOpcode() == ISD::TRUNCATE)
23775 Op0 = Op0.getOperand(0);
23776 if (Op1.getOpcode() == ISD::TRUNCATE)
23777 Op1 = Op1.getOperand(0);
23778
23779 SDValue Src, BitNo;
23780 if (Op1.getOpcode() == ISD::SHL)
23781 std::swap(Op0, Op1);
23782 if (Op0.getOpcode() == ISD::SHL) {
23783 if (isOneConstant(Op0.getOperand(0))) {
23784 // If we looked past a truncate, check that it's only truncating away
23785 // known zeros.
23786 unsigned BitWidth = Op0.getValueSizeInBits();
23787 unsigned AndBitWidth = And.getValueSizeInBits();
23788 if (BitWidth > AndBitWidth) {
23789 KnownBits Known = DAG.computeKnownBits(Op0);
23790 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23791 return SDValue();
23792 }
23793 Src = Op1;
23794 BitNo = Op0.getOperand(1);
23795 }
23796 } else if (Op1.getOpcode() == ISD::Constant) {
23797 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23798 uint64_t AndRHSVal = AndRHS->getZExtValue();
23799 SDValue AndLHS = Op0;
23800
23801 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23802 Src = AndLHS.getOperand(0);
23803 BitNo = AndLHS.getOperand(1);
23804 } else {
23805 // Use BT if the immediate can't be encoded in a TEST instruction or we
23806 // are optimizing for size and the immediate won't fit in a byte.
23807 bool OptForSize = DAG.shouldOptForSize();
23808 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23809 isPowerOf2_64(AndRHSVal)) {
23810 Src = AndLHS;
23811 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23812 Src.getValueType());
23813 }
23814 }
23815 }
23816
23817 // No patterns found, give up.
23818 if (!Src.getNode())
23819 return SDValue();
23820
23821 // Remove any bit flip.
23822 if (isBitwiseNot(Src)) {
23823 Src = Src.getOperand(0);
23824 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23825 }
23826
23827 // Attempt to create the X86ISD::BT node.
23828 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23829 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23830 return BT;
23831 }
23832
23833 return SDValue();
23834}
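// For example, a test like "((x >> n) & 1) != 0" matches the SRL form above and
// becomes roughly a BT of x with bit index n followed by SETB (SETAE is used
// for the == 0 form, since BT puts the selected bit in CF).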
23835
23836// Check if pre-AVX condcode can be performed by a single FCMP op.
23837static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23838 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23839}
23840
23841/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23842/// CMPs.
23843static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23844 SDValue &Op1, bool &IsAlwaysSignaling) {
23845 unsigned SSECC;
23846 bool Swap = false;
23847
23848 // SSE Condition code mapping:
23849 // 0 - EQ
23850 // 1 - LT
23851 // 2 - LE
23852 // 3 - UNORD
23853 // 4 - NEQ
23854 // 5 - NLT
23855 // 6 - NLE
23856 // 7 - ORD
23857 switch (SetCCOpcode) {
23858 // clang-format off
23859 default: llvm_unreachable("Unexpected SETCC condition");
23860 case ISD::SETOEQ:
23861 case ISD::SETEQ: SSECC = 0; break;
23862 case ISD::SETOGT:
23863 case ISD::SETGT: Swap = true; [[fallthrough]];
23864 case ISD::SETLT:
23865 case ISD::SETOLT: SSECC = 1; break;
23866 case ISD::SETOGE:
23867 case ISD::SETGE: Swap = true; [[fallthrough]];
23868 case ISD::SETLE:
23869 case ISD::SETOLE: SSECC = 2; break;
23870 case ISD::SETUO: SSECC = 3; break;
23871 case ISD::SETUNE:
23872 case ISD::SETNE: SSECC = 4; break;
23873 case ISD::SETULE: Swap = true; [[fallthrough]];
23874 case ISD::SETUGE: SSECC = 5; break;
23875 case ISD::SETULT: Swap = true; [[fallthrough]];
23876 case ISD::SETUGT: SSECC = 6; break;
23877 case ISD::SETO: SSECC = 7; break;
23878 case ISD::SETUEQ: SSECC = 8; break;
23879 case ISD::SETONE: SSECC = 12; break;
23880 // clang-format on
23881 }
23882 if (Swap)
23883 std::swap(Op0, Op1);
23884
23885 switch (SetCCOpcode) {
23886 default:
23887 IsAlwaysSignaling = true;
23888 break;
23889 case ISD::SETEQ:
23890 case ISD::SETOEQ:
23891 case ISD::SETUEQ:
23892 case ISD::SETNE:
23893 case ISD::SETONE:
23894 case ISD::SETUNE:
23895 case ISD::SETO:
23896 case ISD::SETUO:
23897 IsAlwaysSignaling = false;
23898 break;
23899 }
23900
23901 return SSECC;
23902}
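// For example, ISD::SETOGT has no direct encoding in the SSE predicate table
// above, so the operands are swapped and code 1 (LT) is used instead:
//   a > b   ==>   cmp(b, a, /*imm=*/1)   i.e. b < a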
23903
23904/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23905/// concatenate the result back.
23906static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23907 SelectionDAG &DAG, const SDLoc &dl) {
23908 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23909 "Unsupported VTs!");
23910 SDValue CC = DAG.getCondCode(Cond);
23911
23912 // Extract the LHS Lo/Hi vectors
23913 SDValue LHS1, LHS2;
23914 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23915
23916 // Extract the RHS Lo/Hi vectors
23917 SDValue RHS1, RHS2;
23918 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23919
23920 // Issue the operation on the smaller types and concatenate the result back
23921 EVT LoVT, HiVT;
23922 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23923 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23924 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23925 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23926}
23927
23928static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23929 SelectionDAG &DAG) {
23930 SDValue Op0 = Op.getOperand(0);
23931 SDValue Op1 = Op.getOperand(1);
23932 SDValue CC = Op.getOperand(2);
23933 MVT VT = Op.getSimpleValueType();
23934 assert(VT.getVectorElementType() == MVT::i1 &&
23935 "Cannot set masked compare for this operation");
23936
23937 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23938
23939 // Prefer SETGT over SETLT.
23940 if (SetCCOpcode == ISD::SETLT) {
23941 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23942 std::swap(Op0, Op1);
23943 }
23944
23945 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23946}
23947
23948/// Given a buildvector constant, return a new vector constant with each element
23949/// incremented or decremented. If incrementing or decrementing would result in
23950/// unsigned overflow or underflow or this is not a simple vector constant,
23951/// return an empty value.
23952static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23953 bool NSW) {
23954 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23955 if (!BV || !V.getValueType().isSimple())
23956 return SDValue();
23957
23958 MVT VT = V.getSimpleValueType();
23959 MVT EltVT = VT.getVectorElementType();
23960 unsigned NumElts = VT.getVectorNumElements();
23961 SmallVector<SDValue, 16> NewVecC;
23962 SDLoc DL(V);
23963 for (unsigned i = 0; i < NumElts; ++i) {
23964 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23965 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23966 return SDValue();
23967
23968 // Avoid overflow/underflow.
23969 const APInt &EltC = Elt->getAPIntValue();
23970 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23971 return SDValue();
23972 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23973 (!IsInc && EltC.isMinSignedValue())))
23974 return SDValue();
23975
23976 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23977 }
23978
23979 return DAG.getBuildVector(VT, DL, NewVecC);
23980}
23981
23982/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23983/// Op0 u<= Op1:
23984/// t = psubus Op0, Op1
23985/// pcmpeq t, <0..0>
23986static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23987 ISD::CondCode Cond, const SDLoc &dl,
23988 const X86Subtarget &Subtarget,
23989 SelectionDAG &DAG) {
23990 if (!Subtarget.hasSSE2())
23991 return SDValue();
23992
23993 MVT VET = VT.getVectorElementType();
23994 if (VET != MVT::i8 && VET != MVT::i16)
23995 return SDValue();
23996
23997 switch (Cond) {
23998 default:
23999 return SDValue();
24000 case ISD::SETULT: {
24001 // If the comparison is against a constant we can turn this into a
24002 // setule. With psubus, setule does not require a swap. This is
24003 // beneficial because the constant in the register is no longer
24004 // clobbered as a destructive destination, so it can be hoisted out of a loop.
24005 // Only do this pre-AVX since vpcmp* is no longer destructive.
24006 if (Subtarget.hasAVX())
24007 return SDValue();
24008 SDValue ULEOp1 =
24009 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24010 if (!ULEOp1)
24011 return SDValue();
24012 Op1 = ULEOp1;
24013 break;
24014 }
24015 case ISD::SETUGT: {
24016 // If the comparison is against a constant, we can turn this into a setuge.
24017 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24018 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24019 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24020 SDValue UGEOp1 =
24021 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24022 if (!UGEOp1)
24023 return SDValue();
24024 Op1 = Op0;
24025 Op0 = UGEOp1;
24026 break;
24027 }
24028 // Psubus is better than flip-sign because it requires no inversion.
24029 case ISD::SETUGE:
24030 std::swap(Op0, Op1);
24031 break;
24032 case ISD::SETULE:
24033 break;
24034 }
24035
24036 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24037 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24038 DAG.getConstant(0, dl, VT));
24039}
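// For example, a v16i8 "x u<= y" becomes
//   t = usubsat(x, y)      // lanes where x[i] <= y[i] saturate to zero
//   r = pcmpeq(t, 0)
// and "x u< C" is first rewritten as "x u<= C-1" via incDecVectorConstant so
// that no operand swap is needed.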
24040
24041static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24042 SelectionDAG &DAG) {
24043 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24044 Op.getOpcode() == ISD::STRICT_FSETCCS;
24045 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24046 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24047 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24048 MVT VT = Op->getSimpleValueType(0);
24049 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24050 MVT OpVT = Op0.getSimpleValueType();
24051 SDLoc dl(Op);
24052
24053 if (OpVT.isFloatingPoint()) {
24054 MVT EltVT = OpVT.getVectorElementType();
24055 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24056 EltVT == MVT::f64);
24057
24058 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24059 if (isSoftF16(EltVT, Subtarget)) {
24060 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24061 return SDValue();
24062
24063 // Break 256-bit FP vector compare into smaller ones.
24064 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24065 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24066
24067 // Break 512-bit FP vector compare into smaller ones.
24068 if (OpVT.is512BitVector())
24069 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24070
24071 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24072 if (IsStrict) {
24073 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24074 {Chain, Op0});
24075 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24076 {Chain, Op1});
24077 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24078 {Chain, Op0, Op1, CC});
24079 }
24080 MVT DVT = VT.getVectorElementType() == MVT::i16
24081 ? VT.changeVectorElementType(MVT::i32)
24082 : VT;
24083 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24085 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24086 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24087 }
24088
24089 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24090
24091 // If we have a strict compare with a vXi1 result and the input is 128/256
24092 // bits we can't use a masked compare unless we have VLX. If we use a wider
24093 // compare like we do for non-strict, we might trigger spurious exceptions
24094 // from the upper elements. Instead emit an AVX compare and convert to mask.
24095 unsigned Opc;
24096 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24097 (!IsStrict || Subtarget.hasVLX() ||
24098 Op0.getSimpleValueType().is512BitVector())) {
24099#ifndef NDEBUG
24100 unsigned Num = VT.getVectorNumElements();
24101 assert(Num <= 16 ||
24102 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24103#endif
24104 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24105 } else {
24106 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24107 // The SSE/AVX packed FP comparison nodes are defined with a
24108 // floating-point vector result that matches the operand type. This allows
24109 // them to work with an SSE1 target (integer vector types are not legal).
24110 VT = Op0.getSimpleValueType();
24111 }
24112
24113 SDValue Cmp;
24114 bool IsAlwaysSignaling;
24115 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24116 if (!Subtarget.hasAVX()) {
24117 // TODO: We could use following steps to handle a quiet compare with
24118 // signaling encodings.
24119 // 1. Get ordered masks from a quiet ISD::SETO
24120 // 2. Use the masks to mask potential unordered elements in operand A, B
24121 // 3. Get the compare results of masked A, B
24122 // 4. Calculating final result using the mask and result from 3
24123 // But currently, we just fall back to scalar operations.
24124 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24125 return SDValue();
24126
24127 // Insert an extra signaling instruction to raise exception.
24128 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24129 SDValue SignalCmp = DAG.getNode(
24130 Opc, dl, {VT, MVT::Other},
24131 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24132 // FIXME: It seems we need to update the flags of all new strict nodes.
24133 // Otherwise, mayRaiseFPException in MI will return false due to
24134 // NoFPExcept = false by default. However, I didn't find it in other
24135 // patches.
24136 SignalCmp->setFlags(Op->getFlags());
24137 Chain = SignalCmp.getValue(1);
24138 }
24139
24140 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24141 // emit two comparisons and a logic op to tie them together.
24142 if (!cheapX86FSETCC_SSE(Cond)) {
24143 // LLVM predicate is SETUEQ or SETONE.
24144 unsigned CC0, CC1;
24145 unsigned CombineOpc;
24146 if (Cond == ISD::SETUEQ) {
24147 CC0 = 3; // UNORD
24148 CC1 = 0; // EQ
24149 CombineOpc = X86ISD::FOR;
24150 } else {
24151 assert(Cond == ISD::SETONE);
24152 CC0 = 7; // ORD
24153 CC1 = 4; // NEQ
24154 CombineOpc = X86ISD::FAND;
24155 }
24156
24157 SDValue Cmp0, Cmp1;
24158 if (IsStrict) {
24159 Cmp0 = DAG.getNode(
24160 Opc, dl, {VT, MVT::Other},
24161 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24162 Cmp1 = DAG.getNode(
24163 Opc, dl, {VT, MVT::Other},
24164 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24165 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24166 Cmp1.getValue(1));
24167 } else {
24168 Cmp0 = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24170 Cmp1 = DAG.getNode(
24171 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24172 }
24173 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24174 } else {
24175 if (IsStrict) {
24176 Cmp = DAG.getNode(
24177 Opc, dl, {VT, MVT::Other},
24178 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24179 Chain = Cmp.getValue(1);
24180 } else
24181 Cmp = DAG.getNode(
24182 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24183 }
24184 } else {
24185 // Handle all other FP comparisons here.
24186 if (IsStrict) {
24187 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24188 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24189 Cmp = DAG.getNode(
24190 Opc, dl, {VT, MVT::Other},
24191 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24192 Chain = Cmp.getValue(1);
24193 } else
24194 Cmp = DAG.getNode(
24195 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24196 }
24197
24198 if (VT.getFixedSizeInBits() >
24199 Op.getSimpleValueType().getFixedSizeInBits()) {
24200 // We emitted a compare with an XMM/YMM result. Finish converting to a
24201 // mask register using a vptestm.
24202 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24203 Cmp = DAG.getBitcast(CastVT, Cmp);
24204 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24205 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24206 } else {
24207 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24208 // the result type of SETCC. The bitcast is expected to be optimized
24209 // away during combining/isel.
24210 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24211 }
24212
24213 if (IsStrict)
24214 return DAG.getMergeValues({Cmp, Chain}, dl);
24215
24216 return Cmp;
24217 }
24218
24219 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24220
24221 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24222 assert(VTOp0 == Op1.getSimpleValueType() &&
24223 "Expected operands with same type!");
24225 "Invalid number of packed elements for source and destination!");
24226
24227 // The non-AVX512 code below works under the assumption that source and
24228 // destination types are the same.
24229 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24230 "Value types for source and destination must be the same!");
24231
24232 // The result is boolean, but operands are int/float
24233 if (VT.getVectorElementType() == MVT::i1) {
24234 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24235 // but there is no compare instruction for i8 and i16 elements in KNL.
24236 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24237 "Unexpected operand type");
24238 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24239 }
24240
24241 // Lower using XOP integer comparisons.
24242 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24243 // Translate compare code to XOP PCOM compare mode.
24244 unsigned CmpMode = 0;
24245 switch (Cond) {
24246 // clang-format off
24247 default: llvm_unreachable("Unexpected SETCC condition");
24248 case ISD::SETULT:
24249 case ISD::SETLT: CmpMode = 0x00; break;
24250 case ISD::SETULE:
24251 case ISD::SETLE: CmpMode = 0x01; break;
24252 case ISD::SETUGT:
24253 case ISD::SETGT: CmpMode = 0x02; break;
24254 case ISD::SETUGE:
24255 case ISD::SETGE: CmpMode = 0x03; break;
24256 case ISD::SETEQ: CmpMode = 0x04; break;
24257 case ISD::SETNE: CmpMode = 0x05; break;
24258 // clang-format on
24259 }
24260
24261 // Are we comparing unsigned or signed integers?
24262 unsigned Opc =
24263 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24264
24265 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24266 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24267 }
24268
24269 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24270 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24271 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24272 SDValue BC0 = peekThroughBitcasts(Op0);
24273 if (BC0.getOpcode() == ISD::AND &&
24274 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24275 /*AllowUndefs=*/false)) {
24276 Cond = ISD::SETEQ;
24277 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24278 }
24279 }
24280
24281 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
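// Illustration: for v4i32 and C == 16, (X & 16) == 16 becomes
// SRA(SHL(X, 27), 31), which moves bit 4 into the sign bit and then splats
// it, yielding all-ones exactly when the bit was set.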
24282 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24283 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24284 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24285 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24286 unsigned BitWidth = VT.getScalarSizeInBits();
24287 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24288
24289 SDValue Result = Op0.getOperand(0);
24290 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24291 DAG.getConstant(ShiftAmt, dl, VT));
24292 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24293 DAG.getConstant(BitWidth - 1, dl, VT));
24294 return Result;
24295 }
24296 }
24297
24298 // Break 256-bit integer vector compare into smaller ones.
24299 if (VT.is256BitVector() && !Subtarget.hasInt256())
24300 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24301
24302 // Break 512-bit integer vector compare into smaller ones.
24303 // TODO: Try harder to use VPCMPx + VPMOV2x?
24304 if (VT.is512BitVector())
24305 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24306
24307 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24308 // not-of-PCMPEQ:
24309 // X != INT_MIN --> X >s INT_MIN
24310 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24311 // +X != 0 --> +X >s 0
24312 APInt ConstValue;
24313 if (Cond == ISD::SETNE &&
24314 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24315 if (ConstValue.isMinSignedValue())
24316 Cond = ISD::SETGT;
24317 else if (ConstValue.isMaxSignedValue())
24318 Cond = ISD::SETLT;
24319 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24320 Cond = ISD::SETGT;
24321 }
24322
24323 // If both operands are known non-negative, then an unsigned compare is the
24324 // same as a signed compare and there's no need to flip signbits.
24325 // TODO: We could check for more general simplifications here since we're
24326 // computing known bits.
24327 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24328 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24329
24330 // Special case: Use min/max operations for unsigned compares.
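// Illustration: X <=u Y iff umin(X, Y) == X, and X >=u Y iff umax(X, Y) == X,
// so a single UMIN/UMAX plus a PCMPEQ can replace the unsigned compare.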
24331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24332 if (ISD::isUnsignedIntSetCC(Cond) &&
24333 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24334 TLI.isOperationLegal(ISD::UMIN, VT)) {
24335 // If we have a constant operand, increment/decrement it and change the
24336 // condition to avoid an invert.
24337 if (Cond == ISD::SETUGT) {
24338 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24339 if (SDValue UGTOp1 =
24340 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24341 Op1 = UGTOp1;
24342 Cond = ISD::SETUGE;
24343 }
24344 }
24345 if (Cond == ISD::SETULT) {
24346 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24347 if (SDValue ULTOp1 =
24348 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24349 Op1 = ULTOp1;
24350 Cond = ISD::SETULE;
24351 }
24352 }
24353 bool Invert = false;
24354 unsigned Opc;
24355 switch (Cond) {
24356 // clang-format off
24357 default: llvm_unreachable("Unexpected condition code");
24358 case ISD::SETUGT: Invert = true; [[fallthrough]];
24359 case ISD::SETULE: Opc = ISD::UMIN; break;
24360 case ISD::SETULT: Invert = true; [[fallthrough]];
24361 case ISD::SETUGE: Opc = ISD::UMAX; break;
24362 // clang-format on
24363 }
24364
24365 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24366 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24367
24368 // If the logical-not of the result is required, perform that now.
24369 if (Invert)
24370 Result = DAG.getNOT(dl, Result, VT);
24371
24372 return Result;
24373 }
24374
24375 // Try to use SUBUS and PCMPEQ.
24376 if (FlipSigns)
24377 if (SDValue V =
24378 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24379 return V;
24380
24381 // We are handling one of the integer comparisons here. Since SSE only has
24382 // GT and EQ comparisons for integer, swapping operands and multiple
24383 // operations may be required for some comparisons.
24384 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24385 : X86ISD::PCMPGT;
24386 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24387 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24388 bool Invert = Cond == ISD::SETNE ||
24389 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24390
24391 if (Swap)
24392 std::swap(Op0, Op1);
24393
24394 // Check that the operation in question is available (most are plain SSE2,
24395 // but PCMPGTQ and PCMPEQQ have different requirements).
24396 if (VT == MVT::v2i64) {
24397 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24398 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24399
24400 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24401 // the odd elements over the even elements.
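// (0 >s X) on i64 depends only on the sign bit, which lives in the high
// dword, so a v4i32 PCMPGT plus a {1,1,3,3} splat of the odd (high) elements
// produces the same per-element mask.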
24402 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24403 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24404 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24405
24406 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24407 static const int MaskHi[] = { 1, 1, 3, 3 };
24408 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24409
24410 return DAG.getBitcast(VT, Result);
24411 }
24412
24413 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24414 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24415 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24416
24417 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24418 static const int MaskHi[] = { 1, 1, 3, 3 };
24419 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24420
24421 return DAG.getBitcast(VT, Result);
24422 }
24423
24424 // If the i64 elements are sign-extended enough to be representable as i32
24425 // then we can compare the lower i32 bits and splat.
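// With more than 32 sign bits, each i64 operand is the sign-extension of its
// low dword, so a signed v4i32 compare of the low dwords gives the right
// result and the {0,0,2,2} shuffle splats it across each 64-bit lane.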
24426 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24427 DAG.ComputeNumSignBits(Op1) > 32) {
24428 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24429 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24430
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 static const int MaskLo[] = {0, 0, 2, 2};
24433 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24434
24435 return DAG.getBitcast(VT, Result);
24436 }
24437
24438 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24439 // bits of the inputs before performing those operations. The lower
24440 // compare is always unsigned.
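// e.g. to order the low dwords 0x00000001 and 0xFFFFFFFF as unsigned values
// with signed PCMPGTD, both are XORed with 0x80000000 first, mapping them to
// 0x80000001 and 0x7FFFFFFF so the signed ordering matches.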
24441 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24442 : 0x0000000080000000ULL,
24443 dl, MVT::v2i64);
24444
24445 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24446 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24447
24448 // Cast everything to the right type.
24449 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24450 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24451
24452 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24453 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24454 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24455
24456 // Create masks for only the low parts/high parts of the 64 bit integers.
24457 static const int MaskHi[] = { 1, 1, 3, 3 };
24458 static const int MaskLo[] = { 0, 0, 2, 2 };
24459 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24460 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24461 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24462
24463 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24464 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24465
24466 if (Invert)
24467 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24468
24469 return DAG.getBitcast(VT, Result);
24470 }
24471
24472 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24473 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24474 // pcmpeqd + pshufd + pand.
24475 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24476
24477 // First cast everything to the right type.
24478 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24479 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24480
24481 // Do the compare.
24482 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24483
24484 // Make sure the lower and upper halves are both all-ones.
24485 static const int Mask[] = { 1, 0, 3, 2 };
24486 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24487 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24488
24489 if (Invert)
24490 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24491
24492 return DAG.getBitcast(VT, Result);
24493 }
24494 }
24495
24496 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24497 // bits of the inputs before performing those operations.
24498 if (FlipSigns) {
24499 MVT EltVT = VT.getVectorElementType();
24500 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24501 VT);
24502 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24503 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24504 }
24505
24506 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24507
24508 // If the logical-not of the result is required, perform that now.
24509 if (Invert)
24510 Result = DAG.getNOT(dl, Result, VT);
24511
24512 return Result;
24513}
24514
24515// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24516 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24517 const SDLoc &dl, SelectionDAG &DAG,
24518 const X86Subtarget &Subtarget,
24519 SDValue &X86CC) {
24520 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24521
24522 // Must be a bitcast from vXi1.
24523 if (Op0.getOpcode() != ISD::BITCAST)
24524 return SDValue();
24525
24526 Op0 = Op0.getOperand(0);
24527 MVT VT = Op0.getSimpleValueType();
24528 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24529 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24530 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24531 return SDValue();
24532
24533 X86::CondCode X86Cond;
24534 if (isNullConstant(Op1)) {
24535 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24536 } else if (isAllOnesConstant(Op1)) {
24537 // C flag is set for all ones.
24538 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24539 } else
24540 return SDValue();
24541
24542 // If the input is an AND, we can combine its operands into the KTEST.
24543 bool KTestable = false;
24544 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24545 KTestable = true;
24546 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24547 KTestable = true;
24548 if (!isNullConstant(Op1))
24549 KTestable = false;
24550 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24551 SDValue LHS = Op0.getOperand(0);
24552 SDValue RHS = Op0.getOperand(1);
24553 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24554 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24555 }
24556
24557 // If the input is an OR, we can combine its operands into the KORTEST.
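// KORTEST sets ZF when (LHS | RHS) is all zeros and CF when it is all ones,
// so the OR node itself never needs to be materialized.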
24558 SDValue LHS = Op0;
24559 SDValue RHS = Op0;
24560 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24561 LHS = Op0.getOperand(0);
24562 RHS = Op0.getOperand(1);
24563 }
24564
24565 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24566 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24567}
24568
24569/// Emit flags for the given setcc condition and operands. Also returns the
24570/// corresponding X86 condition code constant in X86CC.
24571SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24572 ISD::CondCode CC, const SDLoc &dl,
24573 SelectionDAG &DAG,
24574 SDValue &X86CC) const {
24575 // Equality Combines.
24576 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24577 X86::CondCode X86CondCode;
24578
24579 // Optimize to BT if possible.
24580 // Lower (X & (1 << N)) == 0 to BT(X, N).
24581 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24582 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24583 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24584 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24585 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24586 return BT;
24587 }
24588 }
24589
24590 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24591 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24592 X86CondCode)) {
24593 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24594 return CmpZ;
24595 }
24596
24597 // Try to lower using KORTEST or KTEST.
24598 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24599 return Test;
24600
24601 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24602 // of these.
24603 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24604 // If the input is a setcc, then reuse the input setcc or use a new one
24605 // with the inverted condition.
24606 if (Op0.getOpcode() == X86ISD::SETCC) {
24607 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24608
24609 X86CC = Op0.getOperand(0);
24610 if (Invert) {
24611 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24612 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24613 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24614 }
24615
24616 return Op0.getOperand(1);
24617 }
24618 }
24619
24620 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24621 // overflow.
24622 if (isMinSignedConstant(Op1)) {
24623 EVT VT = Op0.getValueType();
24624 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24625 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24626 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24627 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24628 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24629 DAG.getConstant(0, dl, VT), Op0);
24630 return SDValue(Neg.getNode(), 1);
24631 }
24632 }
24633
24634 // Try to use the carry flag from the add in place of a separate CMP for:
24635 // (seteq (add X, -1), -1). Similar for setne.
24636 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24637 Op0.getOperand(1) == Op1) {
24638 if (isProfitableToUseFlagOp(Op0)) {
24639 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24640
24641 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24642 Op0.getOperand(1));
24643 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24644 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24645 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24646 return SDValue(New.getNode(), 1);
24647 }
24648 }
24649 }
24650
24651 X86::CondCode CondCode =
24652 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24653 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24654
24655 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24656 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24657 return EFLAGS;
24658}
24659
24660SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24661
24662 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24663 Op.getOpcode() == ISD::STRICT_FSETCCS;
24664 MVT VT = Op->getSimpleValueType(0);
24665
24666 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24667
24668 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24669 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24670 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24671 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24672 SDLoc dl(Op);
24673 ISD::CondCode CC =
24674 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24675
24676 if (isSoftF16(Op0.getValueType(), Subtarget))
24677 return SDValue();
24678
24679 // Handle f128 first, since one possible outcome is a normal integer
24680 // comparison which gets handled by emitFlagsForSetcc.
24681 if (Op0.getValueType() == MVT::f128) {
24682 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24683 Op.getOpcode() == ISD::STRICT_FSETCCS);
24684
24685 // If softenSetCCOperands returned a scalar, use it.
24686 if (!Op1.getNode()) {
24687 assert(Op0.getValueType() == Op.getValueType() &&
24688 "Unexpected setcc expansion!");
24689 if (IsStrict)
24690 return DAG.getMergeValues({Op0, Chain}, dl);
24691 return Op0;
24692 }
24693 }
24694
24695 if (Op0.getSimpleValueType().isInteger()) {
24696 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24697 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24698 // this may translate to fewer uops depending on the uarch implementation. The
24699 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24700 // canonicalize to that CondCode.
24701 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24702 // encoding size - so it must either already be an i8 or i32 immediate, or it
24703 // shrinks down to that. We don't do this for any i64's to avoid additional
24704 // constant materializations.
24705 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24706 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24707 const APInt &Op1Val = Op1C->getAPIntValue();
24708 if (!Op1Val.isZero()) {
24709 // Ensure the constant+1 doesn't overflow.
24710 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24711 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24712 APInt Op1ValPlusOne = Op1Val + 1;
24713 if (Op1ValPlusOne.isSignedIntN(32) &&
24714 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24715 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24716 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24717 : ISD::CondCode::SETUGE;
24718 }
24719 }
24720 }
24721 }
24722
24723 SDValue X86CC;
24724 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24725 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24726 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24727 }
24728
24729 if (Subtarget.hasAVX10_2()) {
24730 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24731 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24732 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24733 if (Op0.getSimpleValueType() != MVT::f80) {
24734 SDValue Res = getSETCC(
24735 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24736 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737 }
24738 }
24739 }
24740 // Handle floating point.
24741 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24742 if (CondCode == X86::COND_INVALID)
24743 return SDValue();
24744
24745 SDValue EFLAGS;
24746 if (IsStrict) {
24747 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24748 EFLAGS =
24749 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24750 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24751 Chain = EFLAGS.getValue(1);
24752 } else {
24753 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24754 }
24755
24756 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24757 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24758 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24759}
24760
24761SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24762 SDValue LHS = Op.getOperand(0);
24763 SDValue RHS = Op.getOperand(1);
24764 SDValue Carry = Op.getOperand(2);
24765 SDValue Cond = Op.getOperand(3);
24766 SDLoc DL(Op);
24767
24768 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24769 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24770
24771 // Recreate the carry if needed.
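// Adding all-ones to the incoming carry value produces a carry-out exactly
// when that value is non-zero, re-materializing the carry bit in EFLAGS for
// the SBB below.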
24772 EVT CarryVT = Carry.getValueType();
24773 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24774 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24775
24776 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24777 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24778 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24779}
24780
24781// This function returns three things: the arithmetic computation itself
24782// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24783// flag and the condition code define the case in which the arithmetic
24784// computation overflows.
24785static std::pair<SDValue, SDValue>
24786 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24787 assert(Op.getResNo() == 0 && "Unexpected result number!");
24788 SDValue Value, Overflow;
24789 SDValue LHS = Op.getOperand(0);
24790 SDValue RHS = Op.getOperand(1);
24791 unsigned BaseOp = 0;
24792 SDLoc DL(Op);
24793 switch (Op.getOpcode()) {
24794 default: llvm_unreachable("Unknown ovf instruction!");
24795 case ISD::SADDO:
24796 BaseOp = X86ISD::ADD;
24797 Cond = X86::COND_O;
24798 break;
24799 case ISD::UADDO:
24800 BaseOp = X86ISD::ADD;
24801 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24802 break;
24803 case ISD::SSUBO:
24804 BaseOp = X86ISD::SUB;
24805 Cond = X86::COND_O;
24806 break;
24807 case ISD::USUBO:
24808 BaseOp = X86ISD::SUB;
24809 Cond = X86::COND_B;
24810 break;
24811 case ISD::SMULO:
24812 BaseOp = X86ISD::SMUL;
24813 Cond = X86::COND_O;
24814 break;
24815 case ISD::UMULO:
24816 BaseOp = X86ISD::UMUL;
24817 Cond = X86::COND_O;
24818 break;
24819 }
24820
24821 if (BaseOp) {
24822 // Also sets EFLAGS.
24823 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24824 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24825 Overflow = Value.getValue(1);
24826 }
24827
24828 return std::make_pair(Value, Overflow);
24829}
24830
24832 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24833 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24834 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24835 // has only one use.
24836 SDLoc DL(Op);
24837 X86::CondCode Cond;
24838 SDValue Value, Overflow;
24839 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24840
24841 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24842 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24843 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24844}
24845
24846/// Return true if opcode is a X86 logical comparison.
24847 static bool isX86LogicalCmp(SDValue Op) {
24848 unsigned Opc = Op.getOpcode();
24849 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24850 Opc == X86ISD::FCMP)
24851 return true;
24852 if (Op.getResNo() == 1 &&
24853 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24854 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24855 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24856 return true;
24857
24858 return false;
24859}
24860
24861 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24862 if (V.getOpcode() != ISD::TRUNCATE)
24863 return false;
24864
24865 SDValue VOp0 = V.getOperand(0);
24866 unsigned InBits = VOp0.getValueSizeInBits();
24867 unsigned Bits = V.getValueSizeInBits();
24868 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24869}
24870
24871// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24872 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24873 unsigned X86CC, const SDLoc &DL,
24874 SelectionDAG &DAG,
24875 const X86Subtarget &Subtarget) {
24876 EVT CmpVT = CmpVal.getValueType();
24877 EVT VT = LHS.getValueType();
24878 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24879 return SDValue();
24880
24881 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24882 isOneConstant(CmpVal.getOperand(1))) {
24883 auto SplatLSB = [&](EVT SplatVT) {
24884 // We need a mask of all zeros or all ones with the same size as the
24885 // other operands.
24886 SDValue Neg = CmpVal;
24887 if (CmpVT.bitsGT(SplatVT))
24888 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24889 else if (CmpVT.bitsLT(SplatVT))
24890 Neg = DAG.getNode(
24891 ISD::AND, DL, SplatVT,
24892 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24893 DAG.getConstant(1, DL, SplatVT));
24894 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24895 };
24896
24897 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24898 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24899 return SplatLSB(VT);
24900
24901 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24902 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24903 isa<ConstantSDNode>(RHS)) {
24904 SDValue Mask = SplatLSB(VT);
24905 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24906 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24907 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24908 }
24909
24910 SDValue Src1, Src2;
24911 auto isIdentityPatternZero = [&]() {
24912 switch (RHS.getOpcode()) {
24913 default:
24914 break;
24915 case ISD::OR:
24916 case ISD::XOR:
24917 case ISD::ADD:
24918 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24919 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24920 Src2 = LHS;
24921 return true;
24922 }
24923 break;
24924 case ISD::SHL:
24925 case ISD::SRA:
24926 case ISD::SRL:
24927 case ISD::SUB:
24928 if (RHS.getOperand(0) == LHS) {
24929 Src1 = RHS.getOperand(1);
24930 Src2 = LHS;
24931 return true;
24932 }
24933 break;
24934 }
24935 return false;
24936 };
24937
24938 auto isIdentityPatternOnes = [&]() {
24939 switch (LHS.getOpcode()) {
24940 default:
24941 break;
24942 case ISD::AND:
24943 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24944 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24945 Src2 = RHS;
24946 return true;
24947 }
24948 break;
24949 }
24950 return false;
24951 };
24952
24953 // Convert 'identity' patterns (iff X is 0 or 1):
24954 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24960 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24961 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24962 SDValue Mask = SplatLSB(Src1.getValueType());
24963 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24964 Src1); // Mask & z
24965 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24966 }
24967 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24968 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24969 SDValue Mask = SplatLSB(VT);
24970 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24971 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24972 }
24973 }
24974
24975 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24976 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24977 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24978 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24979
24980 // 'X - 1' sets the carry flag if X == 0.
24981 // '0 - X' sets the carry flag if X != 0.
24982 // Convert the carry flag to a -1/0 mask with sbb:
24983 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24984 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24985 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24986 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24987 SDValue Sub;
24988 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24989 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24990 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24991 } else {
24992 SDValue One = DAG.getConstant(1, DL, CmpVT);
24993 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24994 }
24995 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24996 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24997 Sub.getValue(1));
24998 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24999 }
25000
25001 return SDValue();
25002}
25003
25004SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25005 bool AddTest = true;
25006 SDValue Cond = Op.getOperand(0);
25007 SDValue Op1 = Op.getOperand(1);
25008 SDValue Op2 = Op.getOperand(2);
25009 SDLoc DL(Op);
25010 MVT VT = Op1.getSimpleValueType();
25011 SDValue CC;
25012
25013 if (isSoftF16(VT, Subtarget)) {
25014 MVT NVT = VT.changeTypeToInteger();
25015 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25016 DAG.getBitcast(NVT, Op1),
25017 DAG.getBitcast(NVT, Op2)));
25018 }
25019
25020 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25021 // are available or VBLENDV if AVX is available.
25022 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25023 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25024 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25025 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25026 bool IsAlwaysSignaling;
25027 unsigned SSECC =
25028 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25029 CondOp0, CondOp1, IsAlwaysSignaling);
25030
25031 if (Subtarget.hasAVX512()) {
25032 SDValue Cmp =
25033 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25034 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25035 assert(!VT.isVector() && "Not a scalar type?");
25036 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25037 }
25038
25039 if (SSECC < 8 || Subtarget.hasAVX()) {
25040 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25041 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25042
25043 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25044 // instead of 3 logic instructions for size savings and potentially speed.
25045 // Unfortunately, there is no scalar form of VBLENDV.
25046 //
25047 // If either operand is a +0.0 constant, don't try this. We can expect to
25048 // optimize away at least one of the logic instructions later in that
25049 // case, so that sequence would be faster than a variable blend.
25050 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25051 !isNullFPConstant(Op2)) {
25052 // Convert to vectors, do a VSELECT, and convert back to scalar.
25053 // All of the conversions should be optimized away.
25054 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25055 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25056 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25057 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25058
25059 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25060 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25061
25062 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25063
25064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25065 DAG.getVectorIdxConstant(0, DL));
25066 }
25067 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25068 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25069 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25070 }
25071 }
25072
25073 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25074 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25075 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25076 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25077 }
25078
25079 if (Cond.getOpcode() == ISD::SETCC &&
25080 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25081 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25082 Cond = NewCond;
25083 // If the condition was updated, it's possible that the operands of the
25084 // select were also updated (for example, EmitTest has a RAUW). Refresh
25085 // the local references to the select operands in case they got stale.
25086 Op1 = Op.getOperand(1);
25087 Op2 = Op.getOperand(2);
25088 }
25089 }
25090
25091 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25092 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25093 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25094 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25095 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25096 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25097 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25098 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25099 if (Cond.getOpcode() == X86ISD::SETCC &&
25100 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25101 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25102 SDValue Cmp = Cond.getOperand(1);
25103 SDValue CmpOp0 = Cmp.getOperand(0);
25104 unsigned CondCode = Cond.getConstantOperandVal(0);
25105
25106 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25107 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25108 // handling to keep the CMP with 0. This should be removed by
25109 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25110 // cttz_zero_undef.
25111 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25112 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25113 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25114 };
25115 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25116 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25117 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25118 // Keep Cmp.
25119 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25120 DL, DAG, Subtarget)) {
25121 return R;
25122 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25123 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25124 ((CondCode == X86::COND_S) || // smin(x, 0)
25125 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25126 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25127 //
25128 // If the comparison is testing for a positive value, we have to invert
25129 // the sign bit mask, so only do that transform if the target has a
25130 // bitwise 'and not' instruction (the invert is free).
25131 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25132 unsigned ShCt = VT.getSizeInBits() - 1;
25133 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25134 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25135 if (CondCode == X86::COND_G)
25136 Shift = DAG.getNOT(DL, Shift, VT);
25137 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25138 }
25139 }
25140
25141 // Look past (and (setcc_carry (cmp ...)), 1).
25142 if (Cond.getOpcode() == ISD::AND &&
25143 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25144 isOneConstant(Cond.getOperand(1)))
25145 Cond = Cond.getOperand(0);
25146
25147 // Attempt to fold "raw cond" cases by treating them as:
25148 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25149 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25150 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25151 Subtarget))
25152 return R;
25153
25154 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25155 // setting operand in place of the X86ISD::SETCC.
25156 unsigned CondOpcode = Cond.getOpcode();
25157 if (CondOpcode == X86ISD::SETCC ||
25158 CondOpcode == X86ISD::SETCC_CARRY) {
25159 CC = Cond.getOperand(0);
25160
25161 SDValue Cmp = Cond.getOperand(1);
25162 bool IllegalFPCMov = false;
25163 if (VT.isFloatingPoint() && !VT.isVector() &&
25164 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25165 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25166
25167 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25168 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25169 Cond = Cmp;
25170 AddTest = false;
25171 }
25172 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25173 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25174 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25175 SDValue Value;
25176 X86::CondCode X86Cond;
25177 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25178
25179 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25180 AddTest = false;
25181 }
25182
25183 if (AddTest) {
25184 // Look past the truncate if the high bits are known zero.
25185 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25186 Cond = Cond.getOperand(0);
25187
25188 // We know the result of AND is compared against zero. Try to match
25189 // it to BT.
25190 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25191 X86::CondCode X86CondCode;
25192 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25193 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25194 Cond = BT;
25195 AddTest = false;
25196 }
25197 }
25198 }
25199
25200 if (AddTest) {
25201 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25202 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25203 }
25204
25205 // a < b ? -1 : 0 -> RES = ~setcc_carry
25206 // a < b ? 0 : -1 -> RES = setcc_carry
25207 // a >= b ? -1 : 0 -> RES = setcc_carry
25208 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25209 if (Cond.getOpcode() == X86ISD::SUB) {
25210 unsigned CondCode = CC->getAsZExtVal();
25211
25212 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25213 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25214 (isNullConstant(Op1) || isNullConstant(Op2))) {
25215 SDValue Res =
25216 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25217 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25218 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25219 return DAG.getNOT(DL, Res, Res.getValueType());
25220 return Res;
25221 }
25222 }
25223
25224 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25225 // widen the cmov and push the truncate through. This avoids introducing a new
25226 // branch during isel and doesn't add any extensions.
25227 if (Op.getValueType() == MVT::i8 &&
25228 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25229 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25230 if (T1.getValueType() == T2.getValueType() &&
25231 // Exclude CopyFromReg to avoid partial register stalls.
25232 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25233 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25234 CC, Cond);
25235 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25236 }
25237 }
25238
25239 // Or finally, promote i8 cmovs if we have CMOV,
25240 // or i16 cmovs if it won't prevent folding a load.
25241 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25242 // legal, but EmitLoweredSelect() can not deal with these extensions
25243 // being inserted between two CMOV's. (in i16 case too TBN)
25244 // https://bugs.llvm.org/show_bug.cgi?id=40974
25245 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25246 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25247 !X86::mayFoldLoad(Op2, Subtarget))) {
25248 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25249 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25250 SDValue Ops[] = { Op2, Op1, CC, Cond };
25251 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25252 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25253 }
25254
25255 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25256 // condition is true.
25257 SDValue Ops[] = { Op2, Op1, CC, Cond };
25258 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25259}
25260
25261 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25262 const X86Subtarget &Subtarget,
25263 SelectionDAG &DAG) {
25264 MVT VT = Op->getSimpleValueType(0);
25265 SDValue In = Op->getOperand(0);
25266 MVT InVT = In.getSimpleValueType();
25267 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25268 MVT VTElt = VT.getVectorElementType();
25269 unsigned NumElts = VT.getVectorNumElements();
25270
25271 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25272 MVT ExtVT = VT;
25273 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25274 // If v16i32 is to be avoided, we'll need to split and concatenate.
25275 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25276 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25277
25278 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25279 }
25280
25281 // Widen to 512-bits if VLX is not supported.
25282 MVT WideVT = ExtVT;
25283 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25284 NumElts *= 512 / ExtVT.getSizeInBits();
25285 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25286 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25287 DAG.getVectorIdxConstant(0, dl));
25288 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25289 }
25290
25291 SDValue V;
25292 MVT WideEltVT = WideVT.getVectorElementType();
25293 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25294 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25295 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25296 } else {
25297 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25298 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25299 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25300 }
25301
25302 // Truncate if we had to extend i16/i8 above.
25303 if (VT != ExtVT) {
25304 WideVT = MVT::getVectorVT(VTElt, NumElts);
25305 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25306 }
25307
25308 // Extract back to 128/256-bit if we widened.
25309 if (WideVT != VT)
25310 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25311 DAG.getVectorIdxConstant(0, dl));
25312
25313 return V;
25314}
25315
25316 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 SDValue In = Op->getOperand(0);
25319 MVT InVT = In.getSimpleValueType();
25320 SDLoc DL(Op);
25321
25322 if (InVT.getVectorElementType() == MVT::i1)
25323 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25324
25325 assert(Subtarget.hasAVX() && "Expected AVX support");
25326 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25327}
25328
25329// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25330// For sign extend this needs to handle all vector sizes and SSE4.1 and
25331// non-SSE4.1 targets. For zero extend this should only handle inputs of
25332// MVT::v64i8 when BWI is not supported, but AVX512 is.
25333 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25334 const X86Subtarget &Subtarget,
25335 SelectionDAG &DAG) {
25336 SDValue In = Op->getOperand(0);
25337 MVT VT = Op->getSimpleValueType(0);
25338 MVT InVT = In.getSimpleValueType();
25339
25340 MVT SVT = VT.getVectorElementType();
25341 MVT InSVT = InVT.getVectorElementType();
25342 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25343
25344 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25345 return SDValue();
25346 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25347 return SDValue();
25348 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25349 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25350 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25351 return SDValue();
25352
25353 SDLoc dl(Op);
25354 unsigned Opc = Op.getOpcode();
25355 unsigned NumElts = VT.getVectorNumElements();
25356
25357 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25358 // For 512-bit vectors, we need 128-bits or 256-bits.
25359 if (InVT.getSizeInBits() > 128) {
25360 // Input needs to be at least the same number of elements as output, and
25361 // at least 128-bits.
25362 int InSize = InSVT.getSizeInBits() * NumElts;
25363 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25364 InVT = In.getSimpleValueType();
25365 }
25366
25367 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25368 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25369 // need to be handled here for 256/512-bit results.
25370 if (Subtarget.hasInt256()) {
25371 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25372
25373 if (InVT.getVectorNumElements() != NumElts)
25374 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25375
25376 // FIXME: Apparently we create inreg operations that could be regular
25377 // extends.
25378 unsigned ExtOpc =
25379 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25380 : ISD::ZERO_EXTEND;
25381 return DAG.getNode(ExtOpc, dl, VT, In);
25382 }
25383
25384 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25385 if (Subtarget.hasAVX()) {
25386 assert(VT.is256BitVector() && "256-bit vector expected");
25387 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25388 int HalfNumElts = HalfVT.getVectorNumElements();
25389
25390 unsigned NumSrcElts = InVT.getVectorNumElements();
25391 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25392 for (int i = 0; i != HalfNumElts; ++i)
25393 HiMask[i] = HalfNumElts + i;
25394
25395 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25396 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25397 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25398 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25399 }
25400
25401 // We should only get here for sign extend.
25402 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25403 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25404 unsigned InNumElts = InVT.getVectorNumElements();
25405
25406 // If the source elements are already all-signbits, we don't need to extend,
25407 // just splat the elements.
25408 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25409 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25410 unsigned Scale = InNumElts / NumElts;
25411 SmallVector<int, 16> ShuffleMask;
25412 for (unsigned I = 0; I != NumElts; ++I)
25413 ShuffleMask.append(Scale, I);
25414 return DAG.getBitcast(VT,
25415 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25416 }
25417
25418 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25419 SDValue Curr = In;
25420 SDValue SignExt = Curr;
25421
25422 // As SRAI is only available on i16/i32 types, we expand only up to i32
25423 // and handle i64 separately.
25424 if (InVT != MVT::v4i32) {
25425 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25426
25427 unsigned DestWidth = DestVT.getScalarSizeInBits();
25428 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25429 unsigned DestElts = DestVT.getVectorNumElements();
25430
25431 // Build a shuffle mask that takes each input element and places it in the
25432 // MSBs of the new element size.
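// e.g. for v16i8 -> v4i32, Scale is 4 and each used source byte i lands at
// byte offset 4*i+3 of its dword, so the VSRAI by 24 below shifts it back
// down while replicating its sign bit.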
25433 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25434 for (unsigned i = 0; i != DestElts; ++i)
25435 Mask[i * Scale + (Scale - 1)] = i;
25436
25437 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25438 Curr = DAG.getBitcast(DestVT, Curr);
25439
25440 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25441 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25442 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25443 }
25444
25445 if (VT == MVT::v2i64) {
25446 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25447 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25448 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25449 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25450 SignExt = DAG.getBitcast(VT, SignExt);
25451 }
25452
25453 return SignExt;
25454}
25455
25456 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25457 SelectionDAG &DAG) {
25458 MVT VT = Op->getSimpleValueType(0);
25459 SDValue In = Op->getOperand(0);
25460 MVT InVT = In.getSimpleValueType();
25461 SDLoc dl(Op);
25462
25463 if (InVT.getVectorElementType() == MVT::i1)
25464 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25465
25466 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25468 "Expected same number of elements");
25469 assert((VT.getVectorElementType() == MVT::i16 ||
25470 VT.getVectorElementType() == MVT::i32 ||
25471 VT.getVectorElementType() == MVT::i64) &&
25472 "Unexpected element type");
25473 assert((InVT.getVectorElementType() == MVT::i8 ||
25474 InVT.getVectorElementType() == MVT::i16 ||
25475 InVT.getVectorElementType() == MVT::i32) &&
25476 "Unexpected element type");
25477
25478 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25479 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25480 return splitVectorIntUnary(Op, DAG, dl);
25481 }
25482
25483 if (Subtarget.hasInt256())
25484 return Op;
25485
25486 // Optimize vectors in AVX mode
25487 // Sign extend v8i16 to v8i32 and
25488 // v4i32 to v4i64
25489 //
25490 // Divide input vector into two parts
25491 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25492 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25493 // concat the vectors to original VT
25494 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25495 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25496
25497 unsigned NumElems = InVT.getVectorNumElements();
25498 SmallVector<int,8> ShufMask(NumElems, -1);
25499 for (unsigned i = 0; i != NumElems/2; ++i)
25500 ShufMask[i] = i + NumElems/2;
25501
25502 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25503 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25504
25505 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25506}
25507
25508/// Change a vector store into a pair of half-size vector stores.
25509 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25510 SDValue StoredVal = Store->getValue();
25511 assert((StoredVal.getValueType().is256BitVector() ||
25512 StoredVal.getValueType().is512BitVector()) &&
25513 "Expecting 256/512-bit op");
25514
25515 // Splitting volatile memory ops is not allowed unless the operation was not
25516 // legal to begin with. Assume the input store is legal (this transform is
25517 // only used for targets with AVX). Note: It is possible that we have an
25518 // illegal type like v2i128, and so we could allow splitting a volatile store
25519 // in that case if that is important.
25520 if (!Store->isSimple())
25521 return SDValue();
25522
25523 SDLoc DL(Store);
25524 SDValue Value0, Value1;
25525 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25526 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25527 SDValue Ptr0 = Store->getBasePtr();
25528 SDValue Ptr1 =
25529 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25530 SDValue Ch0 =
25531 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25532 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25533 SDValue Ch1 =
25534 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25535 Store->getPointerInfo().getWithOffset(HalfOffset),
25536 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25537 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25538}
25539
25540/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25541/// type.
25542 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25543 SelectionDAG &DAG) {
25544 SDValue StoredVal = Store->getValue();
25545 assert(StoreVT.is128BitVector() &&
25546 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25547 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25548
25549 // Splitting volatile memory ops is not allowed unless the operation was not
25550 // legal to begin with. We are assuming the input op is legal (this transform
25551 // is only used for targets with AVX).
25552 if (!Store->isSimple())
25553 return SDValue();
25554
25555 MVT StoreSVT = StoreVT.getScalarType();
25556 unsigned NumElems = StoreVT.getVectorNumElements();
25557 unsigned ScalarSize = StoreSVT.getStoreSize();
25558
25559 SDLoc DL(Store);
25560 SmallVector<SDValue, 4> Stores;
25561 for (unsigned i = 0; i != NumElems; ++i) {
25562 unsigned Offset = i * ScalarSize;
25563 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25564 TypeSize::getFixed(Offset), DL);
25565 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25566 DAG.getVectorIdxConstant(i, DL));
25567 SDValue Ch =
25568 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25569 Store->getPointerInfo().getWithOffset(Offset),
25570 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25571 Stores.push_back(Ch);
25572 }
25573 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25574}
25575
25576static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25577 SelectionDAG &DAG) {
25578 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25579 SDLoc dl(St);
25580 SDValue StoredVal = St->getValue();
25581
25582 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25583 if (StoredVal.getValueType().isVector() &&
25584 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25585 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25586 assert(NumElts <= 8 && "Unexpected VT");
25587 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25588 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25589 "Expected AVX512F without AVX512DQI");
25590
25591 // We must pad with zeros to ensure we store zeroes to any unused bits.
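// e.g. a v4i1 mask is widened to v16i1, bitcast to i16, truncated to i8, and
// the zero-extend-in-reg below clears bits 4-7 so the stored byte carries
// only the original four mask bits.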
25592 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25593 DAG.getUNDEF(MVT::v16i1), StoredVal,
25594 DAG.getVectorIdxConstant(0, dl));
25595 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25596 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25597 // Make sure we store zeros in the extra bits.
25598 if (NumElts < 8)
25599 StoredVal = DAG.getZeroExtendInReg(
25600 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25601
25602 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25603 St->getPointerInfo(), St->getBaseAlign(),
25604 St->getMemOperand()->getFlags());
25605 }
25606
25607 if (St->isTruncatingStore())
25608 return SDValue();
25609
25610 // If this is a 256/512-bit store of concatenated ops, we are better off
25611 // splitting that store into two half-size stores. This avoids spurious use of
25612 // concatenated ops and each half can execute independently. Some cores would
25613 // split the op into halves anyway, so the concat is purely an extra op.
25614 MVT StoreVT = StoredVal.getSimpleValueType();
25615 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25616 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25617 return splitVectorStore(St, DAG);
25618 return SDValue();
25619 }
25620
25621 if (StoreVT.is32BitVector())
25622 return SDValue();
25623
25624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25625 assert(StoreVT.is64BitVector() && "Unexpected VT");
25626 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25627 TargetLowering::TypeWidenVector &&
25628 "Unexpected type action!");
25629
25630 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25631 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25632 DAG.getUNDEF(StoreVT));
25633
25634 if (Subtarget.hasSSE2()) {
25635 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25636 // and store it.
25637 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25638 MVT CastVT = MVT::getVectorVT(StVT, 2);
25639 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25640 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25641 DAG.getVectorIdxConstant(0, dl));
25642
25643 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25644 St->getPointerInfo(), St->getBaseAlign(),
25645 St->getMemOperand()->getFlags());
25646 }
25647 assert(Subtarget.hasSSE1() && "Expected SSE");
25648 SDVTList Tys = DAG.getVTList(MVT::Other);
25649 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25650 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25651 St->getMemOperand());
25652}
25653
25654// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25655// may emit an illegal shuffle but the expansion is still better than scalar
25656// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25657 // we'll emit a shuffle and an arithmetic shift.
25658// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25659// TODO: It is possible to support ZExt by zeroing the undef values during
25660// the shuffle phase or after the shuffle.
25661static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25662 SelectionDAG &DAG) {
25663 MVT RegVT = Op.getSimpleValueType();
25664 assert(RegVT.isVector() && "We only custom lower vector loads.");
25665 assert(RegVT.isInteger() &&
25666 "We only custom lower integer vector loads.");
25667
25668 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25669 SDLoc dl(Ld);
25670
25671 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25672 if (RegVT.getVectorElementType() == MVT::i1) {
25673 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25674 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25675 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25676 "Expected AVX512F without AVX512DQI");
25677
25678 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25679 Ld->getPointerInfo(), Ld->getBaseAlign(),
25680 Ld->getMemOperand()->getFlags());
25681
25682 // Replace chain users with the new chain.
25683 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25684
25685 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25686 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25687 DAG.getBitcast(MVT::v16i1, Val),
25688 DAG.getVectorIdxConstant(0, dl));
25689 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25690 }
25691
25692 return SDValue();
25693}
25694
25695/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25696/// each of which has no other use apart from the AND / OR.
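/// e.g. it matches (or (X86ISD::SETCC ...), (X86ISD::SETCC ...)) where each
/// SETCC is only used by the OR, as produced for a condition such as
/// (a == b || c < d).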
25697static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25698 Opc = Op.getOpcode();
25699 if (Opc != ISD::OR && Opc != ISD::AND)
25700 return false;
25701 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25702 Op.getOperand(0).hasOneUse() &&
25703 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25704 Op.getOperand(1).hasOneUse());
25705}
25706
25707SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25708 SDValue Chain = Op.getOperand(0);
25709 SDValue Cond = Op.getOperand(1);
25710 SDValue Dest = Op.getOperand(2);
25711 SDLoc dl(Op);
25712
25713 // Bail out when we don't have native compare instructions.
25714 if (Cond.getOpcode() == ISD::SETCC &&
25715 Cond.getOperand(0).getValueType() != MVT::f128 &&
25716 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25717 SDValue LHS = Cond.getOperand(0);
25718 SDValue RHS = Cond.getOperand(1);
25719 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25720
25721 // Special case for
25722 // setcc([su]{add,sub,mul}o == 0)
25723 // setcc([su]{add,sub,mul}o != 1)
25724 if (ISD::isOverflowIntrOpRes(LHS) &&
25725 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25726 (isNullConstant(RHS) || isOneConstant(RHS))) {
25727 SDValue Value, Overflow;
25728 X86::CondCode X86Cond;
25729 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25730
25731 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25732 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25733
25734 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25735 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25736 Overflow, Op->getFlags());
25737 }
25738
25739 if (LHS.getSimpleValueType().isInteger()) {
25740 SDValue CCVal;
25741 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25742 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25743 EFLAGS, Op->getFlags());
25744 }
25745
25746 if (CC == ISD::SETOEQ) {
25747 // For FCMP_OEQ, we can emit
25748 // two branches instead of an explicit AND instruction with a
25749 // separate test. However, we only do this if this block doesn't
25750 // have a fall-through edge, because this requires an explicit
25751 // jmp when the condition is false.
25752 if (Op.getNode()->hasOneUse()) {
25753 SDNode *User = *Op.getNode()->user_begin();
25754 // Look for an unconditional branch following this conditional branch.
25755 // We need this because the successors must be reversed in order
25756 // to implement FCMP_OEQ.
25757 if (User->getOpcode() == ISD::BR) {
25758 SDValue FalseBB = User->getOperand(1);
25759 SDNode *NewBR =
25760 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25761 assert(NewBR == User);
25762 (void)NewBR;
25763 Dest = FalseBB;
25764
25765 SDValue Cmp =
25766 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25767 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25768 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25769 CCVal, Cmp, Op->getFlags());
25770 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25771 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25772 Cmp, Op->getFlags());
25773 }
25774 }
25775 } else if (CC == ISD::SETUNE) {
25776 // For FCMP_UNE, we can emit
25777 // two branches instead of an explicit OR instruction with a
25778 // separate test.
25779 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25780 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25781 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25782 Cmp, Op->getFlags());
25783 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25784 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25785 Cmp, Op->getFlags());
25786 } else {
25787 X86::CondCode X86Cond =
25788 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25789 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25790 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25791 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25792 Cmp, Op->getFlags());
25793 }
25794 }
25795
25796 if (ISD::isOverflowIntrOpRes(Cond)) {
25797 SDValue Value, Overflow;
25798 X86::CondCode X86Cond;
25799 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25800
25801 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25802 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25803 Overflow, Op->getFlags());
25804 }
25805
25806 // Look past the truncate if the high bits are known zero.
25807 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25808 Cond = Cond.getOperand(0);
25809
25810 EVT CondVT = Cond.getValueType();
25811
25812 // Add an AND with 1 if we don't already have one.
25813 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25814 Cond =
25815 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25816
25817 SDValue LHS = Cond;
25818 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25819
25820 SDValue CCVal;
25821 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25822 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25823 Op->getFlags());
25824}
25825
25826// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25827// Calls to _alloca are needed to probe the stack when allocating more than 4k
25828// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25829// that the guard pages used by the OS virtual memory manager are allocated in
25830// correct sequence.
25831SDValue
25832X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25833 SelectionDAG &DAG) const {
25834 MachineFunction &MF = DAG.getMachineFunction();
25835 bool SplitStack = MF.shouldSplitStack();
25836 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25837 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25838 SplitStack || EmitStackProbeCall;
25839 SDLoc dl(Op);
25840
25841 // Get the inputs.
25842 SDNode *Node = Op.getNode();
25843 SDValue Chain = Op.getOperand(0);
25844 SDValue Size = Op.getOperand(1);
25845 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25846 EVT VT = Node->getValueType(0);
25847
25848 // Chain the dynamic stack allocation so that it doesn't modify the stack
25849 // pointer when other instructions are using the stack.
25850 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25851
25852 bool Is64Bit = Subtarget.is64Bit();
25853 MVT SPTy = Op.getValueType().getSimpleVT();
25854
25856 if (!Lower) {
25857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25858 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25859 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25860 " not tell us which reg is the stack pointer!");
25861
25862 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25863 const Align StackAlign = TFI.getStackAlign();
25864 if (hasInlineStackProbe(MF)) {
25865 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25866 {Chain, Size});
25867 Chain = Result.getValue(1);
25868 } else {
25869 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25870 Chain = SP.getValue(1);
25871 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25872 }
25873 if (Alignment && *Alignment > StackAlign)
25874 Result = DAG.getNode(
25875 ISD::AND, dl, VT, Result,
25876 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25878 } else if (SplitStack) {
25879 if (Is64Bit) {
25880 // The 64-bit implementation of segmented stacks needs to clobber both r10
25881 // and r11. This makes it impossible to use it along with nested parameters.
25882 const Function &F = MF.getFunction();
25883 for (const auto &A : F.args()) {
25884 if (A.hasNestAttr())
25885 report_fatal_error("Cannot use segmented stacks with functions that "
25886 "have nested arguments.");
25887 }
25888 }
25889
25890 Result =
25891 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25892 Chain = Result.getValue(1);
25893 } else {
25894 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25895 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25896 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25897
25898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25899 Register SPReg = RegInfo->getStackRegister();
25900 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25901 Chain = SP.getValue(1);
25902
25903 if (Alignment) {
25904 SP = DAG.getNode(
25905 ISD::AND, dl, VT, SP.getValue(0),
25906 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25907 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25908 }
25909
25910 Result = SP;
25911 }
25912
25913 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25914
25915 SDValue Ops[2] = {Result, Chain};
25916 return DAG.getMergeValues(Ops, dl);
25917}
25918
25919SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25920 MachineFunction &MF = DAG.getMachineFunction();
25921 SDValue Ptr = Op.getOperand(1);
25922 EVT PtrVT = Ptr.getValueType();
25923
25924 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25925
25926 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25927 SDLoc DL(Op);
25928
25929 if (!Subtarget.is64Bit() ||
25930 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25931 // vastart just stores the address of the VarArgsFrameIndex slot into the
25932 // memory location argument.
25933 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25934 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25935 }
25936
25937 // __va_list_tag:
25938 // gp_offset (0 - 6 * 8)
25939 // fp_offset (48 - 48 + 8 * 16)
25940 // overflow_arg_area (point to parameters coming in memory).
25941 // reg_save_area
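// In C terms this is the System V AMD64 __va_list_tag:
//   struct { unsigned gp_offset; unsigned fp_offset;
//            void *overflow_arg_area; void *reg_save_area; };
// (the pointer fields are 4 bytes under X32, hence the 8/4 and 16/12 offsets
// used below)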
25942 SmallVector<SDValue, 8> MemOps;
25943 SDValue FIN = Op.getOperand(1);
25944 // Store gp_offset
25945 SDValue Store = DAG.getStore(
25946 Op.getOperand(0), DL,
25947 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25948 MachinePointerInfo(SV));
25949 MemOps.push_back(Store);
25950
25951 // Store fp_offset
25952 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25953 Store = DAG.getStore(
25954 Op.getOperand(0), DL,
25955 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25956 MachinePointerInfo(SV, 4));
25957 MemOps.push_back(Store);
25958
25959 // Store ptr to overflow_arg_area
25960 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25961 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25962 Store =
25963 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25964 MemOps.push_back(Store);
25965
25966 // Store ptr to reg_save_area.
25967 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25968 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25969 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25970 Store = DAG.getStore(
25971 Op.getOperand(0), DL, RSFIN, FIN,
25972 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25973 MemOps.push_back(Store);
25974 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25975}
25976
25977SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25978 assert(Subtarget.is64Bit() &&
25979 "LowerVAARG only handles 64-bit va_arg!");
25980 assert(Op.getNumOperands() == 4);
25981
25982 MachineFunction &MF = DAG.getMachineFunction();
25983 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25984 // The Win64 ABI uses char* instead of a structure.
25985 return DAG.expandVAArg(Op.getNode());
25986
25987 SDValue Chain = Op.getOperand(0);
25988 SDValue SrcPtr = Op.getOperand(1);
25989 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25990 unsigned Align = Op.getConstantOperandVal(3);
25991 SDLoc dl(Op);
25992
25993 EVT ArgVT = Op.getNode()->getValueType(0);
25994 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25995 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25996 uint8_t ArgMode;
25997
25998 // Decide which area this value should be read from.
25999 // TODO: Implement the AMD64 ABI in its entirety. This simple
26000 // selection mechanism works only for the basic types.
26001 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26002 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26003 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26004 } else {
26005 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26006 "Unhandled argument type in LowerVAARG");
26007 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26008 }
26009
26010 if (ArgMode == 2) {
26011 // Make sure using fp_offset makes sense.
26012 assert(!Subtarget.useSoftFloat() &&
26013 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26014 Subtarget.hasSSE1());
26015 }
26016
26017 // Insert VAARG node into the DAG
26018 // VAARG returns two values: Variable Argument Address, Chain
26019 SDValue InstOps[] = {Chain, SrcPtr,
26020 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26021 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26022 DAG.getTargetConstant(Align, dl, MVT::i32)};
26023 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26024 SDValue VAARG = DAG.getMemIntrinsicNode(
26025 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26026 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26027 /*Alignment=*/std::nullopt,
26028 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26029 Chain = VAARG.getValue(1);
26030
26031 // Load the next argument and return it
26032 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26033}
26034
26035static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26036 SelectionDAG &DAG) {
26037 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26038 // where a va_list is still an i8*.
26039 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26040 if (Subtarget.isCallingConvWin64(
26041 DAG.getMachineFunction().getFunction().getCallingConv()))
26042 // Probably a Win64 va_copy.
26043 return DAG.expandVACopy(Op.getNode());
26044
26045 SDValue Chain = Op.getOperand(0);
26046 SDValue DstPtr = Op.getOperand(1);
26047 SDValue SrcPtr = Op.getOperand(2);
26048 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26049 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26050 SDLoc DL(Op);
26051
26052 return DAG.getMemcpy(
26053 Chain, DL, DstPtr, SrcPtr,
26054 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26055 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26056 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26057 MachinePointerInfo(SrcSV));
26058}
26059
26060// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26061static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26062 switch (Opc) {
26063 case ISD::SHL:
26064 case X86ISD::VSHL:
26065 case X86ISD::VSHLI:
26066 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26067 case ISD::SRL:
26068 case X86ISD::VSRL:
26069 case X86ISD::VSRLI:
26070 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26071 case ISD::SRA:
26072 case X86ISD::VSRA:
26073 case X86ISD::VSRAI:
26074 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26075 }
26076 llvm_unreachable("Unknown target vector shift node");
26077}
26078
26079/// Handle vector element shifts where the shift amount is a constant.
26080/// Takes immediate version of shift as input.
26081static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26082 SDValue SrcOp, uint64_t ShiftAmt,
26083 SelectionDAG &DAG) {
26084 MVT ElementType = VT.getVectorElementType();
26085
26086 // Bitcast the source vector to the output type, this is mainly necessary for
26087 // vXi8/vXi64 shifts.
26088 if (VT != SrcOp.getSimpleValueType())
26089 SrcOp = DAG.getBitcast(VT, SrcOp);
26090
26091 // Fold this packed shift into its first operand if ShiftAmt is 0.
26092 if (ShiftAmt == 0)
26093 return SrcOp;
26094
26095 // Check for ShiftAmt >= element width
26096 if (ShiftAmt >= ElementType.getSizeInBits()) {
26097 if (Opc == X86ISD::VSRAI)
26098 ShiftAmt = ElementType.getSizeInBits() - 1;
26099 else
26100 return DAG.getConstant(0, dl, VT);
26101 }
26102
26104 && "Unknown target vector shift-by-constant node");
26105
26106 // Fold this packed vector shift into a build vector if SrcOp is a
26107 // vector of Constants or UNDEFs.
26108 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26109 unsigned ShiftOpc;
26110 switch (Opc) {
26111 default: llvm_unreachable("Unknown opcode!");
26112 case X86ISD::VSHLI:
26113 ShiftOpc = ISD::SHL;
26114 break;
26115 case X86ISD::VSRLI:
26116 ShiftOpc = ISD::SRL;
26117 break;
26118 case X86ISD::VSRAI:
26119 ShiftOpc = ISD::SRA;
26120 break;
26121 }
26122
26123 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26124 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26125 return C;
26126 }
26127
26128 return DAG.getNode(Opc, dl, VT, SrcOp,
26129 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26130}
26131
26132/// Handle vector element shifts by a splat shift amount
26133static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26134 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26135 const X86Subtarget &Subtarget,
26136 SelectionDAG &DAG) {
26137 MVT AmtVT = ShAmt.getSimpleValueType();
26138 assert(AmtVT.isVector() && "Vector shift type mismatch");
26139 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26140 "Illegal vector splat index");
26141
26142 // Move the splat element to the bottom element.
26143 if (ShAmtIdx != 0) {
26144 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26145 Mask[0] = ShAmtIdx;
26146 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26147 }
26148
26149 // Peek through any zext node if we can get back to a 128-bit source.
26150 if (AmtVT.getScalarSizeInBits() == 64 &&
26151 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26152 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26153 ShAmt.getOperand(0).getValueType().isSimple() &&
26154 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26155 ShAmt = ShAmt.getOperand(0);
26156 AmtVT = ShAmt.getSimpleValueType();
26157 }
26158
26159 // See if we can mask off the upper elements using the existing source node.
26160 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26161 // do this for vXi64 types.
26162 bool IsMasked = false;
26163 if (AmtVT.getScalarSizeInBits() < 64) {
26164 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26165 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26166 // If the shift amount has come from a scalar, then zero-extend the scalar
26167 // before moving to the vector.
26168 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26169 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26170 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26171 AmtVT = MVT::v4i32;
26172 IsMasked = true;
26173 } else if (ShAmt.getOpcode() == ISD::AND) {
26174 // See if the shift amount is already masked (e.g. for rotation modulo),
26175 // then we can zero-extend it by setting all the other mask elements to
26176 // zero.
26177 SmallVector<SDValue> MaskElts(
26178 AmtVT.getVectorNumElements(),
26179 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26180 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26181 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26182 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26183 {ShAmt.getOperand(1), Mask}))) {
26184 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26185 IsMasked = true;
26186 }
26187 }
26188 }
26189
26190 // Extract if the shift amount vector is larger than 128-bits.
26191 if (AmtVT.getSizeInBits() > 128) {
26192 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26193 AmtVT = ShAmt.getSimpleValueType();
26194 }
26195
26196 // Zero-extend bottom element to v2i64 vector type, either by extension or
26197 // shuffle masking.
26198 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26199 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26200 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26201 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26202 } else if (Subtarget.hasSSE41()) {
26203 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26204 MVT::v2i64, ShAmt);
26205 } else {
26206 SDValue ByteShift = DAG.getTargetConstant(
26207 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26208 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26209 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26210 ByteShift);
26211 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26212 ByteShift);
26213 }
26214 }
26215
26216 // Change opcode to non-immediate version.
26217 Opc = getTargetVShiftUniformOpcode(Opc, true);
26218
26219 // The return type has to be a 128-bit type with the same element
26220 // type as the input type.
26221 MVT EltVT = VT.getVectorElementType();
26222 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26223
26224 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26225 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26226}
26227
26228/// Return Mask with the necessary casting or extending
26229/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26230static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26231 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26232 const SDLoc &dl) {
26233
26234 if (isAllOnesConstant(Mask))
26235 return DAG.getConstant(1, dl, MaskVT);
26236 if (X86::isZeroNode(Mask))
26237 return DAG.getConstant(0, dl, MaskVT);
26238
26239 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26240
26241 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26242 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26243 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26244 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it instead.
26245 SDValue Lo, Hi;
26246 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26247 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26248 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26250 } else {
26251 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26252 Mask.getSimpleValueType().getSizeInBits());
26253 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
26254 // are extracted by EXTRACT_SUBVECTOR.
26255 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26256 DAG.getBitcast(BitcastVT, Mask),
26257 DAG.getVectorIdxConstant(0, dl));
26258 }
26259}
26260
26261/// Return (and \p Op, \p Mask) for compare instructions or
26262/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26263/// necessary casting or extending for \p Mask when lowering masking intrinsics
26264 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 MVT VT = Op.getSimpleValueType();
26269 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26270 unsigned OpcodeSelect = ISD::VSELECT;
26271 SDLoc dl(Op);
26272
26273 if (isAllOnesConstant(Mask))
26274 return Op;
26275
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277
26278 if (PreservedSrc.isUndef())
26279 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26280 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26281}
26282
26283/// Creates an SDNode for a predicated scalar operation.
26284/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26285/// The mask is coming as MVT::i8 and it should be transformed
26286/// to MVT::v1i1 while lowering masking intrinsics.
26287/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26288/// "X86select" instead of "vselect". We just can't create the "vselect" node
26289/// for a scalar instruction.
26290 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26291 SDValue PreservedSrc,
26292 const X86Subtarget &Subtarget,
26293 SelectionDAG &DAG) {
26294 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26295 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26296 return Op;
26297
26298 MVT VT = Op.getSimpleValueType();
26299 SDLoc dl(Op);
26300
26301 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26302 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26303 DAG.getBitcast(MVT::v8i1, Mask),
26304 DAG.getVectorIdxConstant(0, dl));
26305 if (Op.getOpcode() == X86ISD::FSETCCM ||
26306 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26307 Op.getOpcode() == X86ISD::VFPCLASSS)
26308 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26309
26310 if (PreservedSrc.isUndef())
26311 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26312
26313 if (MaskConst) {
26314 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26315 // Discard op and blend passthrough with scalar op src/dst.
26316 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26317 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26318 ShuffleMask[0] = VT.getVectorNumElements();
26319 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26320 ShuffleMask);
26321 }
26322
26323 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26324}
26325
26326 static int getSEHRegistrationNodeSize(const Function *Fn) {
26327 if (!Fn->hasPersonalityFn())
26329 "querying registration node size for function without personality");
26330 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26331 // WinEHStatePass for the full struct definition.
26332 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26333 case EHPersonality::MSVC_X86SEH: return 24;
26334 case EHPersonality::MSVC_CXX: return 16;
26335 default: break;
26336 }
26338 "can only recover FP for 32-bit MSVC EH personality functions");
26339}
26340
26341/// When the MSVC runtime transfers control to us, either to an outlined
26342/// function or when returning to a parent frame after catching an exception, we
26343/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26344/// Here's the math:
26345/// RegNodeBase = EntryEBP - RegNodeSize
26346/// ParentFP = RegNodeBase - ParentFrameOffset
26347/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26348/// subtracting the offset (negative on x86) takes us back to the parent FP.
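/// For example, with hypothetical values EntryEBP = 0x1000, a C++ EH
/// RegNodeSize of 16 and a ParentFrameOffset of -32, this computes
/// RegNodeBase = 0x1000 - 16 = 0xff0 and ParentFP = 0xff0 - (-32) = 0x1010.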
26349 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26350 SDValue EntryEBP) {
26351 MachineFunction &MF = DAG.getMachineFunction();
26352 SDLoc dl;
26353
26354 // It's possible that the parent function no longer has a personality function
26355 // if the exceptional code was optimized away, in which case we just return
26356 // the incoming EBP.
26357 if (!Fn->hasPersonalityFn())
26358 return EntryEBP;
26359
26360 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26361 // registration, or the .set_setframe offset.
26362 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26363 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26364 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26365 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26366 SDValue ParentFrameOffset =
26367 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26368
26369 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26370 // prologue to RBP in the parent function.
26371 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26372 if (Subtarget.is64Bit())
26373 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26374
26375 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26376 // RegNodeBase = EntryEBP - RegNodeSize
26377 // ParentFP = RegNodeBase - ParentFrameOffset
26378 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26379 DAG.getConstant(RegNodeSize, dl, PtrVT));
26380 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26381}
26382
26383SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26384 SelectionDAG &DAG) const {
26385 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26386 auto isRoundModeCurDirection = [](SDValue Rnd) {
26387 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26388 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26389
26390 return false;
26391 };
26392 auto isRoundModeSAE = [](SDValue Rnd) {
26393 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26394 unsigned RC = C->getZExtValue();
26395 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26396 // Clear the NO_EXC bit and check remaining bits.
26397 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26398 // As a convenience we allow no other bits or explicitly
26399 // current direction.
26400 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26401 }
26402 }
26403
26404 return false;
26405 };
26406 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26407 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26408 RC = C->getZExtValue();
26409 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26410 // Clear the NO_EXC bit and check remaining bits.
26411 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26412 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26413 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26414 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26415 RC == X86::STATIC_ROUNDING::TO_ZERO;
26416 }
26417 }
26418
26419 return false;
26420 };
26421
26422 SDLoc dl(Op);
26423 unsigned IntNo = Op.getConstantOperandVal(0);
26424 MVT VT = Op.getSimpleValueType();
26425 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26426
26427 // Propagate flags from original node to transformed node(s).
26428 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26429
26430 if (IntrData) {
26431 switch(IntrData->Type) {
26432 case INTR_TYPE_1OP: {
26433 // We specify 2 possible opcodes for intrinsics with rounding modes.
26434 // First, we check if the intrinsic may have non-default rounding mode,
26435 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26436 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26437 if (IntrWithRoundingModeOpcode != 0) {
26438 SDValue Rnd = Op.getOperand(2);
26439 unsigned RC = 0;
26440 if (isRoundModeSAEToX(Rnd, RC))
26441 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26442 Op.getOperand(1),
26443 DAG.getTargetConstant(RC, dl, MVT::i32));
26444 if (!isRoundModeCurDirection(Rnd))
26445 return SDValue();
26446 }
26447 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26448 Op.getOperand(1));
26449 }
26450 case INTR_TYPE_1OP_SAE: {
26451 SDValue Sae = Op.getOperand(2);
26452
26453 unsigned Opc;
26454 if (isRoundModeCurDirection(Sae))
26455 Opc = IntrData->Opc0;
26456 else if (isRoundModeSAE(Sae))
26457 Opc = IntrData->Opc1;
26458 else
26459 return SDValue();
26460
26461 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26462 }
26463 case INTR_TYPE_2OP: {
26464 SDValue Src2 = Op.getOperand(2);
26465
26466 // We specify 2 possible opcodes for intrinsics with rounding modes.
26467 // First, we check if the intrinsic may have non-default rounding mode,
26468 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26469 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26470 if (IntrWithRoundingModeOpcode != 0) {
26471 SDValue Rnd = Op.getOperand(3);
26472 unsigned RC = 0;
26473 if (isRoundModeSAEToX(Rnd, RC))
26474 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26475 Op.getOperand(1), Src2,
26476 DAG.getTargetConstant(RC, dl, MVT::i32));
26477 if (!isRoundModeCurDirection(Rnd))
26478 return SDValue();
26479 }
26480
26481 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26482 Op.getOperand(1), Src2);
26483 }
26484 case INTR_TYPE_2OP_SAE: {
26485 SDValue Sae = Op.getOperand(3);
26486
26487 unsigned Opc;
26488 if (isRoundModeCurDirection(Sae))
26489 Opc = IntrData->Opc0;
26490 else if (isRoundModeSAE(Sae))
26491 Opc = IntrData->Opc1;
26492 else
26493 return SDValue();
26494
26495 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26496 Op.getOperand(2));
26497 }
26498 case INTR_TYPE_3OP:
26499 case INTR_TYPE_3OP_IMM8: {
26500 SDValue Src1 = Op.getOperand(1);
26501 SDValue Src2 = Op.getOperand(2);
26502 SDValue Src3 = Op.getOperand(3);
26503
26504 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26505 Src3.getValueType() != MVT::i8) {
26506 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26507 }
26508
26509 // We specify 2 possible opcodes for intrinsics with rounding modes.
26510 // First, we check if the intrinsic may have non-default rounding mode,
26511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26513 if (IntrWithRoundingModeOpcode != 0) {
26514 SDValue Rnd = Op.getOperand(4);
26515 unsigned RC = 0;
26516 if (isRoundModeSAEToX(Rnd, RC))
26517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26518 Src1, Src2, Src3,
26519 DAG.getTargetConstant(RC, dl, MVT::i32));
26520 if (!isRoundModeCurDirection(Rnd))
26521 return SDValue();
26522 }
26523
26524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26525 {Src1, Src2, Src3});
26526 }
26527 case INTR_TYPE_4OP_IMM8: {
26528 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26529 SDValue Src4 = Op.getOperand(4);
26530 if (Src4.getValueType() != MVT::i8) {
26531 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26532 }
26533
26534 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26535 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26536 Src4);
26537 }
26538 case INTR_TYPE_1OP_MASK: {
26539 SDValue Src = Op.getOperand(1);
26540 SDValue PassThru = Op.getOperand(2);
26541 SDValue Mask = Op.getOperand(3);
26542 // We add rounding mode to the Node when
26543 // - RC Opcode is specified and
26544 // - RC is not "current direction".
26545 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26546 if (IntrWithRoundingModeOpcode != 0) {
26547 SDValue Rnd = Op.getOperand(4);
26548 unsigned RC = 0;
26549 if (isRoundModeSAEToX(Rnd, RC))
26550 return getVectorMaskingNode(
26551 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26552 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26553 Mask, PassThru, Subtarget, DAG);
26554 if (!isRoundModeCurDirection(Rnd))
26555 return SDValue();
26556 }
26557 return getVectorMaskingNode(
26558 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26559 Subtarget, DAG);
26560 }
26561 case INTR_TYPE_1OP_MASK_SAE: {
26562 SDValue Src = Op.getOperand(1);
26563 SDValue PassThru = Op.getOperand(2);
26564 SDValue Mask = Op.getOperand(3);
26565 SDValue Rnd = Op.getOperand(4);
26566
26567 unsigned Opc;
26568 if (isRoundModeCurDirection(Rnd))
26569 Opc = IntrData->Opc0;
26570 else if (isRoundModeSAE(Rnd))
26571 Opc = IntrData->Opc1;
26572 else
26573 return SDValue();
26574
26575 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26576 Subtarget, DAG);
26577 }
26578 case INTR_TYPE_SCALAR_MASK: {
26579 SDValue Src1 = Op.getOperand(1);
26580 SDValue Src2 = Op.getOperand(2);
26581 SDValue passThru = Op.getOperand(3);
26582 SDValue Mask = Op.getOperand(4);
26583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26584 // There are 2 kinds of intrinsics in this group:
26585 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
26586 // (2) With rounding mode and sae - 7 operands.
26587 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26588 if (Op.getNumOperands() == (5U + HasRounding)) {
26589 if (HasRounding) {
26590 SDValue Rnd = Op.getOperand(5);
26591 unsigned RC = 0;
26592 if (isRoundModeSAEToX(Rnd, RC))
26593 return getScalarMaskingNode(
26594 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26595 DAG.getTargetConstant(RC, dl, MVT::i32)),
26596 Mask, passThru, Subtarget, DAG);
26597 if (!isRoundModeCurDirection(Rnd))
26598 return SDValue();
26599 }
26600 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26601 Src2),
26602 Mask, passThru, Subtarget, DAG);
26603 }
26604
26605 assert(Op.getNumOperands() == (6U + HasRounding) &&
26606 "Unexpected intrinsic form");
26607 SDValue RoundingMode = Op.getOperand(5);
26608 unsigned Opc = IntrData->Opc0;
26609 if (HasRounding) {
26610 SDValue Sae = Op.getOperand(6);
26611 if (isRoundModeSAE(Sae))
26612 Opc = IntrWithRoundingModeOpcode;
26613 else if (!isRoundModeCurDirection(Sae))
26614 return SDValue();
26615 }
26616 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26617 Src2, RoundingMode),
26618 Mask, passThru, Subtarget, DAG);
26619 }
26620 case INTR_TYPE_SCALAR_MASK_RND: {
26621 SDValue Src1 = Op.getOperand(1);
26622 SDValue Src2 = Op.getOperand(2);
26623 SDValue passThru = Op.getOperand(3);
26624 SDValue Mask = Op.getOperand(4);
26625 SDValue Rnd = Op.getOperand(5);
26626
26627 SDValue NewOp;
26628 unsigned RC = 0;
26629 if (isRoundModeCurDirection(Rnd))
26630 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26631 else if (isRoundModeSAEToX(Rnd, RC))
26632 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26633 DAG.getTargetConstant(RC, dl, MVT::i32));
26634 else
26635 return SDValue();
26636
26637 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26638 }
26639 case INTR_TYPE_SCALAR_MASK_SAE: {
26640 SDValue Src1 = Op.getOperand(1);
26641 SDValue Src2 = Op.getOperand(2);
26642 SDValue passThru = Op.getOperand(3);
26643 SDValue Mask = Op.getOperand(4);
26644 SDValue Sae = Op.getOperand(5);
26645 unsigned Opc;
26646 if (isRoundModeCurDirection(Sae))
26647 Opc = IntrData->Opc0;
26648 else if (isRoundModeSAE(Sae))
26649 Opc = IntrData->Opc1;
26650 else
26651 return SDValue();
26652
26653 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26654 Mask, passThru, Subtarget, DAG);
26655 }
26656 case INTR_TYPE_2OP_MASK: {
26657 SDValue Src1 = Op.getOperand(1);
26658 SDValue Src2 = Op.getOperand(2);
26659 SDValue PassThru = Op.getOperand(3);
26660 SDValue Mask = Op.getOperand(4);
26661 SDValue NewOp;
26662 if (IntrData->Opc1 != 0) {
26663 SDValue Rnd = Op.getOperand(5);
26664 unsigned RC = 0;
26665 if (isRoundModeSAEToX(Rnd, RC))
26666 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26667 DAG.getTargetConstant(RC, dl, MVT::i32));
26668 else if (!isRoundModeCurDirection(Rnd))
26669 return SDValue();
26670 }
26671 if (!NewOp)
26672 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26673 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26674 }
26675 case INTR_TYPE_2OP_MASK_SAE: {
26676 SDValue Src1 = Op.getOperand(1);
26677 SDValue Src2 = Op.getOperand(2);
26678 SDValue PassThru = Op.getOperand(3);
26679 SDValue Mask = Op.getOperand(4);
26680
26681 unsigned Opc = IntrData->Opc0;
26682 if (IntrData->Opc1 != 0) {
26683 SDValue Sae = Op.getOperand(5);
26684 if (isRoundModeSAE(Sae))
26685 Opc = IntrData->Opc1;
26686 else if (!isRoundModeCurDirection(Sae))
26687 return SDValue();
26688 }
26689
26690 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26691 Mask, PassThru, Subtarget, DAG);
26692 }
26693 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26694 SDValue Src1 = Op.getOperand(1);
26695 SDValue Src2 = Op.getOperand(2);
26696 SDValue Src3 = Op.getOperand(3);
26697 SDValue PassThru = Op.getOperand(4);
26698 SDValue Mask = Op.getOperand(5);
26699 SDValue Sae = Op.getOperand(6);
26700 unsigned Opc;
26701 if (isRoundModeCurDirection(Sae))
26702 Opc = IntrData->Opc0;
26703 else if (isRoundModeSAE(Sae))
26704 Opc = IntrData->Opc1;
26705 else
26706 return SDValue();
26707
26708 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26709 Mask, PassThru, Subtarget, DAG);
26710 }
26711 case INTR_TYPE_3OP_MASK_SAE: {
26712 SDValue Src1 = Op.getOperand(1);
26713 SDValue Src2 = Op.getOperand(2);
26714 SDValue Src3 = Op.getOperand(3);
26715 SDValue PassThru = Op.getOperand(4);
26716 SDValue Mask = Op.getOperand(5);
26717
26718 unsigned Opc = IntrData->Opc0;
26719 if (IntrData->Opc1 != 0) {
26720 SDValue Sae = Op.getOperand(6);
26721 if (isRoundModeSAE(Sae))
26722 Opc = IntrData->Opc1;
26723 else if (!isRoundModeCurDirection(Sae))
26724 return SDValue();
26725 }
26726 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26727 Mask, PassThru, Subtarget, DAG);
26728 }
26729 case BLENDV: {
26730 SDValue Src1 = Op.getOperand(1);
26731 SDValue Src2 = Op.getOperand(2);
26732 SDValue Src3 = Op.getOperand(3);
26733
26734 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26735 Src3 = DAG.getBitcast(MaskVT, Src3);
26736
26737 // Reverse the operands to match VSELECT order.
26738 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26739 }
26740 case VPERM_2OP : {
26741 SDValue Src1 = Op.getOperand(1);
26742 SDValue Src2 = Op.getOperand(2);
26743
26744 // Swap Src1 and Src2 in the node creation
26745 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26746 }
26747 case CFMA_OP_MASKZ:
26748 case CFMA_OP_MASK: {
26749 SDValue Src1 = Op.getOperand(1);
26750 SDValue Src2 = Op.getOperand(2);
26751 SDValue Src3 = Op.getOperand(3);
26752 SDValue Mask = Op.getOperand(4);
26753 MVT VT = Op.getSimpleValueType();
26754
26755 SDValue PassThru = Src3;
26756 if (IntrData->Type == CFMA_OP_MASKZ)
26757 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26758
26759 // We add rounding mode to the Node when
26760 // - RC Opcode is specified and
26761 // - RC is not "current direction".
26762 SDValue NewOp;
26763 if (IntrData->Opc1 != 0) {
26764 SDValue Rnd = Op.getOperand(5);
26765 unsigned RC = 0;
26766 if (isRoundModeSAEToX(Rnd, RC))
26767 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26768 DAG.getTargetConstant(RC, dl, MVT::i32));
26769 else if (!isRoundModeCurDirection(Rnd))
26770 return SDValue();
26771 }
26772 if (!NewOp)
26773 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26774 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26775 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26776 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26778 }
26779 case IFMA_OP:
26780 // NOTE: We need to swizzle the operands to pass the multiply operands
26781 // first.
26782 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26783 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26784 case FPCLASSS: {
26785 SDValue Src1 = Op.getOperand(1);
26786 SDValue Imm = Op.getOperand(2);
26787 SDValue Mask = Op.getOperand(3);
26788 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26789 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26790 Subtarget, DAG);
26791 // Need to fill with zeros to ensure the bitcast will produce zeroes
26792 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26793 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26794 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26795 DAG.getVectorIdxConstant(0, dl));
26796 return DAG.getBitcast(MVT::i8, Ins);
26797 }
26798
26799 case CMP_MASK_CC: {
26800 MVT MaskVT = Op.getSimpleValueType();
26801 SDValue CC = Op.getOperand(3);
26802 SDValue Mask = Op.getOperand(4);
26803 // We specify 2 possible opcodes for intrinsics with rounding modes.
26804 // First, we check if the intrinsic may have non-default rounding mode,
26805 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26806 if (IntrData->Opc1 != 0) {
26807 SDValue Sae = Op.getOperand(5);
26808 if (isRoundModeSAE(Sae))
26809 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26810 Op.getOperand(2), CC, Mask, Sae);
26811 if (!isRoundModeCurDirection(Sae))
26812 return SDValue();
26813 }
26814 //default rounding mode
26815 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26816 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26817 }
26818 case CMP_MASK_SCALAR_CC: {
26819 SDValue Src1 = Op.getOperand(1);
26820 SDValue Src2 = Op.getOperand(2);
26821 SDValue CC = Op.getOperand(3);
26822 SDValue Mask = Op.getOperand(4);
26823
26824 SDValue Cmp;
26825 if (IntrData->Opc1 != 0) {
26826 SDValue Sae = Op.getOperand(5);
26827 if (isRoundModeSAE(Sae))
26828 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26829 else if (!isRoundModeCurDirection(Sae))
26830 return SDValue();
26831 }
26832 //default rounding mode
26833 if (!Cmp.getNode())
26834 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26835
26836 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26837 Subtarget, DAG);
26838 // Need to fill with zeros to ensure the bitcast will produce zeroes
26839 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26840 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26841 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26842 DAG.getVectorIdxConstant(0, dl));
26843 return DAG.getBitcast(MVT::i8, Ins);
26844 }
26845 case COMI: { // Comparison intrinsics
26846 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26847 SDValue LHS = Op.getOperand(1);
26848 SDValue RHS = Op.getOperand(2);
26849 // Some conditions require the operands to be swapped.
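// e.g. comilt(a, b) is evaluated as comi(b, a) and then tested with the
// "above" (COND_A) condition below.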
26850 if (CC == ISD::SETLT || CC == ISD::SETLE)
26851 std::swap(LHS, RHS);
26852
26853 // For AVX10.2, support EQ and NE.
26854 bool HasAVX10_2_COMX =
26855 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26856
26857 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26859 // For bf16 types we need to fall back.
26859 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26860
26861 auto ComiOpCode = IntrData->Opc0;
26862 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26863
26864 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26865 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26866
26867 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26868
26869 SDValue SetCC;
26870 switch (CC) {
26871 case ISD::SETEQ: {
26872 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26873 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26874 break;
26875 // (ZF = 1 and PF = 0)
26876 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26877 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26878 break;
26879 }
26880 case ISD::SETNE: {
26881 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26882 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26883 break;
26884 // (ZF = 0 or PF = 1)
26885 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26886 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26887 break;
26888 }
26889 case ISD::SETGT: // (CF = 0 and ZF = 0)
26890 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26891 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26892 break;
26893 }
26894 case ISD::SETGE: // CF = 0
26895 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26896 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26897 break;
26898 default:
26899 llvm_unreachable("Unexpected illegal condition!");
26900 }
26901 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26902 }
26903 case COMI_RM: { // Comparison intrinsics with Sae
26904 SDValue LHS = Op.getOperand(1);
26905 SDValue RHS = Op.getOperand(2);
26906 unsigned CondVal = Op.getConstantOperandVal(3);
26907 SDValue Sae = Op.getOperand(4);
26908
26909 SDValue FCmp;
26910 if (isRoundModeCurDirection(Sae))
26911 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26912 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26913 else if (isRoundModeSAE(Sae))
26914 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26915 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26916 else
26917 return SDValue();
26918 // Need to fill with zeros to ensure the bitcast will produce zeroes
26919 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26920 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26921 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26922 DAG.getVectorIdxConstant(0, dl));
26923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26924 DAG.getBitcast(MVT::i16, Ins));
26925 }
26926 case VSHIFT: {
26927 SDValue SrcOp = Op.getOperand(1);
26928 SDValue ShAmt = Op.getOperand(2);
26929 assert(ShAmt.getValueType() == MVT::i32 &&
26930 "Unexpected VSHIFT amount type");
26931
26932 // Catch shift-by-constant.
26933 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26934 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26935 Op.getSimpleValueType(), SrcOp,
26936 CShAmt->getZExtValue(), DAG);
26937
26938 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26939 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26940 SrcOp, ShAmt, 0, Subtarget, DAG);
26941 }
26942 case COMPRESS_EXPAND_IN_REG: {
26943 SDValue Mask = Op.getOperand(3);
26944 SDValue DataToCompress = Op.getOperand(1);
26945 SDValue PassThru = Op.getOperand(2);
26946 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26947 return Op.getOperand(1);
26948
26949 // Avoid false dependency.
26950 if (PassThru.isUndef())
26951 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26952
26953 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26954 Mask);
26955 }
26956 case FIXUPIMM:
26957 case FIXUPIMM_MASKZ: {
26958 SDValue Src1 = Op.getOperand(1);
26959 SDValue Src2 = Op.getOperand(2);
26960 SDValue Src3 = Op.getOperand(3);
26961 SDValue Imm = Op.getOperand(4);
26962 SDValue Mask = Op.getOperand(5);
26963 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26964 ? Src1
26965 : getZeroVector(VT, Subtarget, DAG, dl);
26966
26967 unsigned Opc = IntrData->Opc0;
26968 if (IntrData->Opc1 != 0) {
26969 SDValue Sae = Op.getOperand(6);
26970 if (isRoundModeSAE(Sae))
26971 Opc = IntrData->Opc1;
26972 else if (!isRoundModeCurDirection(Sae))
26973 return SDValue();
26974 }
26975
26976 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26977
26978 if (Op.getValueType().isVector())
26979 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26980
26981 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26982 }
26983 case ROUNDP: {
26984 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26985 // Clear the upper bits of the rounding immediate so that the legacy
26986 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26987 uint64_t Round = Op.getConstantOperandVal(2);
26988 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26989 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26990 Op.getOperand(1), RoundingMode);
26991 }
26992 case ROUNDS: {
26993 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26994 // Clear the upper bits of the rounding immediate so that the legacy
26995 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26996 uint64_t Round = Op.getConstantOperandVal(3);
26997 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26998 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26999 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27000 }
27001 case BEXTRI: {
27002 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27003
27004 uint64_t Imm = Op.getConstantOperandVal(2);
27005 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27006 Op.getValueType());
27007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27008 Op.getOperand(1), Control);
27009 }
27010 // ADC/SBB
27011 case ADX: {
27012 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27013 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27014
27015 SDValue Res;
27016 // If the carry in is zero, then we should just use ADD/SUB instead of
27017 // ADC/SBB.
27018 if (isNullConstant(Op.getOperand(1))) {
27019 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27020 Op.getOperand(3));
27021 } else {
27022 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27023 DAG.getAllOnesConstant(dl, MVT::i8));
27024 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27025 Op.getOperand(3), GenCF.getValue(1));
27026 }
27027 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27028 SDValue Results[] = { SetCC, Res };
27029 return DAG.getMergeValues(Results, dl);
27030 }
27031 case CVTPD2PS_MASK:
27032 case CVTPD2DQ_MASK:
27033 case CVTQQ2PS_MASK:
27034 case TRUNCATE_TO_REG: {
27035 SDValue Src = Op.getOperand(1);
27036 SDValue PassThru = Op.getOperand(2);
27037 SDValue Mask = Op.getOperand(3);
27038
27039 if (isAllOnesConstant(Mask))
27040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27041
27042 MVT SrcVT = Src.getSimpleValueType();
27043 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27044 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27045 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27046 {Src, PassThru, Mask});
27047 }
27048 case TRUNCATE2_TO_REG: {
27049 SDValue Src = Op.getOperand(1);
27050 SDValue Src2 = Op.getOperand(2);
27051 SDValue PassThru = Op.getOperand(3);
27052 SDValue Mask = Op.getOperand(4);
27053
27054 if (isAllOnesConstant(Mask))
27055 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27056
27057 MVT Src2VT = Src2.getSimpleValueType();
27058 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27059 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27060 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27061 {Src, Src2, PassThru, Mask});
27062 }
27063 case CVTPS2PH_MASK: {
27064 SDValue Src = Op.getOperand(1);
27065 SDValue Rnd = Op.getOperand(2);
27066 SDValue PassThru = Op.getOperand(3);
27067 SDValue Mask = Op.getOperand(4);
27068
27069 unsigned RC = 0;
27070 unsigned Opc = IntrData->Opc0;
27071 bool SAE = Src.getValueType().is512BitVector() &&
27072 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27073 if (SAE) {
27075 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27076 }
27077
27078 if (isAllOnesConstant(Mask))
27079 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27080
27081 if (SAE)
27083 else
27084 Opc = IntrData->Opc1;
27085 MVT SrcVT = Src.getSimpleValueType();
27086 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27087 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27088 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27089 }
27090 case CVTNEPS2BF16_MASK: {
27091 SDValue Src = Op.getOperand(1);
27092 SDValue PassThru = Op.getOperand(2);
27093 SDValue Mask = Op.getOperand(3);
27094
27095 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27096 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27097
27098 // Break false dependency.
27099 if (PassThru.isUndef())
27100 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27101
27102 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27103 Mask);
27104 }
27105 default:
27106 break;
27107 }
27108 }
27109
27110 switch (IntNo) {
27111 default: return SDValue(); // Don't custom lower most intrinsics.
27112
27113 // ptest and testp intrinsics. The intrinsic these come from are designed to
27114 // return an integer value, not just an instruction so lower it to the ptest
27115 // or testp pattern and a setcc for the result.
27116 case Intrinsic::x86_avx512_ktestc_b:
27117 case Intrinsic::x86_avx512_ktestc_w:
27118 case Intrinsic::x86_avx512_ktestc_d:
27119 case Intrinsic::x86_avx512_ktestc_q:
27120 case Intrinsic::x86_avx512_ktestz_b:
27121 case Intrinsic::x86_avx512_ktestz_w:
27122 case Intrinsic::x86_avx512_ktestz_d:
27123 case Intrinsic::x86_avx512_ktestz_q:
27124 case Intrinsic::x86_sse41_ptestz:
27125 case Intrinsic::x86_sse41_ptestc:
27126 case Intrinsic::x86_sse41_ptestnzc:
27127 case Intrinsic::x86_avx_ptestz_256:
27128 case Intrinsic::x86_avx_ptestc_256:
27129 case Intrinsic::x86_avx_ptestnzc_256:
27130 case Intrinsic::x86_avx_vtestz_ps:
27131 case Intrinsic::x86_avx_vtestc_ps:
27132 case Intrinsic::x86_avx_vtestnzc_ps:
27133 case Intrinsic::x86_avx_vtestz_pd:
27134 case Intrinsic::x86_avx_vtestc_pd:
27135 case Intrinsic::x86_avx_vtestnzc_pd:
27136 case Intrinsic::x86_avx_vtestz_ps_256:
27137 case Intrinsic::x86_avx_vtestc_ps_256:
27138 case Intrinsic::x86_avx_vtestnzc_ps_256:
27139 case Intrinsic::x86_avx_vtestz_pd_256:
27140 case Intrinsic::x86_avx_vtestc_pd_256:
27141 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27142 unsigned TestOpc = X86ISD::PTEST;
27143 X86::CondCode X86CC;
27144 switch (IntNo) {
27145 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27146 case Intrinsic::x86_avx512_ktestc_b:
27147 case Intrinsic::x86_avx512_ktestc_w:
27148 case Intrinsic::x86_avx512_ktestc_d:
27149 case Intrinsic::x86_avx512_ktestc_q:
27150 // CF = 1
27151 TestOpc = X86ISD::KTEST;
27152 X86CC = X86::COND_B;
27153 break;
27154 case Intrinsic::x86_avx512_ktestz_b:
27155 case Intrinsic::x86_avx512_ktestz_w:
27156 case Intrinsic::x86_avx512_ktestz_d:
27157 case Intrinsic::x86_avx512_ktestz_q:
27158 TestOpc = X86ISD::KTEST;
27159 X86CC = X86::COND_E;
27160 break;
27161 case Intrinsic::x86_avx_vtestz_ps:
27162 case Intrinsic::x86_avx_vtestz_pd:
27163 case Intrinsic::x86_avx_vtestz_ps_256:
27164 case Intrinsic::x86_avx_vtestz_pd_256:
27165 TestOpc = X86ISD::TESTP;
27166 [[fallthrough]];
27167 case Intrinsic::x86_sse41_ptestz:
27168 case Intrinsic::x86_avx_ptestz_256:
27169 // ZF = 1
27170 X86CC = X86::COND_E;
27171 break;
27172 case Intrinsic::x86_avx_vtestc_ps:
27173 case Intrinsic::x86_avx_vtestc_pd:
27174 case Intrinsic::x86_avx_vtestc_ps_256:
27175 case Intrinsic::x86_avx_vtestc_pd_256:
27176 TestOpc = X86ISD::TESTP;
27177 [[fallthrough]];
27178 case Intrinsic::x86_sse41_ptestc:
27179 case Intrinsic::x86_avx_ptestc_256:
27180 // CF = 1
27181 X86CC = X86::COND_B;
27182 break;
27183 case Intrinsic::x86_avx_vtestnzc_ps:
27184 case Intrinsic::x86_avx_vtestnzc_pd:
27185 case Intrinsic::x86_avx_vtestnzc_ps_256:
27186 case Intrinsic::x86_avx_vtestnzc_pd_256:
27187 TestOpc = X86ISD::TESTP;
27188 [[fallthrough]];
27189 case Intrinsic::x86_sse41_ptestnzc:
27190 case Intrinsic::x86_avx_ptestnzc_256:
27191 // ZF and CF = 0
27192 X86CC = X86::COND_A;
27193 break;
27194 }
27195
27196 SDValue LHS = Op.getOperand(1);
27197 SDValue RHS = Op.getOperand(2);
27198 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27199 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27200 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27201 }
27202
27203 case Intrinsic::x86_sse42_pcmpistria128:
27204 case Intrinsic::x86_sse42_pcmpestria128:
27205 case Intrinsic::x86_sse42_pcmpistric128:
27206 case Intrinsic::x86_sse42_pcmpestric128:
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 case Intrinsic::x86_sse42_pcmpestrio128:
27209 case Intrinsic::x86_sse42_pcmpistris128:
27210 case Intrinsic::x86_sse42_pcmpestris128:
27211 case Intrinsic::x86_sse42_pcmpistriz128:
27212 case Intrinsic::x86_sse42_pcmpestriz128: {
27213 unsigned Opcode;
27214 X86::CondCode X86CC;
27215 switch (IntNo) {
27216 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27217 case Intrinsic::x86_sse42_pcmpistria128:
27218 Opcode = X86ISD::PCMPISTR;
27219 X86CC = X86::COND_A;
27220 break;
27221 case Intrinsic::x86_sse42_pcmpestria128:
27222 Opcode = X86ISD::PCMPESTR;
27223 X86CC = X86::COND_A;
27224 break;
27225 case Intrinsic::x86_sse42_pcmpistric128:
27226 Opcode = X86ISD::PCMPISTR;
27227 X86CC = X86::COND_B;
27228 break;
27229 case Intrinsic::x86_sse42_pcmpestric128:
27230 Opcode = X86ISD::PCMPESTR;
27231 X86CC = X86::COND_B;
27232 break;
27233 case Intrinsic::x86_sse42_pcmpistrio128:
27234 Opcode = X86ISD::PCMPISTR;
27235 X86CC = X86::COND_O;
27236 break;
27237 case Intrinsic::x86_sse42_pcmpestrio128:
27238 Opcode = X86ISD::PCMPESTR;
27239 X86CC = X86::COND_O;
27240 break;
27241 case Intrinsic::x86_sse42_pcmpistris128:
27242 Opcode = X86ISD::PCMPISTR;
27243 X86CC = X86::COND_S;
27244 break;
27245 case Intrinsic::x86_sse42_pcmpestris128:
27246 Opcode = X86ISD::PCMPESTR;
27247 X86CC = X86::COND_S;
27248 break;
27249 case Intrinsic::x86_sse42_pcmpistriz128:
27250 Opcode = X86ISD::PCMPISTR;
27251 X86CC = X86::COND_E;
27252 break;
27253 case Intrinsic::x86_sse42_pcmpestriz128:
27254 Opcode = X86ISD::PCMPESTR;
27255 X86CC = X86::COND_E;
27256 break;
27257 }
27258    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27259    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27260 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27261 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27262 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27263 }
27264
27265 case Intrinsic::x86_sse42_pcmpistri128:
27266 case Intrinsic::x86_sse42_pcmpestri128: {
27267 unsigned Opcode;
27268 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27269 Opcode = X86ISD::PCMPISTR;
27270 else
27271 Opcode = X86ISD::PCMPESTR;
27272
27273    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27274    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27275 return DAG.getNode(Opcode, dl, VTs, NewOps);
27276 }
27277
27278 case Intrinsic::x86_sse42_pcmpistrm128:
27279 case Intrinsic::x86_sse42_pcmpestrm128: {
27280 unsigned Opcode;
27281 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27282 Opcode = X86ISD::PCMPISTR;
27283 else
27284 Opcode = X86ISD::PCMPESTR;
27285
27286    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27287    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27288 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27289 }
27290
27291 case Intrinsic::eh_sjlj_lsda: {
27292 MachineFunction &MF = DAG.getMachineFunction();
27293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27294 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27295 auto &Context = MF.getContext();
27296 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27297 Twine(MF.getFunctionNumber()));
27298 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27299 DAG.getMCSymbol(S, PtrVT));
27300 }
27301
27302 case Intrinsic::x86_seh_lsda: {
27303 // Compute the symbol for the LSDA. We know it'll get emitted later.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 SDValue Op1 = Op.getOperand(1);
27306 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27307    MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
27308        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27309
27310 // Generate a simple absolute symbol reference. This intrinsic is only
27311 // supported on 32-bit Windows, which isn't PIC.
27312 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27313 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27314 }
27315
27316 case Intrinsic::eh_recoverfp: {
27317 SDValue FnOp = Op.getOperand(1);
27318 SDValue IncomingFPOp = Op.getOperand(2);
27319 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27320 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27321 if (!Fn)
27322      report_fatal_error(
27323          "llvm.eh.recoverfp must take a function as the first argument");
27324 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27325 }
27326
27327 case Intrinsic::localaddress: {
27328 // Returns one of the stack, base, or frame pointer registers, depending on
27329 // which is used to reference local variables.
27330 MachineFunction &MF = DAG.getMachineFunction();
27331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27332 Register Reg;
27333 if (RegInfo->hasBasePointer(MF))
27334 Reg = RegInfo->getBaseRegister();
27335 else { // Handles the SP or FP case.
27336 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27337 if (CantUseFP)
27338 Reg = RegInfo->getPtrSizedStackRegister(MF);
27339 else
27340 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27341 }
27342 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27343 }
27344 case Intrinsic::x86_avx512_vp2intersect_q_512:
27345 case Intrinsic::x86_avx512_vp2intersect_q_256:
27346 case Intrinsic::x86_avx512_vp2intersect_q_128:
27347 case Intrinsic::x86_avx512_vp2intersect_d_512:
27348 case Intrinsic::x86_avx512_vp2intersect_d_256:
27349 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27350 SDLoc DL(Op);
27351 MVT MaskVT = Op.getSimpleValueType();
27352 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27353    SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27354                                    Op.getOperand(1), Op.getOperand(2));
27355 SDValue Result0 =
27356 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27357 SDValue Result1 =
27358 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27359 return DAG.getMergeValues({Result0, Result1}, DL);
27360 }
27361 case Intrinsic::x86_mmx_pslli_w:
27362 case Intrinsic::x86_mmx_pslli_d:
27363 case Intrinsic::x86_mmx_pslli_q:
27364 case Intrinsic::x86_mmx_psrli_w:
27365 case Intrinsic::x86_mmx_psrli_d:
27366 case Intrinsic::x86_mmx_psrli_q:
27367 case Intrinsic::x86_mmx_psrai_w:
27368 case Intrinsic::x86_mmx_psrai_d: {
27369 SDLoc DL(Op);
27370 SDValue ShAmt = Op.getOperand(2);
27371 // If the argument is a constant, convert it to a target constant.
27372 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27373      // Clamp out-of-bounds shift amounts since they would otherwise be masked
27374      // to 8 bits, which may bring them back into bounds.
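      // For example, a shift amount of 256 would truncate to 0 in an 8-bit
      // immediate; clamping it to 255 keeps it out of range so the shift still
      // produces the expected all-zero (or sign-fill) result.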
27375 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27376 if (ShiftAmount == 0)
27377 return Op.getOperand(1);
27378
27379 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27380 Op.getOperand(0), Op.getOperand(1),
27381 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27382 }
27383
27384 unsigned NewIntrinsic;
27385 switch (IntNo) {
27386 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27387 case Intrinsic::x86_mmx_pslli_w:
27388 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27389 break;
27390 case Intrinsic::x86_mmx_pslli_d:
27391 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27392 break;
27393 case Intrinsic::x86_mmx_pslli_q:
27394 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27395 break;
27396 case Intrinsic::x86_mmx_psrli_w:
27397 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27398 break;
27399 case Intrinsic::x86_mmx_psrli_d:
27400 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27401 break;
27402 case Intrinsic::x86_mmx_psrli_q:
27403 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27404 break;
27405 case Intrinsic::x86_mmx_psrai_w:
27406 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27407 break;
27408 case Intrinsic::x86_mmx_psrai_d:
27409 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27410 break;
27411 }
27412
27413    // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27414    // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27415    // MMX register.
27416 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27417 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27418 DAG.getTargetConstant(NewIntrinsic, DL,
27420 Op.getOperand(1), ShAmt);
27421 }
27422 case Intrinsic::thread_pointer: {
27423 if (Subtarget.isTargetELF()) {
27424 SDLoc dl(Op);
27425 EVT PtrVT = Op.getValueType();
27426 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27427      Value *Ptr = Constant::getNullValue(PointerType::get(
27428          *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27429 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27430 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27431 }
27432    report_fatal_error(
27433        "Target OS doesn't support __builtin_thread_pointer() yet.");
27434 }
27435 }
27436}
27437
27438 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27439                                  SDValue Src, SDValue Mask, SDValue Base,
27440 SDValue Index, SDValue ScaleOp, SDValue Chain,
27441 const X86Subtarget &Subtarget) {
27442 SDLoc dl(Op);
27443 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27444 // Scale must be constant.
27445 if (!C)
27446 return SDValue();
27447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27448 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27449 TLI.getPointerTy(DAG.getDataLayout()));
27450 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27452 // If source is undef or we know it won't be used, use a zero vector
27453 // to break register dependency.
27454 // TODO: use undef instead and let BreakFalseDeps deal with it?
27455 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27456 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27457
27458 // Cast mask to an integer type.
27459 Mask = DAG.getBitcast(MaskVT, Mask);
27460
27461  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27462
27463 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27464 SDValue Res =
27465      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27466                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27467 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27468}
27469
27470 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27471                              SDValue Src, SDValue Mask, SDValue Base,
27472 SDValue Index, SDValue ScaleOp, SDValue Chain,
27473 const X86Subtarget &Subtarget) {
27474 MVT VT = Op.getSimpleValueType();
27475 SDLoc dl(Op);
27476 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27477 // Scale must be constant.
27478 if (!C)
27479 return SDValue();
27480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27481 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27482 TLI.getPointerTy(DAG.getDataLayout()));
27483 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27484                               Src.getSimpleValueType().getVectorNumElements());
27485  MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27486
27487 // We support two versions of the gather intrinsics. One with scalar mask and
27488 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27489 if (Mask.getValueType() != MaskVT)
27490 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27491
27492 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27493 // If source is undef or we know it won't be used, use a zero vector
27494 // to break register dependency.
27495 // TODO: use undef instead and let BreakFalseDeps deal with it?
27496 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27497 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27498
27499  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27500
27501 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27502 SDValue Res =
27503      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27504                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27505 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27506}
27507
27508 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27509                               SDValue Src, SDValue Mask, SDValue Base,
27510 SDValue Index, SDValue ScaleOp, SDValue Chain,
27511 const X86Subtarget &Subtarget) {
27512 SDLoc dl(Op);
27513 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27514 // Scale must be constant.
27515 if (!C)
27516 return SDValue();
27517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27518 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27519 TLI.getPointerTy(DAG.getDataLayout()));
27520 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27521 Src.getSimpleValueType().getVectorNumElements());
27522 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27523
27524 // We support two versions of the scatter intrinsics. One with scalar mask and
27525 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27526 if (Mask.getValueType() != MaskVT)
27527 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27528
27529  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27530
27531 SDVTList VTs = DAG.getVTList(MVT::Other);
27532 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27533 SDValue Res =
27534      DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27535                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27536 return Res;
27537}
27538
27539 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27540                                SDValue Mask, SDValue Base, SDValue Index,
27541 SDValue ScaleOp, SDValue Chain,
27542 const X86Subtarget &Subtarget) {
27543 SDLoc dl(Op);
27544 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27545 // Scale must be constant.
27546 if (!C)
27547 return SDValue();
27548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27549 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27550 TLI.getPointerTy(DAG.getDataLayout()));
27551 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27552 SDValue Segment = DAG.getRegister(0, MVT::i32);
27553 MVT MaskVT =
27554 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27555 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27556 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27557 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27558 return SDValue(Res, 0);
27559}
27560
27561/// Handles the lowering of builtin intrinsics with chain that return their
27562/// value into registers EDX:EAX.
27563 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27564/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27565/// TargetOpcode.
27566/// Returns a Glue value which can be used to add extra copy-from-reg if the
27567 /// expanded intrinsics implicitly define extra registers (i.e. not just
27568/// EDX:EAX).
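/// For example, the RDTSC/RDTSCP lowering below calls this helper with
/// SrcReg == 0, while the RDPMC/RDPRU/XGETBV lowering passes X86::ECX so that
/// operand 2 of the intrinsic selects which counter or register to read.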
27569 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27570                                            SelectionDAG &DAG,
27571 unsigned TargetOpcode,
27572 unsigned SrcReg,
27573 const X86Subtarget &Subtarget,
27574                                            SmallVectorImpl<SDValue> &Results) {
27575   SDValue Chain = N->getOperand(0);
27576 SDValue Glue;
27577
27578 if (SrcReg) {
27579 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27580 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27581 Glue = Chain.getValue(1);
27582 }
27583
27584 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27585 SDValue N1Ops[] = {Chain, Glue};
27586 SDNode *N1 = DAG.getMachineNode(
27587 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27588 Chain = SDValue(N1, 0);
27589
27590   // Read the result that the expanded instruction leaves in EDX:EAX.
27591 SDValue LO, HI;
27592 if (Subtarget.is64Bit()) {
27593 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27594 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27595 LO.getValue(2));
27596 } else {
27597 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27598 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27599 LO.getValue(2));
27600 }
27601 Chain = HI.getValue(1);
27602 Glue = HI.getValue(2);
27603
27604 if (Subtarget.is64Bit()) {
27605 // Merge the two 32-bit values into a 64-bit one.
27606 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27607 DAG.getConstant(32, DL, MVT::i8));
27608 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27609 Results.push_back(Chain);
27610 return Glue;
27611 }
27612
27613 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27614 SDValue Ops[] = { LO, HI };
27615 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27616 Results.push_back(Pair);
27617 Results.push_back(Chain);
27618 return Glue;
27619}
27620
27621/// Handles the lowering of builtin intrinsics that read the time stamp counter
27622/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27623/// READCYCLECOUNTER nodes.
27624static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27625 SelectionDAG &DAG,
27626 const X86Subtarget &Subtarget,
27627                                     SmallVectorImpl<SDValue> &Results) {
27628   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27629 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27630 // and the EAX register is loaded with the low-order 32 bits.
27631 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27632 /* NoRegister */0, Subtarget,
27633 Results);
27634 if (Opcode != X86::RDTSCP)
27635 return;
27636
27637 SDValue Chain = Results[1];
27638   // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27639 // the ECX register. Add 'ecx' explicitly to the chain.
27640 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27641 Results[1] = ecx;
27642 Results.push_back(ecx.getValue(1));
27643}
27644
27645 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27646                                      SelectionDAG &DAG) {
27647   SmallVector<SDValue, 3> Results;
27648   SDLoc DL(Op);
27649 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27650 Results);
27651 return DAG.getMergeValues(Results, DL);
27652}
27653
27654 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27655   MachineFunction &MF = DAG.getMachineFunction();
27656   SDValue Chain = Op.getOperand(0);
27657 SDValue RegNode = Op.getOperand(2);
27658 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27659 if (!EHInfo)
27660 report_fatal_error("EH registrations only live in functions using WinEH");
27661
27662 // Cast the operand to an alloca, and remember the frame index.
27663 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27664 if (!FINode)
27665 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27666 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27667
27668 // Return the chain operand without making any DAG nodes.
27669 return Chain;
27670}
27671
27672 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27673   MachineFunction &MF = DAG.getMachineFunction();
27674   SDValue Chain = Op.getOperand(0);
27675 SDValue EHGuard = Op.getOperand(2);
27676 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27677 if (!EHInfo)
27678 report_fatal_error("EHGuard only live in functions using WinEH");
27679
27680 // Cast the operand to an alloca, and remember the frame index.
27681 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27682 if (!FINode)
27683 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27684 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27685
27686 // Return the chain operand without making any DAG nodes.
27687 return Chain;
27688}
27689
27690/// Emit Truncating Store with signed or unsigned saturation.
27691static SDValue
27692EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27693 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27694 SelectionDAG &DAG) {
27695 SDVTList VTs = DAG.getVTList(MVT::Other);
27696 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27697 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27698 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27699 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27700}
27701
27702/// Emit Masked Truncating Store with signed or unsigned saturation.
27703static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27704 const SDLoc &DL,
27705 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27706 MachineMemOperand *MMO, SelectionDAG &DAG) {
27707 SDVTList VTs = DAG.getVTList(MVT::Other);
27708 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27709 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27710 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27711}
27712
27713 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27714                                              const MachineFunction &MF) {
27715 if (!Subtarget.is64Bit())
27716 return false;
27717 // 64-bit targets support extended Swift async frame setup,
27718 // except for targets that use the windows 64 prologue.
27719 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27720}
27721
27722 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27723                                       SelectionDAG &DAG) {
27724 unsigned IntNo = Op.getConstantOperandVal(1);
27725 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27726 if (!IntrData) {
27727 switch (IntNo) {
27728
27729 case Intrinsic::swift_async_context_addr: {
27730 SDLoc dl(Op);
27731 auto &MF = DAG.getMachineFunction();
27732 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27733 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27734        MF.getFrameInfo().setFrameAddressIsTaken(true);
27735        X86FI->setHasSwiftAsyncContext(true);
27736 SDValue Chain = Op->getOperand(0);
27737 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27738 SDValue Result =
27739 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27740 DAG.getTargetConstant(8, dl, MVT::i32)),
27741 0);
27742 // Return { result, chain }.
27743 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27744 CopyRBP.getValue(1));
27745 } else {
27746 // No special extended frame, create or reuse an existing stack slot.
27747 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27748 if (!X86FI->getSwiftAsyncContextFrameIdx())
27749 X86FI->setSwiftAsyncContextFrameIdx(
27750 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27751 false));
27752 SDValue Result =
27753 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27754 PtrSize == 8 ? MVT::i64 : MVT::i32);
27755 // Return { result, chain }.
27756 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27757 Op->getOperand(0));
27758 }
27759 }
27760
27761 case llvm::Intrinsic::x86_seh_ehregnode:
27762 return MarkEHRegistrationNode(Op, DAG);
27763 case llvm::Intrinsic::x86_seh_ehguard:
27764 return MarkEHGuard(Op, DAG);
27765 case llvm::Intrinsic::x86_rdpkru: {
27766 SDLoc dl(Op);
27767 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27768 // Create a RDPKRU node and pass 0 to the ECX parameter.
27769 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27770 DAG.getConstant(0, dl, MVT::i32));
27771 }
27772 case llvm::Intrinsic::x86_wrpkru: {
27773 SDLoc dl(Op);
27774 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27775 // to the EDX and ECX parameters.
27776 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27777 Op.getOperand(0), Op.getOperand(2),
27778 DAG.getConstant(0, dl, MVT::i32),
27779 DAG.getConstant(0, dl, MVT::i32));
27780 }
27781 case llvm::Intrinsic::asan_check_memaccess: {
27782 // Mark this as adjustsStack because it will be lowered to a call.
27783      DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27784      // Don't do anything here, we will expand these intrinsics out later.
27785 return Op;
27786 }
27787 case llvm::Intrinsic::x86_flags_read_u32:
27788 case llvm::Intrinsic::x86_flags_read_u64:
27789 case llvm::Intrinsic::x86_flags_write_u32:
27790 case llvm::Intrinsic::x86_flags_write_u64: {
27791 // We need a frame pointer because this will get lowered to a PUSH/POP
27792 // sequence.
27793      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27794      MFI.setHasCopyImplyingStackAdjustment(true);
27795      // Don't do anything here, we will expand these intrinsics out later
27796 // during FinalizeISel in EmitInstrWithCustomInserter.
27797 return Op;
27798 }
27799 case Intrinsic::x86_lwpins32:
27800 case Intrinsic::x86_lwpins64:
27801 case Intrinsic::x86_umwait:
27802 case Intrinsic::x86_tpause: {
27803 SDLoc dl(Op);
27804 SDValue Chain = Op->getOperand(0);
27805 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27806 unsigned Opcode;
27807
27808 switch (IntNo) {
27809 default: llvm_unreachable("Impossible intrinsic");
27810 case Intrinsic::x86_umwait:
27811 Opcode = X86ISD::UMWAIT;
27812 break;
27813 case Intrinsic::x86_tpause:
27814 Opcode = X86ISD::TPAUSE;
27815 break;
27816 case Intrinsic::x86_lwpins32:
27817 case Intrinsic::x86_lwpins64:
27818 Opcode = X86ISD::LWPINS;
27819 break;
27820 }
27821
27822      SDValue Operation =
27823          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27824 Op->getOperand(3), Op->getOperand(4));
27825 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27826 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27827 Operation.getValue(1));
27828 }
27829 case Intrinsic::x86_enqcmd:
27830 case Intrinsic::x86_enqcmds: {
27831 SDLoc dl(Op);
27832 SDValue Chain = Op.getOperand(0);
27833 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27834 unsigned Opcode;
27835 switch (IntNo) {
27836 default: llvm_unreachable("Impossible intrinsic!");
27837 case Intrinsic::x86_enqcmd:
27838 Opcode = X86ISD::ENQCMD;
27839 break;
27840 case Intrinsic::x86_enqcmds:
27841 Opcode = X86ISD::ENQCMDS;
27842 break;
27843 }
27844 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27845 Op.getOperand(3));
27846 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27847 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27848 Operation.getValue(1));
27849 }
27850 case Intrinsic::x86_aesenc128kl:
27851 case Intrinsic::x86_aesdec128kl:
27852 case Intrinsic::x86_aesenc256kl:
27853 case Intrinsic::x86_aesdec256kl: {
27854 SDLoc DL(Op);
27855 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27856 SDValue Chain = Op.getOperand(0);
27857 unsigned Opcode;
27858
27859 switch (IntNo) {
27860 default: llvm_unreachable("Impossible intrinsic");
27861 case Intrinsic::x86_aesenc128kl:
27862 Opcode = X86ISD::AESENC128KL;
27863 break;
27864 case Intrinsic::x86_aesdec128kl:
27865 Opcode = X86ISD::AESDEC128KL;
27866 break;
27867 case Intrinsic::x86_aesenc256kl:
27868 Opcode = X86ISD::AESENC256KL;
27869 break;
27870 case Intrinsic::x86_aesdec256kl:
27871 Opcode = X86ISD::AESDEC256KL;
27872 break;
27873 }
27874
27875      MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27876      MachineMemOperand *MMO = MemIntr->getMemOperand();
27877 EVT MemVT = MemIntr->getMemoryVT();
27878      SDValue Operation = DAG.getMemIntrinsicNode(
27879          Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27880 MMO);
27881 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27882
27883 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27884 {ZF, Operation.getValue(0), Operation.getValue(2)});
27885 }
27886 case Intrinsic::x86_aesencwide128kl:
27887 case Intrinsic::x86_aesdecwide128kl:
27888 case Intrinsic::x86_aesencwide256kl:
27889 case Intrinsic::x86_aesdecwide256kl: {
27890 SDLoc DL(Op);
27891 SDVTList VTs = DAG.getVTList(
27892 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27893 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27894 SDValue Chain = Op.getOperand(0);
27895 unsigned Opcode;
27896
27897 switch (IntNo) {
27898 default: llvm_unreachable("Impossible intrinsic");
27899 case Intrinsic::x86_aesencwide128kl:
27900 Opcode = X86ISD::AESENCWIDE128KL;
27901 break;
27902 case Intrinsic::x86_aesdecwide128kl:
27903 Opcode = X86ISD::AESDECWIDE128KL;
27904 break;
27905 case Intrinsic::x86_aesencwide256kl:
27906 Opcode = X86ISD::AESENCWIDE256KL;
27907 break;
27908 case Intrinsic::x86_aesdecwide256kl:
27909 Opcode = X86ISD::AESDECWIDE256KL;
27910 break;
27911 }
27912
27913      MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27914      MachineMemOperand *MMO = MemIntr->getMemOperand();
27915 EVT MemVT = MemIntr->getMemoryVT();
27916      SDValue Operation = DAG.getMemIntrinsicNode(
27917          Opcode, DL, VTs,
27918 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27919 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27920 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27921 MemVT, MMO);
27922 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27923
27924 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27925 {ZF, Operation.getValue(1), Operation.getValue(2),
27926 Operation.getValue(3), Operation.getValue(4),
27927 Operation.getValue(5), Operation.getValue(6),
27928 Operation.getValue(7), Operation.getValue(8),
27929 Operation.getValue(9)});
27930 }
27931 case Intrinsic::x86_testui: {
27932 SDLoc dl(Op);
27933 SDValue Chain = Op.getOperand(0);
27934 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27935 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27936 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27937 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27938 Operation.getValue(1));
27939 }
27940 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27941 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27943 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27944 case Intrinsic::x86_t2rpntlvwz0_internal:
27945 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1_internal:
27947 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27948 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27949      X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27950      unsigned IntNo = Op.getConstantOperandVal(1);
27951 unsigned Opc = 0;
27952 switch (IntNo) {
27953 default:
27954 llvm_unreachable("Unexpected intrinsic!");
27955 case Intrinsic::x86_t2rpntlvwz0_internal:
27956 Opc = X86::PT2RPNTLVWZ0V;
27957 break;
27958 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27959 Opc = X86::PT2RPNTLVWZ0T1V;
27960 break;
27961 case Intrinsic::x86_t2rpntlvwz1_internal:
27962 Opc = X86::PT2RPNTLVWZ1V;
27963 break;
27964 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27965 Opc = X86::PT2RPNTLVWZ1T1V;
27966 break;
27967 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27968 Opc = X86::PT2RPNTLVWZ0RSV;
27969 break;
27970 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27971 Opc = X86::PT2RPNTLVWZ0RST1V;
27972 break;
27973 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27974 Opc = X86::PT2RPNTLVWZ1RSV;
27975 break;
27976 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27977 Opc = X86::PT2RPNTLVWZ1RST1V;
27978 break;
27979 }
27980
27981 SDLoc DL(Op);
27982 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27983
27984 SDValue Ops[] = {Op.getOperand(2), // Row
27985 Op.getOperand(3), // Col0
27986 Op.getOperand(4), // Col1
27987 Op.getOperand(5), // Base
27988 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27989 Op.getOperand(6), // Index
27990 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27991 DAG.getRegister(0, MVT::i16), // Segment
27992 Op.getOperand(0)}; // Chain
27993
27994 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27995 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27996 SDValue(Res, 0));
27997 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27998 SDValue(Res, 0));
27999 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28000 }
28001 case Intrinsic::x86_atomic_bts_rm:
28002 case Intrinsic::x86_atomic_btc_rm:
28003 case Intrinsic::x86_atomic_btr_rm: {
28004 SDLoc DL(Op);
28005 MVT VT = Op.getSimpleValueType();
28006 SDValue Chain = Op.getOperand(0);
28007 SDValue Op1 = Op.getOperand(2);
28008 SDValue Op2 = Op.getOperand(3);
28009 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28010 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28011                                                               : X86ISD::LBTR_RM;
28012      MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28013 SDValue Res =
28014 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28015 {Chain, Op1, Op2}, VT, MMO);
28016 Chain = Res.getValue(1);
28017 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28018 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28019 }
28020 case Intrinsic::x86_atomic_bts:
28021 case Intrinsic::x86_atomic_btc:
28022 case Intrinsic::x86_atomic_btr: {
28023 SDLoc DL(Op);
28024 MVT VT = Op.getSimpleValueType();
28025 SDValue Chain = Op.getOperand(0);
28026 SDValue Op1 = Op.getOperand(2);
28027 SDValue Op2 = Op.getOperand(3);
28028 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28029 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28030 : X86ISD::LBTR;
28031 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28032 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28033 SDValue Res =
28034 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28035 {Chain, Op1, Op2, Size}, VT, MMO);
28036 Chain = Res.getValue(1);
28037 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28038 unsigned Imm = Op2->getAsZExtVal();
28039 if (Imm)
28040 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28041 DAG.getShiftAmountConstant(Imm, VT, DL));
28042 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28043 }
28044 case Intrinsic::x86_cmpccxadd32:
28045 case Intrinsic::x86_cmpccxadd64: {
28046 SDLoc DL(Op);
28047 SDValue Chain = Op.getOperand(0);
28048 SDValue Addr = Op.getOperand(2);
28049 SDValue Src1 = Op.getOperand(3);
28050 SDValue Src2 = Op.getOperand(4);
28051 SDValue CC = Op.getOperand(5);
28052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28053      SDValue Operation = DAG.getMemIntrinsicNode(
28054          X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28055 MVT::i32, MMO);
28056 return Operation;
28057 }
28058 case Intrinsic::x86_aadd32:
28059 case Intrinsic::x86_aadd64:
28060 case Intrinsic::x86_aand32:
28061 case Intrinsic::x86_aand64:
28062 case Intrinsic::x86_aor32:
28063 case Intrinsic::x86_aor64:
28064 case Intrinsic::x86_axor32:
28065 case Intrinsic::x86_axor64: {
28066 SDLoc DL(Op);
28067 SDValue Chain = Op.getOperand(0);
28068 SDValue Op1 = Op.getOperand(2);
28069 SDValue Op2 = Op.getOperand(3);
28070 MVT VT = Op2.getSimpleValueType();
28071 unsigned Opc = 0;
28072 switch (IntNo) {
28073 default:
28074 llvm_unreachable("Unknown Intrinsic");
28075 case Intrinsic::x86_aadd32:
28076 case Intrinsic::x86_aadd64:
28077 Opc = X86ISD::AADD;
28078 break;
28079 case Intrinsic::x86_aand32:
28080 case Intrinsic::x86_aand64:
28081 Opc = X86ISD::AAND;
28082 break;
28083 case Intrinsic::x86_aor32:
28084 case Intrinsic::x86_aor64:
28085 Opc = X86ISD::AOR;
28086 break;
28087 case Intrinsic::x86_axor32:
28088 case Intrinsic::x86_axor64:
28089 Opc = X86ISD::AXOR;
28090 break;
28091 }
28092 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28094 {Chain, Op1, Op2}, VT, MMO);
28095 }
28096 case Intrinsic::x86_atomic_add_cc:
28097 case Intrinsic::x86_atomic_sub_cc:
28098 case Intrinsic::x86_atomic_or_cc:
28099 case Intrinsic::x86_atomic_and_cc:
28100 case Intrinsic::x86_atomic_xor_cc: {
28101 SDLoc DL(Op);
28102 SDValue Chain = Op.getOperand(0);
28103 SDValue Op1 = Op.getOperand(2);
28104 SDValue Op2 = Op.getOperand(3);
28105 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28106 MVT VT = Op2.getSimpleValueType();
28107 unsigned Opc = 0;
28108 switch (IntNo) {
28109 default:
28110 llvm_unreachable("Unknown Intrinsic");
28111 case Intrinsic::x86_atomic_add_cc:
28112 Opc = X86ISD::LADD;
28113 break;
28114 case Intrinsic::x86_atomic_sub_cc:
28115 Opc = X86ISD::LSUB;
28116 break;
28117 case Intrinsic::x86_atomic_or_cc:
28118 Opc = X86ISD::LOR;
28119 break;
28120 case Intrinsic::x86_atomic_and_cc:
28121 Opc = X86ISD::LAND;
28122 break;
28123 case Intrinsic::x86_atomic_xor_cc:
28124 Opc = X86ISD::LXOR;
28125 break;
28126 }
28127 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28128 SDValue LockArith =
28129 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28130 {Chain, Op1, Op2}, VT, MMO);
28131 Chain = LockArith.getValue(1);
28132 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28133 }
28134 }
28135 return SDValue();
28136 }
28137
28138 SDLoc dl(Op);
28139 switch(IntrData->Type) {
28140 default: llvm_unreachable("Unknown Intrinsic Type");
28141 case RDSEED:
28142 case RDRAND: {
28143 // Emit the node with the right value type.
28144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28146
28147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28148    // Otherwise return the RDRAND/RDSEED result, which is always 0, cast to i32.
28149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28150 DAG.getConstant(1, dl, Op->getValueType(1)),
28151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28152 SDValue(Result.getNode(), 1)};
28153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28154
28155 // Return { result, isValid, chain }.
28156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28157 SDValue(Result.getNode(), 2));
28158 }
28159 case GATHER_AVX2: {
28160 SDValue Chain = Op.getOperand(0);
28161 SDValue Src = Op.getOperand(2);
28162 SDValue Base = Op.getOperand(3);
28163 SDValue Index = Op.getOperand(4);
28164 SDValue Mask = Op.getOperand(5);
28165 SDValue Scale = Op.getOperand(6);
28166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28167 Scale, Chain, Subtarget);
28168 }
28169 case GATHER: {
28170 //gather(v1, mask, index, base, scale);
28171 SDValue Chain = Op.getOperand(0);
28172 SDValue Src = Op.getOperand(2);
28173 SDValue Base = Op.getOperand(3);
28174 SDValue Index = Op.getOperand(4);
28175 SDValue Mask = Op.getOperand(5);
28176 SDValue Scale = Op.getOperand(6);
28177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28178 Chain, Subtarget);
28179 }
28180 case SCATTER: {
28181 //scatter(base, mask, index, v1, scale);
28182 SDValue Chain = Op.getOperand(0);
28183 SDValue Base = Op.getOperand(2);
28184 SDValue Mask = Op.getOperand(3);
28185 SDValue Index = Op.getOperand(4);
28186 SDValue Src = Op.getOperand(5);
28187 SDValue Scale = Op.getOperand(6);
28188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28189 Scale, Chain, Subtarget);
28190 }
28191 case PREFETCH: {
28192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28193 assert((HintVal == 2 || HintVal == 3) &&
28194 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28196 SDValue Chain = Op.getOperand(0);
28197 SDValue Mask = Op.getOperand(2);
28198 SDValue Index = Op.getOperand(3);
28199 SDValue Base = Op.getOperand(4);
28200 SDValue Scale = Op.getOperand(5);
28201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28202 Subtarget);
28203 }
28204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28205 case RDTSC: {
28206    SmallVector<SDValue, 2> Results;
28207    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28208 Results);
28209 return DAG.getMergeValues(Results, dl);
28210 }
28211 // Read Performance Monitoring Counters.
28212 case RDPMC:
28213 // Read Processor Register.
28214 case RDPRU:
28215 // GetExtended Control Register.
28216 case XGETBV: {
28217    SmallVector<SDValue, 2> Results;
28218
28219 // RDPMC uses ECX to select the index of the performance counter to read.
28220 // RDPRU uses ECX to select the processor register to read.
28221 // XGETBV uses ECX to select the index of the XCR register to return.
28222 // The result is stored into registers EDX:EAX.
28223 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28224 Subtarget, Results);
28225 return DAG.getMergeValues(Results, dl);
28226 }
28227 // XTEST intrinsics.
28228 case XTEST: {
28229 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28230 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28231
28232 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28233 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28235 Ret, SDValue(InTrans.getNode(), 1));
28236 }
28237  case TRUNCATE_TO_MEM_VI8:
28238  case TRUNCATE_TO_MEM_VI16:
28239  case TRUNCATE_TO_MEM_VI32: {
28240 SDValue Mask = Op.getOperand(4);
28241 SDValue DataToTruncate = Op.getOperand(3);
28242 SDValue Addr = Op.getOperand(2);
28243 SDValue Chain = Op.getOperand(0);
28244
28245    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28246    assert(MemIntr && "Expected MemIntrinsicSDNode!");
28247
28248 EVT MemVT = MemIntr->getMemoryVT();
28249
28250 uint16_t TruncationOp = IntrData->Opc0;
28251 switch (TruncationOp) {
28252 case X86ISD::VTRUNC: {
28253 if (isAllOnesConstant(Mask)) // return just a truncate store
28254 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28255 MemIntr->getMemOperand());
28256
28257 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28258 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28259 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28260
28261 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28262 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28263 true /* truncating */);
28264 }
28265 case X86ISD::VTRUNCUS:
28266 case X86ISD::VTRUNCS: {
28267 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28268 if (isAllOnesConstant(Mask))
28269 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28270 MemIntr->getMemOperand(), DAG);
28271
28272 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28273 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28274
28275 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28276 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28277 }
28278 default:
28279 llvm_unreachable("Unsupported truncstore intrinsic");
28280 }
28281 }
28282 case INTR_TYPE_CAST_MMX:
28283 return SDValue(); // handled in combineINTRINSIC_*
28284 }
28285}
28286
28287SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28288 SelectionDAG &DAG) const {
28289 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28290 MFI.setReturnAddressIsTaken(true);
28291
28292 unsigned Depth = Op.getConstantOperandVal(0);
28293 SDLoc dl(Op);
28294 EVT PtrVT = Op.getValueType();
28295
28296 if (Depth > 0) {
28297 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28298 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28299 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28300 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28301 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28302 MachinePointerInfo());
28303 }
28304
28305 // Just load the return address.
28306 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28307 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28308 MachinePointerInfo());
28309}
28310
28311SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28312 SelectionDAG &DAG) const {
28313   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28314   return getReturnAddressFrameIndex(DAG);
28315}
28316
28317SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28318 MachineFunction &MF = DAG.getMachineFunction();
28319 MachineFrameInfo &MFI = MF.getFrameInfo();
28320 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28321 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28322 EVT VT = Op.getValueType();
28323
28324 MFI.setFrameAddressIsTaken(true);
28325
28326 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28327 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28328 // is not possible to crawl up the stack without looking at the unwind codes
28329 // simultaneously.
28330 int FrameAddrIndex = FuncInfo->getFAIndex();
28331 if (!FrameAddrIndex) {
28332 // Set up a frame object for the return address.
28333 unsigned SlotSize = RegInfo->getSlotSize();
28334 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28335 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28336 FuncInfo->setFAIndex(FrameAddrIndex);
28337 }
28338 return DAG.getFrameIndex(FrameAddrIndex, VT);
28339 }
28340
28341 Register FrameReg =
28342 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28343 SDLoc dl(Op); // FIXME probably not meaningful
28344 unsigned Depth = Op.getConstantOperandVal(0);
28345 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28346 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28347 "Invalid Frame Register!");
28348 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28349 while (Depth--)
28350 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28351 MachinePointerInfo());
28352 return FrameAddr;
28353}
28354
28355// FIXME? Maybe this could be a TableGen attribute on some registers and
28356// this table could be generated automatically from RegInfo.
28357 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28358                                               const MachineFunction &MF) const {
28359 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28360
28361   Register Reg = StringSwitch<unsigned>(RegName)
28362                      .Case("esp", X86::ESP)
28363 .Case("rsp", X86::RSP)
28364 .Case("ebp", X86::EBP)
28365 .Case("rbp", X86::RBP)
28366 .Case("r14", X86::R14)
28367 .Case("r15", X86::R15)
28368 .Default(0);
28369
28370 if (Reg == X86::EBP || Reg == X86::RBP) {
28371 if (!TFI.hasFP(MF))
28372 report_fatal_error("register " + StringRef(RegName) +
28373 " is allocatable: function has no frame pointer");
28374#ifndef NDEBUG
28375 else {
28376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28379 "Invalid Frame Register!");
28380 }
28381#endif
28382 }
28383
28384 return Reg;
28385}
28386
28387SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28388 SelectionDAG &DAG) const {
28389 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28390 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28391}
28392
28393 Register X86TargetLowering::getExceptionPointerRegister(
28394     const Constant *PersonalityFn) const {
28395 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28396 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28397
28398 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28399}
28400
28401 Register X86TargetLowering::getExceptionSelectorRegister(
28402     const Constant *PersonalityFn) const {
28403 // Funclet personalities don't use selectors (the runtime does the selection).
28404   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28405     return X86::NoRegister;
28406 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28407}
28408
28409 bool X86TargetLowering::needsFixedCatchObjects() const {
28410   return Subtarget.isTargetWin64();
28411}
28412
28413SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28414 SDValue Chain = Op.getOperand(0);
28415 SDValue Offset = Op.getOperand(1);
28416 SDValue Handler = Op.getOperand(2);
28417 SDLoc dl (Op);
28418
28419 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28421 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28422 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28423 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28424 "Invalid Frame Register!");
28425 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28426 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28427
28428 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28429 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28430 dl));
28431 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28432 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28433 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28434
28435 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28436 DAG.getRegister(StoreAddrReg, PtrVT));
28437}
28438
28439SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28440 SelectionDAG &DAG) const {
28441 SDLoc DL(Op);
28442   // If the subtarget is not 64-bit, we may need the global base reg
28443   // after isel expands the pseudo, i.e., after the CGBR pass has run.
28444 // Therefore, ask for the GlobalBaseReg now, so that the pass
28445 // inserts the code for us in case we need it.
28446 // Otherwise, we will end up in a situation where we will
28447 // reference a virtual register that is not defined!
28448 if (!Subtarget.is64Bit()) {
28449 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28450 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28451 }
28452 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28453 DAG.getVTList(MVT::i32, MVT::Other),
28454 Op.getOperand(0), Op.getOperand(1));
28455}
28456
28457SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28458 SelectionDAG &DAG) const {
28459 SDLoc DL(Op);
28460 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28461 Op.getOperand(0), Op.getOperand(1));
28462}
28463
28464SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28465 SelectionDAG &DAG) const {
28466 SDLoc DL(Op);
28467 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28468 Op.getOperand(0));
28469}
28470
28471 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28472   return Op.getOperand(0);
28473}
28474
28475SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28476 SelectionDAG &DAG) const {
28477 SDValue Root = Op.getOperand(0);
28478 SDValue Trmp = Op.getOperand(1); // trampoline
28479 SDValue FPtr = Op.getOperand(2); // nested function
28480 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28481 SDLoc dl (Op);
28482
28483 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28484 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28485
28486 if (Subtarget.is64Bit()) {
28487 SDValue OutChains[6];
28488
28489 // Large code-model.
28490 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28491 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28492
28493 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28494 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28495
28496 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28497
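    // Roughly, the trampoline bytes stored below decode as:
    //   offset  0: 49 BB <imm64>   movabsq $<nested function>, %r11
    //   offset 10: 49 BA <imm64>   movabsq $<nest value>, %r10
    //   offset 20: 49 FF E3        jmpq    *%r11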
28498 // Load the pointer to the nested function into R11.
28499 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28500 SDValue Addr = Trmp;
28501 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28502 Addr, MachinePointerInfo(TrmpAddr));
28503
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(2, dl, MVT::i64));
28506 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28507 MachinePointerInfo(TrmpAddr, 2), Align(2));
28508
28509 // Load the 'nest' parameter value into R10.
28510 // R10 is specified in X86CallingConv.td
28511 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28512 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28513 DAG.getConstant(10, dl, MVT::i64));
28514 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28515 Addr, MachinePointerInfo(TrmpAddr, 10));
28516
28517 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28518 DAG.getConstant(12, dl, MVT::i64));
28519 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28520 MachinePointerInfo(TrmpAddr, 12), Align(2));
28521
28522 // Jump to the nested function.
28523 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28525 DAG.getConstant(20, dl, MVT::i64));
28526 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28527 Addr, MachinePointerInfo(TrmpAddr, 20));
28528
28529 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28530 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28531 DAG.getConstant(22, dl, MVT::i64));
28532 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28533 Addr, MachinePointerInfo(TrmpAddr, 22));
28534
28535 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28536 } else {
28537 const Function *Func =
28538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28539 CallingConv::ID CC = Func->getCallingConv();
28540 unsigned NestReg;
28541
28542 switch (CC) {
28543 default:
28544 llvm_unreachable("Unsupported calling convention");
28545 case CallingConv::C:
28546    case CallingConv::X86_StdCall: {
28547      // Pass 'nest' parameter in ECX.
28548 // Must be kept in sync with X86CallingConv.td
28549 NestReg = X86::ECX;
28550
28551 // Check that ECX wasn't needed by an 'inreg' parameter.
28552 FunctionType *FTy = Func->getFunctionType();
28553 const AttributeList &Attrs = Func->getAttributes();
28554
28555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28556 unsigned InRegCount = 0;
28557 unsigned Idx = 0;
28558
28559 for (FunctionType::param_iterator I = FTy->param_begin(),
28560 E = FTy->param_end(); I != E; ++I, ++Idx)
28561 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28562 const DataLayout &DL = DAG.getDataLayout();
28563 // FIXME: should only count parameters that are lowered to integers.
28564 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28565 }
28566
28567 if (InRegCount > 2) {
28568 report_fatal_error("Nest register in use - reduce number of inreg"
28569 " parameters!");
28570 }
28571 }
28572 break;
28573 }
28574    case CallingConv::X86_FastCall:
28575    case CallingConv::X86_ThisCall:
28576    case CallingConv::Fast:
28577 case CallingConv::Tail:
28578    case CallingConv::SwiftTail:
28579      // Pass 'nest' parameter in EAX.
28580 // Must be kept in sync with X86CallingConv.td
28581 NestReg = X86::EAX;
28582 break;
28583 }
28584
28585 SDValue OutChains[4];
28586 SDValue Addr, Disp;
28587
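    // Roughly, the 10-byte trampoline stored below decodes as:
    //   offset 0: B8+r <imm32>   movl $<nest value>, %<NestReg>
    //   offset 5: E9 <rel32>     jmp  <nested function>   ; rel32 = FPtr - (Trmp + 10)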
28588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28589 DAG.getConstant(10, dl, MVT::i32));
28590 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28591
28592 // This is storing the opcode for MOV32ri.
28593 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28594 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28595 OutChains[0] =
28596 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28597 Trmp, MachinePointerInfo(TrmpAddr));
28598
28599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28600 DAG.getConstant(1, dl, MVT::i32));
28601 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28602 MachinePointerInfo(TrmpAddr, 1), Align(1));
28603
28604 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28606 DAG.getConstant(5, dl, MVT::i32));
28607 OutChains[2] =
28608 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28609 MachinePointerInfo(TrmpAddr, 5), Align(1));
28610
28611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28612 DAG.getConstant(6, dl, MVT::i32));
28613 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28614 MachinePointerInfo(TrmpAddr, 6), Align(1));
28615
28616 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28617 }
28618}
28619
28620SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28621 SelectionDAG &DAG) const {
28622 /*
28623   The rounding mode is in bits 11:10 of the FP control word (FPCW), and has the following
28624 settings:
28625 00 Round to nearest
28626 01 Round to -inf
28627 10 Round to +inf
28628 11 Round to 0
28629
28630 GET_ROUNDING, on the other hand, expects the following:
28631 -1 Undefined
28632 0 Round to 0
28633 1 Round to nearest
28634 2 Round to +inf
28635 3 Round to -inf
28636
28637 To perform the conversion, we use a packed lookup table of the four 2-bit
28638   values that we can index by FPCW[11:10]
28639     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28640
28641     (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
28642 */
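  // Worked example: with FPCW[11:10] = 10 (round to +inf), FPCW & 0xc00 = 0x800,
  // 0x800 >> 9 = 4, 0x2d >> 4 = 0x2, and 0x2 & 3 = 2, which is GET_ROUNDING's
  // encoding for "round to +inf".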
28643
28644 MachineFunction &MF = DAG.getMachineFunction();
28645 MVT VT = Op.getSimpleValueType();
28646 SDLoc DL(Op);
28647
28648 // Save FP Control Word to stack slot
28649 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28650 SDValue StackSlot =
28651 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28652
28653 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28654
28655 SDValue Chain = Op.getOperand(0);
28656 SDValue Ops[] = {Chain, StackSlot};
28657 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28658 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28659 Align(2), MachineMemOperand::MOStore);
28660
28661 // Load FP Control Word from stack slot
28662 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28663 Chain = CWD.getValue(1);
28664
28665 // Mask and turn the control bits into a shift for the lookup table.
28666 SDValue Shift =
28667 DAG.getNode(ISD::SRL, DL, MVT::i16,
28668 DAG.getNode(ISD::AND, DL, MVT::i16,
28669 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28670 DAG.getConstant(9, DL, MVT::i8));
28671 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28672
28673 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28674 SDValue RetVal =
28675 DAG.getNode(ISD::AND, DL, MVT::i32,
28676 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28677 DAG.getConstant(3, DL, MVT::i32));
28678
28679 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28680
28681 return DAG.getMergeValues({RetVal, Chain}, DL);
28682}
28683
28684SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28685 SelectionDAG &DAG) const {
28686 MachineFunction &MF = DAG.getMachineFunction();
28687 SDLoc DL(Op);
28688 SDValue Chain = Op.getNode()->getOperand(0);
28689
28690 // FP control word may be set only from data in memory. So we need to allocate
28691 // stack space to save/load FP control word.
28692 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28693 SDValue StackSlot =
28694 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28695 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28696 MachineMemOperand *MMO =
28697 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28698
28699 // Store FP control word into memory.
28700 SDValue Ops[] = {Chain, StackSlot};
28701 Chain = DAG.getMemIntrinsicNode(
28702 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28703
28704 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28705 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28706 Chain = CWD.getValue(1);
28707 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28708 DAG.getConstant(0xf3ff, DL, MVT::i16));
28709
28710 // Calculate new rounding mode.
28711 SDValue NewRM = Op.getNode()->getOperand(1);
28712 SDValue RMBits;
28713 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28714 uint64_t RM = CVal->getZExtValue();
28715 int FieldVal = X86::getRoundingModeX86(RM);
28716
28717 if (FieldVal == X86::rmInvalid) {
28718 FieldVal = X86::rmToNearest;
28719 LLVMContext &C = MF.getFunction().getContext();
28720 C.diagnose(DiagnosticInfoUnsupported(
28721 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28722 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28723 }
28724 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28725 } else {
28726 // Need to convert argument into bits of control word:
28727 // 0 Round to 0 -> 11
28728 // 1 Round to nearest -> 00
28729 // 2 Round to +inf -> 10
28730 // 3 Round to -inf -> 01
28731 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28732 // To make the conversion, put all these values into a value 0xc9 and shift
28733 // it left depending on the rounding mode:
28734 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28735 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28736 // ...
28737 // (0xc9 << (2 * NewRM + 4)) & 0xc00
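// For example: NewRM = 0 (to zero) shifts by 4, (0xc90 & 0xc00) = 0xc00 = 11;
// NewRM = 1 (nearest) shifts by 6, (0x3240 & 0xc00) = 0x000 = 00; NewRM = 2
// (+inf) shifts by 8, (0xc900 & 0xc00) = 0x800 = 10; NewRM = 3 (-inf) shifts
// by 10, (0x32400 & 0xc00) = 0x400 = 01.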
28738 SDValue ShiftValue =
28739 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28740 DAG.getNode(ISD::ADD, DL, MVT::i32,
28741 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28742 DAG.getConstant(1, DL, MVT::i8)),
28743 DAG.getConstant(4, DL, MVT::i32)));
28744 SDValue Shifted =
28745 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28746 ShiftValue);
28747 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28748 DAG.getConstant(0xc00, DL, MVT::i16));
28749 }
28750
28751 // Update rounding mode bits and store the new FP Control Word into stack.
28752 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28753 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28754
28755 // Load FP control word from the slot.
28756 SDValue OpsLD[] = {Chain, StackSlot};
28757 MachineMemOperand *MMOL =
28758 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28759 Chain = DAG.getMemIntrinsicNode(
28760 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28761
28762 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28763 // same way but in bits 14:13.
28764 if (Subtarget.hasSSE1()) {
28765 // Store MXCSR into memory.
28766 Chain = DAG.getNode(
28767 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28768 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28769 StackSlot);
28770
28771 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28772 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28773 Chain = CWD.getValue(1);
28774 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28775 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28776
28777 // Shift X87 RM bits from 11:10 to 14:13.
28778 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28779 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28780 DAG.getConstant(3, DL, MVT::i8));
28781
28782 // Update rounding mode bits and store the new FP Control Word into stack.
28783 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28784 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28785
28786 // Load MXCSR from the slot.
28787 Chain = DAG.getNode(
28788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28789 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28790 StackSlot);
28791 }
28792
28793 return Chain;
28794}
28795
28796const unsigned X87StateSize = 28;
28797const unsigned FPStateSize = 32;
28798[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28799
28800 SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28801 SelectionDAG &DAG) const {
28802 MachineFunction &MF = DAG.getMachineFunction();
28801 SelectionDAG &DAG) const {
28803 SDLoc DL(Op);
28804 SDValue Chain = Op->getOperand(0);
28805 SDValue Ptr = Op->getOperand(1);
28806 auto *Node = cast<FPStateAccessSDNode>(Op);
28807 EVT MemVT = Node->getMemoryVT();
28809 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28810
28811 // Get the x87 state, if it is present.
28812 if (Subtarget.hasX87()) {
28813 Chain =
28814 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28815 {Chain, Ptr}, MemVT, MMO);
28816
28817 // FNSTENV changes the exception mask, so load back the stored environment.
28818 MachineMemOperand::Flags NewFlags =
28819 MachineMemOperand::MOLoad |
28820 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28821 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28822 Chain =
28823 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28824 {Chain, Ptr}, MemVT, MMO);
28825 }
28826
28827 // If target supports SSE, get MXCSR as well.
28828 if (Subtarget.hasSSE1()) {
28829 // Get pointer to the MXCSR location in memory.
28830 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28831 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28832 DAG.getConstant(X87StateSize, DL, PtrVT));
28833 // Store MXCSR into memory.
28834 Chain = DAG.getNode(
28835 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28836 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28837 MXCSRAddr);
28838 }
28839
28840 return Chain;
28841}
28842
28843 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28844 EVT MemVT, MachineMemOperand *MMO,
28845 SelectionDAG &DAG,
28846 const X86Subtarget &Subtarget) {
28847 // Set the x87 state, if it is present.
28848 if (Subtarget.hasX87())
28849 Chain =
28850 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28851 {Chain, Ptr}, MemVT, MMO);
28852 // If target supports SSE, set MXCSR as well.
28853 if (Subtarget.hasSSE1()) {
28854 // Get pointer to the MXCSR location in memory.
28855 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28856 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28857 DAG.getConstant(X87StateSize, DL, PtrVT));
28858 // Load MXCSR from memory.
28859 Chain = DAG.getNode(
28860 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28861 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28862 MXCSRAddr);
28863 }
28864 return Chain;
28865}
28866
28867SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28868 SelectionDAG &DAG) const {
28869 SDLoc DL(Op);
28870 SDValue Chain = Op->getOperand(0);
28871 SDValue Ptr = Op->getOperand(1);
28872 auto *Node = cast<FPStateAccessSDNode>(Op);
28873 EVT MemVT = Node->getMemoryVT();
28875 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28876 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28877}
28878
28879SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28880 SelectionDAG &DAG) const {
28881 MachineFunction &MF = DAG.getMachineFunction();
28882 SDLoc DL(Op);
28883 SDValue Chain = Op.getNode()->getOperand(0);
28884
28885 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28886 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28887 SmallVector<Constant *, 8> FPEnvVals;
28888
28889 // x87 FPU Control Word: masks all floating-point exceptions and sets rounding
28890 // to nearest. FPU precision is set to 53 bits on Windows and to 64 bits
28891 // otherwise, for compatibility with glibc.
28892 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28893 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28894 Constant *Zero = ConstantInt::get(ItemTy, 0);
28895 for (unsigned I = 0; I < 6; ++I)
28896 FPEnvVals.push_back(Zero);
28897
28898 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28899 // all exception flags, and sets DAZ and FTZ to 0.
28900 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28901 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28902 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28903 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28904 MachinePointerInfo MPI =
28906 MachineMemOperand *MMO = MF.getMachineMemOperand(
28908
28909 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28910}
28911
28912// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28913uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28914 assert((Amt < 8) && "Shift/Rotation amount out of range");
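// For reference: getGFNICtrlImm(ISD::SHL, 1) evaluates to 0x0001020408102040
// and getGFNICtrlImm(ISD::SRL, 1) to 0x0204081020408000.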
28915 switch (Opcode) {
28916 case ISD::BITREVERSE:
28917 return 0x8040201008040201ULL;
28918 case ISD::SHL:
28919 return ((0x0102040810204080ULL >> (Amt)) &
28920 (0x0101010101010101ULL * (0xFF >> (Amt))));
28921 case ISD::SRL:
28922 return ((0x0102040810204080ULL << (Amt)) &
28923 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28924 case ISD::SRA:
28925 return (getGFNICtrlImm(ISD::SRL, Amt) |
28926 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28927 case ISD::ROTL:
28928 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28929 case ISD::ROTR:
28930 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28931 }
28932 llvm_unreachable("Unsupported GFNI opcode");
28933}
28934
28935// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28936SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28937 MVT VT, unsigned Amt = 0) {
28938 assert(VT.getVectorElementType() == MVT::i8 &&
28939 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28940 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28941 SmallVector<SDValue> MaskBits;
28942 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28943 uint64_t Bits = (Imm >> (I % 64)) & 255;
28944 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28945 }
28946 return DAG.getBuildVector(VT, DL, MaskBits);
28947}
28948
28949 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28950 //
28951 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
28952 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28953 // split the vector, perform the operation on its Lo and Hi parts and
28954 // concatenate the results.
28955 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28956 const X86Subtarget &Subtarget) {
28957 assert(Op.getOpcode() == ISD::CTLZ);
28958 SDLoc dl(Op);
28959 MVT VT = Op.getSimpleValueType();
28960 MVT EltVT = VT.getVectorElementType();
28961 unsigned NumElems = VT.getVectorNumElements();
28962
28963 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28964 "Unsupported element type");
28965
28966 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28967 if (NumElems > 16 ||
28968 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28969 return splitVectorIntUnary(Op, DAG, dl);
28970
28971 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28972 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28973 "Unsupported value type for operation");
28974
28975 // Use the natively supported vector instruction vplzcntd.
28976 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28977 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28978 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28979 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28980
28981 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28982}
28983
28984// Lower CTLZ using a PSHUFB lookup table implementation.
28985 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28986 const X86Subtarget &Subtarget,
28987 SelectionDAG &DAG) {
28988 MVT VT = Op.getSimpleValueType();
28989 int NumElts = VT.getVectorNumElements();
28990 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28991 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28992
28993 // Per-nibble leading zero PSHUFB lookup table.
28994 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28995 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28996 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28997 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28998
28999 SmallVector<SDValue, 64> LUTVec;
29000 for (int i = 0; i < NumBytes; ++i)
29001 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29002 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29003
29004 // Begin by bitcasting the input to byte vector, then split those bytes
29005 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29006 // If the hi input nibble is zero then we add both results together, otherwise
29007 // we just take the hi result (by masking the lo result to zero before the
29008 // add).
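// For example, byte 0x1A has a nonzero hi nibble (0x1, LUT value 3), so the lo
// result is masked away and the count is 3; byte 0x0A has a zero hi nibble
// (LUT value 4) and lo nibble 0xA (LUT value 0), giving 4 + 0 = 4.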
29009 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29010 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29011
29012 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29013 SDValue Lo = Op0;
29014 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29015 SDValue HiZ;
29016 if (CurrVT.is512BitVector()) {
29017 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29018 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29019 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29020 } else {
29021 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29022 }
29023
29024 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29025 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29026 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29027 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29028
29029 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29030 // of the current vector width in the same way we did for the nibbles.
29031 // If the upper half of the input element is zero then add the halves'
29032 // leading zero counts together, otherwise just use the upper half's.
29033 // Double the width of the result until we are at target width.
29034 while (CurrVT != VT) {
29035 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29036 int CurrNumElts = CurrVT.getVectorNumElements();
29037 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29038 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29039 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29040
29041 // Check if the upper half of the input element is zero.
29042 if (CurrVT.is512BitVector()) {
29043 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29044 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29045 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29046 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29047 } else {
29048 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29049 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29050 }
29051 HiZ = DAG.getBitcast(NextVT, HiZ);
29052
29053 // Move the upper/lower halves to the lower bits as we'll be extending to
29054 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29055 // together.
29056 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29057 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29058 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29059 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29060 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29061 CurrVT = NextVT;
29062 }
29063
29064 return Res;
29065}
29066
29067 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29068 const X86Subtarget &Subtarget,
29069 SelectionDAG &DAG) {
29070 MVT VT = Op.getSimpleValueType();
29071
29072 if (Subtarget.hasCDI() &&
29073 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29074 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29075 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29076
29077 // Decompose 256-bit ops into smaller 128-bit ops.
29078 if (VT.is256BitVector() && !Subtarget.hasInt256())
29079 return splitVectorIntUnary(Op, DAG, DL);
29080
29081 // Decompose 512-bit ops into smaller 256-bit ops.
29082 if (VT.is512BitVector() && !Subtarget.hasBWI())
29083 return splitVectorIntUnary(Op, DAG, DL);
29084
29085 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29086 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29087}
29088
29089 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29090 SelectionDAG &DAG,
29091 const X86Subtarget &Subtarget) {
29092 MVT VT = Op.getSimpleValueType();
29093 SDValue Input = Op.getOperand(0);
29094
29095 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29096 "Expected vXi8 input for GFNI-based CTLZ lowering");
29097
29098 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29099
29100 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29101 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29102
29103 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29104 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29105 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29106
29107 SDValue LZCNT =
29108 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29109 DAG.getTargetConstant(8, DL, MVT::i8));
29110 return LZCNT;
29111}
29112
29113static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29114 SelectionDAG &DAG) {
29115 MVT VT = Op.getSimpleValueType();
29116 MVT OpVT = VT;
29117 unsigned NumBits = VT.getSizeInBits();
29118 SDLoc dl(Op);
29119 unsigned Opc = Op.getOpcode();
29120
29121 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29122 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29123
29124 if (VT.isVector())
29125 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29126
29127 Op = Op.getOperand(0);
29128 if (VT == MVT::i8) {
29129 // Zero extend to i32 since there is not an i8 bsr.
29130 OpVT = MVT::i32;
29131 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29132 }
29133
29134 // Check if we can safely pass a result through BSR for zero sources.
29135 SDValue PassThru = DAG.getUNDEF(OpVT);
29136 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29137 !DAG.isKnownNeverZero(Op))
29138 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29139
29140 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29141 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29142 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29143
29144 // Skip CMOV if we're using a pass through value.
29145 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29146 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29147 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29148 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29149 Op.getValue(1)};
29150 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29151 }
29152
29153 // Finally xor with NumBits-1.
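// BSR returns the index of the highest set bit, and because that index lies in
// [0, NumBits-1], (NumBits - 1) - Index == (NumBits - 1) ^ Index. E.g. a 32-bit
// value whose top set bit is at index 28 yields 31 ^ 28 == 3 leading zeros, and
// the zero-input pass-through 2*NumBits-1 becomes NumBits after the XOR.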
29154 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29155 DAG.getConstant(NumBits - 1, dl, OpVT));
29156
29157 if (VT == MVT::i8)
29158 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29159 return Op;
29160}
29161
29162static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29163 SelectionDAG &DAG) {
29164 MVT VT = Op.getSimpleValueType();
29165 unsigned NumBits = VT.getScalarSizeInBits();
29166 SDValue N0 = Op.getOperand(0);
29167 SDLoc dl(Op);
29168 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29169
29170 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29171 "Only scalar CTTZ requires custom lowering");
29172
29173 // Check if we can safely pass a result through BSF for zero sources.
29174 SDValue PassThru = DAG.getUNDEF(VT);
29175 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29176 PassThru = DAG.getConstant(NumBits, dl, VT);
29177
29178 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29179 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29180 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29181
29182 // Skip CMOV if src is never zero or we're using a pass through value.
29183 if (NonZeroSrc || !PassThru.isUndef())
29184 return Op;
29185
29186 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29187 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29188 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29189 Op.getValue(1)};
29190 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29191}
29192
29193 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Op.getSimpleValueType();
29196 SDLoc DL(Op);
29197
29198 if (VT == MVT::i16 || VT == MVT::i32)
29199 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29200
29201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29202 return splitVectorIntBinary(Op, DAG, DL);
29203
29204 assert(Op.getSimpleValueType().is256BitVector() &&
29205 Op.getSimpleValueType().isInteger() &&
29206 "Only handle AVX 256-bit vector integer operation");
29207 return splitVectorIntBinary(Op, DAG, DL);
29208}
29209
29210 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29211 const X86Subtarget &Subtarget) {
29212 MVT VT = Op.getSimpleValueType();
29213 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29214 unsigned Opcode = Op.getOpcode();
29215 SDLoc DL(Op);
29216
29217 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29218 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29219 assert(Op.getSimpleValueType().isInteger() &&
29220 "Only handle AVX vector integer operation");
29221 return splitVectorIntBinary(Op, DAG, DL);
29222 }
29223
29224 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29226 EVT SetCCResultType =
29227 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29228
29229 unsigned BitWidth = VT.getScalarSizeInBits();
29230 if (Opcode == ISD::USUBSAT) {
29231 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29232 // Handle a special-case with a bit-hack instead of cmp+select:
29233 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29234 // If the target can use VPTERNLOG, DAGToDAG will match this as
29235 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29236 // "broadcast" constant load.
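// For example, with i8 elements SMIN is 0x80: X = 0x90 gives (X ^ 0x80) = 0x10
// and (X s>> 7) = all-ones, so the result is 0x10 = 0x90 - 0x80; X = 0x20 gives
// (X s>> 7) = 0, so the result saturates to 0.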
29237 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29238 if (C && C->getAPIntValue().isSignMask()) {
29239 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29240 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29241 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29242 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29243 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29244 }
29245 }
29246 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29247 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29248 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29249 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29250 // TODO: Move this to DAGCombiner?
29251 if (SetCCResultType == VT &&
29252 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29253 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29254 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29255 }
29256 }
29257
29258 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29259 (!VT.isVector() || VT == MVT::v2i64)) {
29260 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29261 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29262 SDValue Zero = DAG.getConstant(0, DL, VT);
29263 SDValue Result =
29264 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29265 DAG.getVTList(VT, SetCCResultType), X, Y);
29266 SDValue SumDiff = Result.getValue(0);
29267 SDValue Overflow = Result.getValue(1);
29268 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29269 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29270 SDValue SumNeg =
29271 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
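// On overflow the wrapped SumDiff has the opposite sign of the exact result,
// so a negative SumDiff means we overflowed towards +inf and must saturate to
// SatMax, while a non-negative SumDiff selects SatMin.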
29272 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29273 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29274 }
29275
29276 // Use default expansion.
29277 return SDValue();
29278}
29279
29280static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29281 SelectionDAG &DAG) {
29282 MVT VT = Op.getSimpleValueType();
29283 SDLoc DL(Op);
29284
29285 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29286 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29287 // 8-bit integer abs to NEG and CMOV.
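// Compute 0 - x and select on the resulting flags: when the subtraction leaves
// the sign flag clear (x <= 0 and x != INT_MIN) the negated value is chosen,
// otherwise x itself, so abs(INT_MIN) wraps to INT_MIN as expected.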
29288 SDValue N0 = Op.getOperand(0);
29289 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29290 DAG.getConstant(0, DL, VT), N0);
29291 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29292 SDValue(Neg.getNode(), 1)};
29293 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29294 }
29295
29296 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29297 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29298 SDValue Src = Op.getOperand(0);
29299 SDValue Neg = DAG.getNegative(Src, DL, VT);
29300 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29301 }
29302
29303 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29304 assert(VT.isInteger() &&
29305 "Only handle AVX 256-bit vector integer operation");
29306 return splitVectorIntUnary(Op, DAG, DL);
29307 }
29308
29309 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29310 return splitVectorIntUnary(Op, DAG, DL);
29311
29312 // Default to expand.
29313 return SDValue();
29314}
29315
29316static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29317 SelectionDAG &DAG) {
29318 MVT VT = Op.getSimpleValueType();
29319 SDLoc DL(Op);
29320
29321 // For AVX1 cases, split to use legal ops.
29322 if (VT.is256BitVector() && !Subtarget.hasInt256())
29323 return splitVectorIntBinary(Op, DAG, DL);
29324
29325 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29326 return splitVectorIntBinary(Op, DAG, DL);
29327
29328 // Default to expand.
29329 return SDValue();
29330}
29331
29332static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29333 SelectionDAG &DAG) {
29334 MVT VT = Op.getSimpleValueType();
29335 SDLoc DL(Op);
29336
29337 // For AVX1 cases, split to use legal ops.
29338 if (VT.is256BitVector() && !Subtarget.hasInt256())
29339 return splitVectorIntBinary(Op, DAG, DL);
29340
29341 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29342 return splitVectorIntBinary(Op, DAG, DL);
29343
29344 // Default to expand.
29345 return SDValue();
29346}
29347
29348 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29349 SelectionDAG &DAG) {
29350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29351 EVT VT = Op.getValueType();
29352 SDValue X = Op.getOperand(0);
29353 SDValue Y = Op.getOperand(1);
29354 SDLoc DL(Op);
29355 bool IsMaxOp =
29356 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29357 bool IsNum =
29358 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29359 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29360 unsigned Opc = 0;
29361 if (VT.isVector())
29362 Opc = X86ISD::VMINMAX;
29363 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29364 Opc = X86ISD::VMINMAXS;
29365
29366 if (Opc) {
29367 SDValue Imm =
29368 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29369 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29370 }
29371 }
29372
29373 uint64_t SizeInBits = VT.getScalarSizeInBits();
29374 APInt PreferredZero = APInt::getZero(SizeInBits);
29375 APInt OppositeZero = PreferredZero;
29376 EVT IVT = VT.changeTypeToInteger();
29377 X86ISD::NodeType MinMaxOp;
29378 if (IsMaxOp) {
29379 MinMaxOp = X86ISD::FMAX;
29380 OppositeZero.setSignBit();
29381 } else {
29382 PreferredZero.setSignBit();
29383 MinMaxOp = X86ISD::FMIN;
29384 }
29385 EVT SetCCType =
29386 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29387
29388 // The tables below show the expected result of Max in cases of NaN and
29389 // signed zeros.
29390 //
29391 // Y Y
29392 // Num xNaN +0 -0
29393 // --------------- ---------------
29394 // Num | Max | Y | +0 | +0 | +0 |
29395 // X --------------- X ---------------
29396 // xNaN | X | X/Y | -0 | +0 | -0 |
29397 // --------------- ---------------
29398 //
29399 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29400 // reordering.
29401 //
29402 // We check if any of the operands is NaN and return NaN. Then we check if any
29403 // of the operands is zero or negative zero (for fmaximum and fminimum
29404 // respectively) to ensure the correct zero is returned.
29405 auto MatchesZero = [](SDValue Op, APInt Zero) {
29406 Op = peekThroughBitcasts(Op);
29407 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29408 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29409 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29410 return CstOp->getAPIntValue() == Zero;
29411 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29412 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29413 for (const SDValue &OpVal : Op->op_values()) {
29414 if (OpVal.isUndef())
29415 continue;
29416 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29417 if (!CstOp)
29418 return false;
29419 if (!CstOp->getValueAPF().isZero())
29420 continue;
29421 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29422 return false;
29423 }
29424 return true;
29425 }
29426 return false;
29427 };
29428
29429 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29430 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29431 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29432 Op->getFlags().hasNoSignedZeros() ||
29433 DAG.isKnownNeverZeroFloat(X) ||
29434 DAG.isKnownNeverZeroFloat(Y);
29435 SDValue NewX, NewY;
29436 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29437 MatchesZero(X, OppositeZero)) {
29438 // Operands are already in right order or order does not matter.
29439 NewX = X;
29440 NewY = Y;
29441 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29442 NewX = Y;
29443 NewY = X;
29444 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29445 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29446 if (IsXNeverNaN)
29447 std::swap(X, Y);
29448 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29449 // to an xmm register.
29450 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29451 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29452 // Bits of classes:
29453 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29454 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
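// Per the class bits above, imm 0b011 tests QNaN | PosZero and imm 0b101 tests
// QNaN | NegZero, so NeedSwap is set when X falls into one of those classes and
// the selects below then move X into the second operand.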
29455 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29456 DL, MVT::i32);
29457 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29458 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29459 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29460 DAG.getVectorIdxConstant(0, DL));
29461 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29462 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29463 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29464 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29465 } else {
29466 SDValue IsXSigned;
29467 if (Subtarget.is64Bit() || VT != MVT::f64) {
29468 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29469 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29470 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29471 } else {
29472 assert(VT == MVT::f64);
29473 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29474 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29475 DAG.getVectorIdxConstant(0, DL));
29476 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29477 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29478 DAG.getVectorIdxConstant(1, DL));
29479 Hi = DAG.getBitcast(MVT::i32, Hi);
29480 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29481 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29482 *DAG.getContext(), MVT::i32);
29483 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29484 }
29485 if (MinMaxOp == X86ISD::FMAX) {
29486 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29487 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29488 } else {
29489 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29490 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29491 }
29492 }
29493
29494 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29495 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29496
29497 // If we did not have to reorder the operands for signed-zero handling, we
29498 // still need to handle NaN, and we know that one of the operands is not NaN;
29499 // then:
29500 // - For minimum/maximum, put it in the first operand,
29501 // - For minimumnum/maximumnum, put it in the second operand,
29502 // and we will not need to post-process NaN after the max/min.
29502 if (IgnoreSignedZero && !IgnoreNaN &&
29503 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29504 std::swap(NewX, NewY);
29505
29506 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29507
29508 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29509 return MinMax;
29510
29511 if (DAG.isKnownNeverNaN(NewX))
29512 NewX = NewY;
29513
29514 SDValue IsNaN =
29515 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29516
29517 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29518}
29519
29520static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29521 SelectionDAG &DAG) {
29522 MVT VT = Op.getSimpleValueType();
29523 SDLoc dl(Op);
29524
29525 // For AVX1 cases, split to use legal ops.
29526 if (VT.is256BitVector() && !Subtarget.hasInt256())
29527 return splitVectorIntBinary(Op, DAG, dl);
29528
29529 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29530 return splitVectorIntBinary(Op, DAG, dl);
29531
29532 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29534
29535 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29536 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29537 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29538
29539 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29541 if (VT.bitsGE(MVT::i32)) {
29542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29543 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29544 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29545 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29546 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29547 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29548 DAG.getTargetConstant(CC, dl, MVT::i8),
29549 Diff1.getValue(1));
29550 }
29551
29552 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29553 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29554 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29555 MVT WideVT = MVT::getIntegerVT(WideBits);
29556 if (TLI.isTypeLegal(WideVT)) {
29557 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29558 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29559 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29560 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29561 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29562 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29563 DAG.getTargetConstant(CC, dl, MVT::i8),
29564 Diff1.getValue(1));
29565 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29566 }
29567 }
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 SDLoc dl(Op);
29576 MVT VT = Op.getSimpleValueType();
29577
29578 // Decompose 256-bit ops into 128-bit ops.
29579 if (VT.is256BitVector() && !Subtarget.hasInt256())
29580 return splitVectorIntBinary(Op, DAG, dl);
29581
29582 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29583 return splitVectorIntBinary(Op, DAG, dl);
29584
29585 SDValue A = Op.getOperand(0);
29586 SDValue B = Op.getOperand(1);
29587
29588 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29589 // vector pairs, multiply and truncate.
29590 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29591 unsigned NumElts = VT.getVectorNumElements();
29592 unsigned NumLanes = VT.getSizeInBits() / 128;
29593 unsigned NumEltsPerLane = NumElts / NumLanes;
29594
29595 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29596 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29597 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29598 return DAG.getNode(
29599 ISD::TRUNCATE, dl, VT,
29600 DAG.getNode(ISD::MUL, dl, ExVT,
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29602 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29603 }
29604
29605 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29606
29607 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29608 // Don't do this if we only need to unpack one half.
29609 if (Subtarget.hasSSSE3()) {
29610 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29611 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29612 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29613 if (BIsBuildVector) {
29614 for (auto [Idx, Val] : enumerate(B->ops())) {
29615 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29616 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29617 else
29618 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29619 }
29620 }
29621 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29622 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29623 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29624 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29625 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29626 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29627 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29628 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29629 DAG.getTargetConstant(8, dl, MVT::i8));
29630 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29631 }
29632 }
29633
29634 // Extract the lo/hi parts to any extend to i16.
29635 // We're going to mask off the low byte of each result element of the
29636 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29637 // element.
29638 SDValue Undef = DAG.getUNDEF(VT);
29639 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29640 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29641
29642 SDValue BLo, BHi;
29643 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29644 // If the RHS is a constant, manually unpackl/unpackh.
29645 SmallVector<SDValue, 16> LoOps, HiOps;
29646 for (unsigned i = 0; i != NumElts; i += 16) {
29647 for (unsigned j = 0; j != 8; ++j) {
29648 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29649 MVT::i16));
29650 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29651 MVT::i16));
29652 }
29653 }
29654
29655 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29656 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29657 } else {
29658 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29659 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29660 }
29661
29662 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29663 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29664 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29665 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29666 }
29667
29668 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29669 if (VT == MVT::v4i32) {
29670 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29671 "Should not custom lower when pmulld is available!");
29672
29673 // Extract the odd parts.
29674 static const int UnpackMask[] = {1, 1, 3, 3};
29675 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29676 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29677
29678 // Multiply the even parts.
29679 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29680 DAG.getBitcast(MVT::v2i64, A),
29681 DAG.getBitcast(MVT::v2i64, B));
29682 // Now multiply odd parts.
29683 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29684 DAG.getBitcast(MVT::v2i64, Aodds),
29685 DAG.getBitcast(MVT::v2i64, Bodds));
29686
29687 Evens = DAG.getBitcast(VT, Evens);
29688 Odds = DAG.getBitcast(VT, Odds);
29689
29690 // Merge the two vectors back together with a shuffle. This expands into 2
29691 // shuffles.
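// With Evens = <lo(ae),hi(ae),lo(cg),hi(cg)> and Odds = <lo(bf),hi(bf),lo(dh),
// hi(dh)>, the mask {0,4,2,6} picks the low dwords, giving <ae,bf,cg,dh>
// truncated to 32 bits.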
29692 static const int ShufMask[] = { 0, 4, 2, 6 };
29693 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29694 }
29695
29696 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29697 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29698 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29699
29700 // Ahi = psrlqi(a, 32);
29701 // Bhi = psrlqi(b, 32);
29702 //
29703 // AloBlo = pmuludq(a, b);
29704 // AloBhi = pmuludq(a, Bhi);
29705 // AhiBlo = pmuludq(Ahi, b);
29706 //
29707 // Hi = psllqi(AloBhi + AhiBlo, 32);
29708 // return AloBlo + Hi;
29709 KnownBits AKnown = DAG.computeKnownBits(A);
29710 KnownBits BKnown = DAG.computeKnownBits(B);
29711
29712 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29713 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29714 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29715
29716 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29717 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29718 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29719
29720 SDValue Zero = DAG.getConstant(0, dl, VT);
29721
29722 // Only multiply lo/hi halves that aren't known to be zero.
29723 SDValue AloBlo = Zero;
29724 if (!ALoIsZero && !BLoIsZero)
29725 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29726
29727 SDValue AloBhi = Zero;
29728 if (!ALoIsZero && !BHiIsZero) {
29729 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29730 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29731 }
29732
29733 SDValue AhiBlo = Zero;
29734 if (!AHiIsZero && !BLoIsZero) {
29735 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29736 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29737 }
29738
29739 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29740 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29741
29742 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29743}
29744
29745 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29746 MVT VT, bool IsSigned,
29747 const X86Subtarget &Subtarget,
29748 SelectionDAG &DAG,
29749 SDValue *Low = nullptr) {
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29753 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29754 // lane results back together.
29755
29756 // We'll take different approaches for signed and unsigned.
29757 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29758 // and use pmullw to calculate the full 16-bit product.
29759 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29760 // shift them left into the upper byte of each word. This allows us to use
29761 // pmulhw to calculate the full 16-bit product. This trick means we don't
29762 // need to sign extend the bytes to use pmullw.
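// The signed trick works because (a << 8) * (b << 8) == (a * b) << 16, so
// pmulhw on the byte-in-high-half words returns exactly the 16-bit product
// a * b (which always fits, since |a|, |b| <= 128).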
29763
29764 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29765 SDValue Zero = DAG.getConstant(0, dl, VT);
29766
29767 SDValue ALo, AHi;
29768 if (IsSigned) {
29769 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29770 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29771 } else {
29772 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29773 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29774 }
29775
29776 SDValue BLo, BHi;
29777 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29778 // If the RHS is a constant, manually unpackl/unpackh and extend.
29779 SmallVector<SDValue, 16> LoOps, HiOps;
29780 for (unsigned i = 0; i != NumElts; i += 16) {
29781 for (unsigned j = 0; j != 8; ++j) {
29782 SDValue LoOp = B.getOperand(i + j);
29783 SDValue HiOp = B.getOperand(i + j + 8);
29784
29785 if (IsSigned) {
29786 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29787 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29788 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29789 DAG.getConstant(8, dl, MVT::i16));
29790 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29791 DAG.getConstant(8, dl, MVT::i16));
29792 } else {
29793 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29794 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29795 }
29796
29797 LoOps.push_back(LoOp);
29798 HiOps.push_back(HiOp);
29799 }
29800 }
29801
29802 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29803 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29804 } else if (IsSigned) {
29805 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29806 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29807 } else {
29808 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29809 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29810 }
29811
29812 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29813 // pack back to vXi8.
29814 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29815 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29816 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29817
29818 if (Low)
29819 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29820
29821 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29822}
29823
29824static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29825 SelectionDAG &DAG) {
29826 SDLoc dl(Op);
29827 MVT VT = Op.getSimpleValueType();
29828 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29829 unsigned NumElts = VT.getVectorNumElements();
29830 SDValue A = Op.getOperand(0);
29831 SDValue B = Op.getOperand(1);
29832
29833 // Decompose 256-bit ops into 128-bit ops.
29834 if (VT.is256BitVector() && !Subtarget.hasInt256())
29835 return splitVectorIntBinary(Op, DAG, dl);
29836
29837 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29838 return splitVectorIntBinary(Op, DAG, dl);
29839
29840 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29841 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29842 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29843 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29844
29845 // PMULxD operations multiply each even value (starting at 0) of LHS with
29846 // the corresponding value of RHS and produce a widened result.
29847 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 //
29850 // In other words, to have all the results, we need to perform two PMULxD:
29851 // 1. one with the even values.
29852 // 2. one with the odd values.
29853 // To achieve #2, we need to place the odd values at even positions.
29854 //
29855 // Place the odd value at an even position (basically, shift all values 1
29856 // step to the left):
29857 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29858 9, -1, 11, -1, 13, -1, 15, -1};
29859 // <a|b|c|d> => <b|undef|d|undef>
29860 SDValue Odd0 =
29861 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29862 // <e|f|g|h> => <f|undef|h|undef>
29863 SDValue Odd1 =
29864 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29865
29866 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29867 // ints.
29868 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29869 unsigned Opcode =
29870 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29871 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29872 // => <2 x i64> <ae|cg>
29873 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29874 DAG.getBitcast(MulVT, A),
29875 DAG.getBitcast(MulVT, B)));
29876 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29877 // => <2 x i64> <bf|dh>
29878 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29879 DAG.getBitcast(MulVT, Odd0),
29880 DAG.getBitcast(MulVT, Odd1)));
29881
29882 // Shuffle it back into the right order.
29883 SmallVector<int, 16> ShufMask(NumElts);
29884 for (int i = 0; i != (int)NumElts; ++i)
29885 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
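// E.g. for NumElts == 4 this builds the mask {1, 5, 3, 7}, i.e. the high dword
// of each 64-bit product, taken alternately from Mul1 and Mul2.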
29886
29887 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29888
29889 // If we have a signed multiply but no PMULDQ, fix up the result of the
29890 // unsigned multiply.
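// This is the standard fixup mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
// - (b < 0 ? a : 0), implemented with compare-against-zero masks.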
29891 if (IsSigned && !Subtarget.hasSSE41()) {
29892 SDValue Zero = DAG.getConstant(0, dl, VT);
29893 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29894 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29895 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29896 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29897
29898 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29899 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29900 }
29901
29902 return Res;
29903 }
29904
29905 // Only i8 vectors should need custom lowering after this.
29906 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29907 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29908 "Unsupported vector type");
29909
29910 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29911 // logical shift down the upper half and pack back to i8.
29912
29913 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29914 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29915
29916 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29917 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29918 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29919 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29920 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29921 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29922 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29923 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29924 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29925 }
29926
29927 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29928}
29929
29930// Custom lowering for SMULO/UMULO.
29931static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29932 SelectionDAG &DAG) {
29933 MVT VT = Op.getSimpleValueType();
29934
29935 // Scalars defer to LowerXALUO.
29936 if (!VT.isVector())
29937 return LowerXALUO(Op, DAG);
29938
29939 SDLoc dl(Op);
29940 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29941 SDValue A = Op.getOperand(0);
29942 SDValue B = Op.getOperand(1);
29943 EVT OvfVT = Op->getValueType(1);
29944
29945 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29946 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29947 // Extract the LHS Lo/Hi vectors
29948 SDValue LHSLo, LHSHi;
29949 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29950
29951 // Extract the RHS Lo/Hi vectors
29952 SDValue RHSLo, RHSHi;
29953 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29954
29955 EVT LoOvfVT, HiOvfVT;
29956 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29957 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29958 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29959
29960 // Issue the split operations.
29961 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29962 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29963
29964 // Join the separate data results and the overflow results.
29965 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29966 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29967 Hi.getValue(1));
29968
29969 return DAG.getMergeValues({Res, Ovf}, dl);
29970 }
29971
29972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29973 EVT SetccVT =
29974 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29975
29976 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29977 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29978 unsigned NumElts = VT.getVectorNumElements();
29979 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29980 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29981 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29982 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29983 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29984
29985 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29986
29987 SDValue Ovf;
29988 if (IsSigned) {
29989 SDValue High, LowSign;
29990 if (OvfVT.getVectorElementType() == MVT::i1 &&
29991 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29992 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29993 // Shift the high down filling with sign bits.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29995 // Fill all 16 bits with the sign bit from the low.
29996 LowSign =
29997 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29998 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29999 15, DAG);
30000 SetccVT = OvfVT;
30001 if (!Subtarget.hasBWI()) {
30002 // We can't do a vXi16 compare so sign extend to v16i32.
30003 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30004 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30005 }
30006 } else {
30007 // Otherwise do the compare at vXi8.
30008 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30009 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30010 LowSign =
30011 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30012 }
30013
30014 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30015 } else {
30016 SDValue High =
30017 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30018 if (OvfVT.getVectorElementType() == MVT::i1 &&
30019 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30020 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30021 SetccVT = OvfVT;
30022 if (!Subtarget.hasBWI()) {
30023 // We can't do a vXi16 compare so sign extend to v16i32.
30024 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30025 }
30026 } else {
30027 // Otherwise do the compare at vXi8.
30028 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30029 }
30030
30031 Ovf =
30032 DAG.getSetCC(dl, SetccVT, High,
30033 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30034 }
30035
30036 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30037
30038 return DAG.getMergeValues({Low, Ovf}, dl);
30039 }
30040
30041 SDValue Low;
30042 SDValue High =
30043 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30044
30045 SDValue Ovf;
30046 if (IsSigned) {
30047 // SMULO overflows if the high bits don't match the sign of the low.
30048 SDValue LowSign =
30049 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30050 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30051 } else {
30052 // UMULO overflows if the high bits are non-zero.
30053 Ovf =
30054 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30055 }
30056
30057 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30058
30059 return DAG.getMergeValues({Low, Ovf}, dl);
30060}
30061
30062SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30063 assert(Subtarget.isTargetWin64() && "Unexpected target");
30064 EVT VT = Op.getValueType();
30065 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30066 "Unexpected return type for lowering");
30067
30068 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30069 SmallVector<SDValue> Result;
30070 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30071 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30072 }
30073
30074 RTLIB::Libcall LC;
30075 bool isSigned;
30076 switch (Op->getOpcode()) {
30077 // clang-format off
30078 default: llvm_unreachable("Unexpected request for libcall!");
30079 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30080 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30081 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30082 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30083 // clang-format on
30084 }
30085
30086 SDLoc dl(Op);
30087 SDValue InChain = DAG.getEntryNode();
30088
30090 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30091 EVT ArgVT = Op->getOperand(i).getValueType();
30092 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30093 "Unexpected argument type for lowering");
30094 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30095 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30096 MachinePointerInfo MPI =
30097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30098 InChain =
30099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30100 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30101 }
30102
30103 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30104 getPointerTy(DAG.getDataLayout()));
30105
30106 TargetLowering::CallLoweringInfo CLI(DAG);
30107 CLI.setDebugLoc(dl)
30108 .setChain(InChain)
30109 .setLibCallee(
30110 getLibcallCallingConv(LC),
30111 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30112 std::move(Args))
30113 .setInRegister()
30114 .setSExtResult(isSigned)
30115 .setZExtResult(!isSigned);
30116
30117 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30118 return DAG.getBitcast(VT, CallInfo.first);
30119}
30120
30121SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30122 SelectionDAG &DAG,
30123 SDValue &Chain) const {
30124 assert(Subtarget.isTargetWin64() && "Unexpected target");
30125 EVT VT = Op.getValueType();
30126 bool IsStrict = Op->isStrictFPOpcode();
30127
30128 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30129 EVT ArgVT = Arg.getValueType();
30130
30131 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30132 "Unexpected return type for lowering");
30133
30134 RTLIB::Libcall LC;
30135 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30136 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30137 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30138 else
30139 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30140 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30141
30142 SDLoc dl(Op);
30143 MakeLibCallOptions CallOptions;
30144 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30145
30147 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30148 // expected VT (i128).
30149 std::tie(Result, Chain) =
30150 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30151 Result = DAG.getBitcast(VT, Result);
30152 return Result;
30153}
30154
30155SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30156 SelectionDAG &DAG) const {
30157 assert(Subtarget.isTargetWin64() && "Unexpected target");
30158 EVT VT = Op.getValueType();
30159 bool IsStrict = Op->isStrictFPOpcode();
30160
30161 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30162 EVT ArgVT = Arg.getValueType();
30163
30164 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30165 "Unexpected argument type for lowering");
30166
30167 RTLIB::Libcall LC;
30168 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30169 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30170 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30171 else
30172 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30173 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30174
30175 SDLoc dl(Op);
30176 MakeLibCallOptions CallOptions;
30177 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30178
30179 // Pass the i128 argument as an indirect argument on the stack.
30180 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30181 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30182 MachinePointerInfo MPI =
30183 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30184 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30185
30186 SDValue Result;
30187 std::tie(Result, Chain) =
30188 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30189 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30190}
30191
30192// Return true if the required (according to Opcode) shift-imm form is natively
30193// supported by the Subtarget
30194static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30195 unsigned Opcode) {
30196 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30197 "Unexpected shift opcode");
30198
30199 if (!VT.isSimple())
30200 return false;
30201
30202 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30203 return false;
30204
30205 if (VT.getScalarSizeInBits() < 16)
30206 return false;
30207
30208 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30209 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30210 return true;
30211
30212 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30213 (VT.is256BitVector() && Subtarget.hasInt256());
30214
30215 bool AShift = LShift && (Subtarget.hasAVX512() ||
30216 (VT != MVT::v2i64 && VT != MVT::v4i64));
30217 return (Opcode == ISD::SRA) ? AShift : LShift;
30218}
30219
30220// The shift amount is a variable, but it is the same for all vector lanes.
30221// These instructions are defined together with shift-immediate.
30222static
30223 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30224 unsigned Opcode) {
30225 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30226}
30227
30228// Return true if the required (according to Opcode) variable-shift form is
30229// natively supported by the Subtarget
30230static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30231 unsigned Opcode) {
30232 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30233 "Unexpected shift opcode");
30234
30235 if (!VT.isSimple())
30236 return false;
30237
30238 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30239 return false;
30240
30241 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30242 return false;
30243
30244 // vXi16 supported only on AVX-512, BWI
30245 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30246 return false;
30247
30248 if (Subtarget.hasAVX512() &&
30249 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30250 return true;
30251
30252 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30253 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30254 return (Opcode == ISD::SRA) ? AShift : LShift;
30255}
30256
30257 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30258 const X86Subtarget &Subtarget) {
30259 MVT VT = Op.getSimpleValueType();
30260 SDLoc dl(Op);
30261 SDValue R = Op.getOperand(0);
30262 SDValue Amt = Op.getOperand(1);
30263 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30265
30266 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30267 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30268 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30269 SDValue Ex = DAG.getBitcast(ExVT, R);
30270
30271 // ashr(R, 63) === cmp_slt(R, 0)
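// i.e. an arithmetic shift right by 63 broadcasts the sign bit, yielding
// all-ones for negative values and zero otherwise, which is exactly what
// PCMPGT(0, R) computes.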
30272 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30273 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30274 "Unsupported PCMPGT op");
30275 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30276 }
30277
30278 if (ShiftAmt >= 32) {
30279 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30280 SDValue Upper =
30281 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30282 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30283 ShiftAmt - 32, DAG);
30284 if (VT == MVT::v2i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30286 if (VT == MVT::v4i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30288 {9, 1, 11, 3, 13, 5, 15, 7});
30289 } else {
30290 // SRA upper i32, SRL whole i64 and select lower i32.
30291 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30292 ShiftAmt, DAG);
30293 SDValue Lower =
30294 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30295 Lower = DAG.getBitcast(ExVT, Lower);
30296 if (VT == MVT::v2i64)
30297 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30298 if (VT == MVT::v4i64)
30299 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30300 {8, 1, 10, 3, 12, 5, 14, 7});
30301 }
30302 return DAG.getBitcast(VT, Ex);
30303 };
30304
30305 // Optimize shl/srl/sra with constant shift amount.
30306 APInt APIntShiftAmt;
30307 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30308 return SDValue();
30309
30310 // If the shift amount is out of range, return undef.
30311 if (APIntShiftAmt.uge(EltSizeInBits))
30312 return DAG.getUNDEF(VT);
30313
30314 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30315
30316 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30317 // Hardware support for vector shifts is sparse, which makes us scalarize the
30318 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30319 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30320 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30321 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30322 // must be 0). (add undef, undef) however can be any value. To make this
30323 // safe, we must freeze R to ensure that register allocation uses the same
30324 // register for an undefined value. This ensures that the result will
30325 // still be even and preserves the original semantics.
30326 R = DAG.getFreeze(R);
30327 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30328 }
30329
30330 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30331 }
30332
30333 // i64 SRA needs to be performed as partial shifts.
30334 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30335 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30336 Op.getOpcode() == ISD::SRA)
30337 return ArithmeticShiftRight64(ShiftAmt);
30338
30339 // If we're logical shifting an all-signbits value then we can just perform it as
30340 // a mask.
30341 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30342 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30343 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30344 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30345 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30346 }
30347
30348 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30349 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30350 unsigned NumElts = VT.getVectorNumElements();
30351 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30352
30353 // Simple i8 add case
30354 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30355 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30356 // must be 0). (add undef, undef) however can be any value. To make this
30357 // safe, we must freeze R to ensure that register allocation uses the same
30358 // register for an undefined value. This ensures that the result will
30359 // still be even and preserves the original semantics.
30360 R = DAG.getFreeze(R);
30361 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30362 }
30363
30364 // ashr(R, 7) === cmp_slt(R, 0)
30365 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30366 SDValue Zeros = DAG.getConstant(0, dl, VT);
30367 if (VT.is512BitVector()) {
30368 assert(VT == MVT::v64i8 && "Unexpected element type!");
30369 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30370 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30371 }
30372 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30373 }
30374
30375 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30376 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30377 return SDValue();
30378
30379 if (Subtarget.hasGFNI()) {
30380 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30381 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30382 DAG.getTargetConstant(0, dl, MVT::i8));
30383 }
30384
30385 if (Op.getOpcode() == ISD::SHL) {
30386 // Make a large shift.
30387 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30388 ShiftAmt, DAG);
30389 SHL = DAG.getBitcast(VT, SHL);
30390 // Zero out the rightmost bits.
30391 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30392 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30393 }
30394 if (Op.getOpcode() == ISD::SRL) {
30395 // Make a large shift.
30396 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30397 ShiftAmt, DAG);
30398 SRL = DAG.getBitcast(VT, SRL);
30399 // Zero out the leftmost bits.
30400 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30401 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30402 }
30403 if (Op.getOpcode() == ISD::SRA) {
30404 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
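// Worked example for ShiftAmt == 2: R = 0x84 (-124) gives lshr = 0x21 and
// Mask = 128 >> 2 = 0x20, so (0x21 ^ 0x20) - 0x20 = 0xE1 = -31 = ashr(-124, 2).
// The xor/sub pair re-creates the sign extension the logical shift dropped.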
30405 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30406
30407 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30408 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30409 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30410 return Res;
30411 }
30412 llvm_unreachable("Unknown shift opcode.");
30413 }
30414
30415 return SDValue();
30416}
30417
30418 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30419 const X86Subtarget &Subtarget) {
30420 MVT VT = Op.getSimpleValueType();
30421 SDLoc dl(Op);
30422 SDValue R = Op.getOperand(0);
30423 SDValue Amt = Op.getOperand(1);
30424 unsigned Opcode = Op.getOpcode();
30425 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30426
30427 int BaseShAmtIdx = -1;
30428 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30429 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30430 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30431 Subtarget, DAG);
30432
30433 // vXi8 shifts - shift as v8i16 + mask result.
30434 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30435 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30436 VT == MVT::v64i8) &&
30437 !Subtarget.hasXOP()) {
30438 unsigned NumElts = VT.getVectorNumElements();
30439 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30440 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30441 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30442 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30443
30444 // Create the mask using vXi16 shifts. For shift-rights we need to move
30445 // the upper byte down before splatting the vXi8 mask.
30446 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30447 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30448 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30449 if (Opcode != ISD::SHL)
30450 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30451 8, DAG);
30452 BitMask = DAG.getBitcast(VT, BitMask);
30453 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30454 SmallVector<int, 64>(NumElts, 0));
30455
30456 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30457 DAG.getBitcast(ExtVT, R), BaseShAmt,
30458 BaseShAmtIdx, Subtarget, DAG);
30459 Res = DAG.getBitcast(VT, Res);
30460 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30461
30462 if (Opcode == ISD::SRA) {
30463 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30464 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30465 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30466 SignMask =
30467 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30468 BaseShAmtIdx, Subtarget, DAG);
30469 SignMask = DAG.getBitcast(VT, SignMask);
30470 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30471 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30472 }
30473 return Res;
30474 }
30475 }
30476 }
30477
30478 return SDValue();
30479}
30480
30481// Convert a shift/rotate left amount to a multiplication scale factor.
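// e.g. a per-element shift amount of <2, 3, 0, 1> becomes the multiplier
// <4, 8, 1, 2>, so that (shl X, Amt) can instead be lowered as (mul X, Scale).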
30482 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30483 const X86Subtarget &Subtarget,
30484 SelectionDAG &DAG) {
30485 MVT VT = Amt.getSimpleValueType();
30486 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30487 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30488 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30489 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30490 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30491 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30492 return SDValue();
30493
30494 MVT SVT = VT.getVectorElementType();
30495 unsigned SVTBits = SVT.getSizeInBits();
30496 unsigned NumElems = VT.getVectorNumElements();
30497
30498 APInt UndefElts;
30499 SmallVector<APInt> EltBits;
30500 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30501 APInt One(SVTBits, 1);
30502 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30503 for (unsigned I = 0; I != NumElems; ++I) {
30504 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30505 continue;
30506 uint64_t ShAmt = EltBits[I].getZExtValue();
30507 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30508 }
30509 return DAG.getBuildVector(VT, dl, Elts);
30510 }
30511
30512 // If the target doesn't support variable shifts, use either FP conversion
30513 // or integer multiplication to avoid shifting each element individually.
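// For illustration of the FP trick below: an amount of 5 becomes
// (5 << 23) + 0x3f800000, the IEEE-754 bit pattern of 32.0f (exponent 127+5),
// and FP_TO_SINT then recovers 32 == 1 << 5 as the scale factor.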
30514 if (VT == MVT::v4i32) {
30515 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30516 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30517 DAG.getConstant(0x3f800000U, dl, VT));
30518 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30519 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30520 }
30521
30522 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30523 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30524 SDValue Z = DAG.getConstant(0, dl, VT);
30525 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30526 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30527 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30528 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30529 if (Subtarget.hasSSE41())
30530 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30531 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30532 }
30533
30534 return SDValue();
30535}
30536
30537static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30538 SelectionDAG &DAG) {
30539 MVT VT = Op.getSimpleValueType();
30540 SDLoc dl(Op);
30541 SDValue R = Op.getOperand(0);
30542 SDValue Amt = Op.getOperand(1);
30543 unsigned NumElts = VT.getVectorNumElements();
30544 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30545 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30546
30547 unsigned Opc = Op.getOpcode();
30548 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30549 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30550
30551 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30552 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30553
30554 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30555 return V;
30556
30557 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30558 return V;
30559
30560 if (supportedVectorVarShift(VT, Subtarget, Opc))
30561 return Op;
30562
30563 // i64 vector arithmetic shift can be emulated with the transform:
30564 // M = lshr(SIGN_MASK, Amt)
30565 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
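// M has a single bit set at position 63 - Amt, right where the original sign
// bit lands after the logical shift; xor-ing and subtracting M turns that bit
// into a sign extension of the upper Amt bits. e.g. Amt = 8, R = -2^32:
// lshr(R, 8) = 0x00FFFFFFFF000000, M = 0x0080000000000000, and
// (lshr ^ M) - M = 0xFFFFFFFFFF000000 = -2^24 = ashr(R, 8).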
30566 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30567 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30568 Opc == ISD::SRA) {
30569 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30570 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30571 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30572 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30573 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30574 return R;
30575 }
30576
30577 // XOP has 128-bit variable logical/arithmetic shifts.
30578 // +ve/-ve Amt = shift left/right.
30579 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30580 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30581 if (Opc == ISD::SRL || Opc == ISD::SRA)
30582 Amt = DAG.getNegative(Amt, dl, VT);
30583 if (Opc == ISD::SHL || Opc == ISD::SRL)
30584 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30585 if (Opc == ISD::SRA)
30586 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30587 }
30588
30589 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30590 // shifts per-lane and then shuffle the partial results back together.
30591 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30592 // Splat the shift amounts so the scalar shifts above will catch it.
30593 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30594 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30595 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30596 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30597 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30598 }
30599
30600 // Build a map of in-range constant amounts with an element mask of where they occur.
30601 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30602 if (ConstantAmt) {
30603 for (unsigned I = 0; I != NumElts; ++I) {
30604 SDValue A = Amt.getOperand(I);
30605 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30606 continue;
30607 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30608 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30609 if (!Inserted) {
30610 It->second.setBit(I);
30611 continue;
30612 }
30613 It->second = APInt::getOneBitSet(NumElts, I);
30614 }
30615 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30616 }
30617
30618 // If possible, lower this shift as a sequence of two shifts by
30619 // constant plus a BLENDing shuffle instead of scalarizing it.
30620 // Example:
30621 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30622 //
30623 // Could be rewritten as:
30624 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30625 //
30626 // The advantage is that the two shifts from the example would be
30627 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30628 if (UniqueCstAmt.size() == 2 &&
30629 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30630 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30631 unsigned AmtA = UniqueCstAmt.begin()->first;
30632 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30633 const APInt &MaskA = UniqueCstAmt.begin()->second;
30634 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30635 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30636 for (unsigned I = 0; I != NumElts; ++I) {
30637 if (MaskA[I])
30638 ShuffleMask[I] = I;
30639 if (MaskB[I])
30640 ShuffleMask[I] = I + NumElts;
30641 }
30642
30643 // Only perform this blend if we can perform it without loading a mask.
30644 if ((VT != MVT::v16i16 ||
30645 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30646 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30647 canWidenShuffleElements(ShuffleMask))) {
30648 SDValue Shift1 =
30649 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30650 SDValue Shift2 =
30651 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30652 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30653 }
30654 }
30655
30656 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30657 // using vYiM vector operations where X*N == Y*M and M > N.
30658 if (ConstantAmt &&
30659 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30660 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30661 !Subtarget.hasXOP()) {
30662 MVT NarrowScalarVT = VT.getScalarType();
30663 // We can do this extra fast if each pair of narrow elements is shifted by
30664 // the same amount, SWAR style: use a shift to move the valid
30665 // bits to the right position, then mask out any bits which crossed from one
30666 // element to the other.
30667 // This optimized lowering is only valid if the elements in a pair can
30668 // be treated identically.
30669 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30670 SmallVector<SDValue, 32> TmpAmtWideElts;
30671 int WideEltSizeInBits = EltSizeInBits;
30672 while (WideEltSizeInBits < 32) {
30673 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30674 // unprofitable.
30675 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30676 break;
30677 }
30678 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30679 bool SameShifts = true;
30680 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30681 unsigned DstI = SrcI / 2;
30682 // Both elements are undef? Make a note and keep going.
30683 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30684 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30685 continue;
30686 }
30687 // Even element is undef? We will shift it by the same shift amount as
30688 // the odd element.
30689 if (AmtWideElts[SrcI].isUndef()) {
30690 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30691 continue;
30692 }
30693 // Odd element is undef? We will shift it by the same shift amount as
30694 // the even element.
30695 if (AmtWideElts[SrcI + 1].isUndef()) {
30696 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30697 continue;
30698 }
30699 // Both elements are equal.
30700 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30701 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30702 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30703 continue;
30704 }
30705 // One of the provisional wide elements will not have the same shift
30706 // amount. Let's bail.
30707 SameShifts = false;
30708 break;
30709 }
30710 if (!SameShifts) {
30711 break;
30712 }
30713 WideEltSizeInBits *= 2;
30714 std::swap(TmpAmtWideElts, AmtWideElts);
30715 }
30716 APInt APIntShiftAmt;
30717 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30718 bool Profitable = WidenShift;
30719 // AVX512BW brings support for vpsllvw.
30720 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30721 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30722 Profitable = false;
30723 }
30724 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30725 // fairly cheaply in other ways.
30726 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30727 Profitable = false;
30728 }
30729 // Leave it up to GFNI if we have it around.
30730 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30731 // is probably a win to use other strategies in some cases.
30732 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30733 Profitable = false;
30734 }
30735
30736 // AVX1 does not have vpand which makes our masking impractical. It does
30737 // have vandps but that is an FP instruction and crossing FP<->int typically
30738 // has some cost.
30739 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30740 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30741 Profitable = false;
30742 }
30743 unsigned WideNumElts = AmtWideElts.size();
30744 // We are only dealing with identical pairs.
30745 if (Profitable && WideNumElts != NumElts) {
30746 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30747 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30748 // Cast the operand to vXiM.
30749 SDValue RWide = DAG.getBitcast(WideVT, R);
30750 // Create our new vector of shift amounts.
30751 SDValue AmtWide = DAG.getBuildVector(
30752 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30753 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30754 // Perform the actual shift.
30755 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30756 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30757 // Now we need to construct a mask which will "drop" bits that get
30758 // shifted past the LSB/MSB. For a logical shift left, it will look
30759 // like:
30760 // FullMask = (1 << EltSizeInBits) - 1
30761 // Mask = FullMask << Amt
30762 //
30763 // This masking ensures that bits cannot migrate from one narrow lane to
30764 // another. The construction of this mask will be constant folded.
30765 // The mask for a logical right shift is nearly identical, the only
30766 // difference is that the all ones mask is shifted right instead of left.
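// e.g. a vXi8 shl by 3 performed on vXi16 lanes: each byte keeps only
// 0xFF << 3 = 0xF8, clearing the three bits that crossed into its neighbour.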
30767 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30768 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30769 Mask = DAG.getBitcast(WideVT, Mask);
30770 // Finally, we mask the shifted vector with the SWAR mask.
30771 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30772 Masked = DAG.getBitcast(VT, Masked);
30773 if (Opc != ISD::SRA) {
30774 // Logical shifts are complete at this point.
30775 return Masked;
30776 }
30777 // At this point, we have done a *logical* shift right. We now need to
30778 // sign extend the result so that we get behavior equivalent to an
30779 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30780 // are `EltSizeInBits-AmtWide` bits wide.
30781 //
30782 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30783 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30784 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30785 // can use the following trick to accomplish this:
30786 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30787 // (Masked ^ SignBitMask) - SignBitMask
30788 //
30789 // When the sign bit is already clear, this will compute:
30790 // Masked + SignBitMask - SignBitMask
30791 //
30792 // This is equal to Masked which is what we want: the sign bit was clear
30793 // so sign extending should be a no-op.
30794 //
30795 // When the sign bit is set, this will compute:
30796 // Masked - SignBitmask - SignBitMask
30797 //
30798 // This is equal to Masked - 2*SignBitMask which will correctly sign
30799 // extend our result.
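// Worked example for an i8 lane with AmtWide = 3 and original value 0x90
// (-112): the logical shift leaves 0x12, SignBitMask = 1 << (8 - 3 - 1) =
// 0x10, and (0x12 ^ 0x10) - 0x10 = 0xF2 = -14 = ashr(-112, 3).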
30800 SDValue SplatHighBit =
30801 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30802 // This does not induce recursion, all operands are constants.
30803 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30804 SDValue FlippedSignBit =
30805 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30806 SDValue Subtraction =
30807 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30808 return Subtraction;
30809 }
30810 }
30811
30812 // If possible, lower this packed shift into a vector multiply instead of
30813 // expanding it into a sequence of scalar shifts.
30814 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30815 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30816 Subtarget.canExtendTo512BW())))
30817 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30818 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30819
30820 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30821 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
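// e.g. for vXi16, srl(X, 3) == mulhu(X, 1 << 13) since (X * 2^13) >> 16 is
// X >> 3; the Amt == 0 lanes are handled by the select below because the
// scale 1 << 16 does not fit in an i16 element.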
30822 if (Opc == ISD::SRL && ConstantAmt &&
30823 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30824 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30825 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30826 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30827 SDValue Zero = DAG.getConstant(0, dl, VT);
30828 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30829 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30830 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30831 }
30832 }
30833
30834 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30835 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30836 // TODO: Special case handling for shift by 0/1, really we can afford either
30837 // of these cases in pre-SSE41/XOP/AVX512 but not both.
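// e.g. mulhs(X, 1 << 13) == (X *s 2^13) >> 16 == X >>s 3, which holds for
// 2 <= Amt <= 15; Amt == 1 needs the VSRAI fallback below because the scale
// 1 << 15 is negative as an i16, and Amt == 0 selects R unchanged.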
30838 if (Opc == ISD::SRA && ConstantAmt &&
30839 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30840 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30841 !Subtarget.hasAVX512()) ||
30842 DAG.isKnownNeverZero(Amt))) {
30843 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30844 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30845 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30846 SDValue Amt0 =
30847 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30848 SDValue Amt1 =
30849 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30850 SDValue Sra1 =
30851 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30852 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30853 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30854 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30855 }
30856 }
30857
30858 // v4i32 Non Uniform Shifts.
30859 // If the shift amount is constant we can shift each lane using the SSE2
30860 // immediate shifts, else we need to zero-extend each lane to the lower i64
30861 // and shift using the SSE2 variable shifts.
30862 // The separate results can then be blended together.
30863 if (VT == MVT::v4i32) {
30864 SDValue Amt0, Amt1, Amt2, Amt3;
30865 if (ConstantAmt) {
30866 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30867 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30868 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30869 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30870 } else {
30871 // The SSE2 shifts use the lower i64 as the same shift amount for
30872 // all lanes and the upper i64 is ignored. On AVX we're better off
30873 // just zero-extending, but for SSE just duplicating the top 16-bits is
30874 // cheaper and has the same effect for out of range values.
30875 if (Subtarget.hasAVX()) {
30876 SDValue Z = DAG.getConstant(0, dl, VT);
30877 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30878 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30879 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30880 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30881 } else {
30882 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30883 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30884 {4, 5, 6, 7, -1, -1, -1, -1});
30885 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30886 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30887 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30888 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30889 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30890 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30891 }
30892 }
30893
30894 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30895 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30896 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30897 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30898 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30899
30900 // Merge the shifted lane results optimally with/without PBLENDW.
30901 // TODO - ideally shuffle combining would handle this.
30902 if (Subtarget.hasSSE41()) {
30903 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30904 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30905 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30906 }
30907 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30908 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30909 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30910 }
30911
30912 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30913 // look up the pre-computed shift values.
30914 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30915 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30916 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30917 unsigned NumLanes = VT.getSizeInBits() / 128u;
30918 unsigned NumEltsPerLane = NumElts / NumLanes;
30919 SmallVector<APInt, 64> LUT;
30920 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30921 unsigned LoElt = Lane * NumEltsPerLane;
30922 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30923 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30924 if (!KnownLane.isConstant())
30925 break;
30926 const APInt &LaneSplat = KnownLane.getConstant();
30927 for (unsigned I = 0; I != 8; ++I) {
30928 if (Opc == ISD::SHL)
30929 LUT.push_back(LaneSplat.shl(I));
30930 else if (Opc == ISD::SRL)
30931 LUT.push_back(LaneSplat.lshr(I));
30932 else if (Opc == ISD::SRA)
30933 LUT.push_back(LaneSplat.ashr(I));
30934 }
30935 LUT.append(8, APInt::getZero(8));
30936 }
30937 if (LUT.size() == NumElts) {
30938 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30939 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30940 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30941 }
30942 }
30943
30944 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30945 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30946 // make the existing SSE solution better.
30947 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30948 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30949 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30950 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30951 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30952 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30953 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30954 "Unexpected vector type");
30955 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30956 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30957 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30958 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30959 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30960 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30961 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30962 }
30963
30964 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30965 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
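// e.g. a per-byte srl(X, 3): zero-extend each byte to i16, multiply by
// 1 << (8 - 3) = 32, and the high byte of the 16-bit product is X >> 3 (or
// X >>s 3 when sign-extending for SRA).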
30966 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30967 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30968 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30969 !Subtarget.hasXOP()) {
30970 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30971 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30972
30973 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30974 // isn't legal).
30975 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30976 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30977 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30978 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30980 "Constant build vector expected");
30981
30982 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30983 bool IsSigned = Opc == ISD::SRA;
30984 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30985 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30986 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30987 return DAG.getZExtOrTrunc(R, dl, VT);
30988 }
30989
30990 SmallVector<SDValue, 16> LoAmt, HiAmt;
30991 for (unsigned i = 0; i != NumElts; i += 16) {
30992 for (int j = 0; j != 8; ++j) {
30993 LoAmt.push_back(Amt.getOperand(i + j));
30994 HiAmt.push_back(Amt.getOperand(i + j + 8));
30995 }
30996 }
30997
30998 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30999 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31000
31001 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31002 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31003 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31004 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31005 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31006 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31007 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31008 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31009 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31010 }
31011
31012 if (VT == MVT::v16i8 ||
31013 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31014 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31015 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31016
31017 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31018 if (VT.is512BitVector()) {
31019 // On AVX512BW targets we make use of the fact that VSELECT lowers
31020 // to a masked blend which selects bytes based just on the sign bit
31021 // extracted to a mask.
31022 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31023 V0 = DAG.getBitcast(VT, V0);
31024 V1 = DAG.getBitcast(VT, V1);
31025 Sel = DAG.getBitcast(VT, Sel);
31026 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31027 ISD::SETGT);
31028 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31029 } else if (Subtarget.hasSSE41()) {
31030 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31031 // on the sign bit.
31032 V0 = DAG.getBitcast(VT, V0);
31033 V1 = DAG.getBitcast(VT, V1);
31034 Sel = DAG.getBitcast(VT, Sel);
31035 return DAG.getBitcast(SelVT,
31036 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31037 }
31038 // On pre-SSE41 targets we test for the sign bit by comparing to
31039 // zero - a negative value will set all bits of the lanes to true
31040 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31041 SDValue Z = DAG.getConstant(0, dl, SelVT);
31042 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31043 return DAG.getSelect(dl, SelVT, C, V0, V1);
31044 };
31045
31046 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31047 // We can safely do this using i16 shifts as we're only interested in
31048 // the 3 lower bits of each byte.
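// e.g. amt = 5 (0b101): after the << 5 the bit worth 4 sits in the MSB, so
// the first blend takes the shift-by-4 result; doubling amt exposes the bit
// worth 2 (clear here, keep), then the bit worth 1 (set, take shift-by-1),
// for a total shift of 5.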
31049 Amt = DAG.getBitcast(ExtVT, Amt);
31050 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31051 Amt = DAG.getBitcast(VT, Amt);
31052
31053 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31054 // r = VSELECT(r, shift(r, 4), a);
31055 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31056 R = SignBitSelect(VT, Amt, M, R);
31057
31058 // a += a
31059 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31060
31061 // r = VSELECT(r, shift(r, 2), a);
31062 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31063 R = SignBitSelect(VT, Amt, M, R);
31064
31065 // a += a
31066 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31067
31068 // return VSELECT(r, shift(r, 1), a);
31069 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31070 R = SignBitSelect(VT, Amt, M, R);
31071 return R;
31072 }
31073
31074 if (Opc == ISD::SRA) {
31075 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31076 // so we can correctly sign extend. We don't care what happens to the
31077 // lower byte.
31078 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31079 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31080 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31081 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31082 ALo = DAG.getBitcast(ExtVT, ALo);
31083 AHi = DAG.getBitcast(ExtVT, AHi);
31084 RLo = DAG.getBitcast(ExtVT, RLo);
31085 RHi = DAG.getBitcast(ExtVT, RHi);
31086
31087 // r = VSELECT(r, shift(r, 4), a);
31088 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31089 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31090 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31091 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31092
31093 // a += a
31094 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31095 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31096
31097 // r = VSELECT(r, shift(r, 2), a);
31098 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31099 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31100 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31101 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31102
31103 // a += a
31104 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31105 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31106
31107 // r = VSELECT(r, shift(r, 1), a);
31108 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31109 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31110 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31111 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31112
31113 // Logical shift the result back to the lower byte, leaving a zero upper
31114 // byte meaning that we can safely pack with PACKUSWB.
31115 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31116 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31117 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31118 }
31119 }
31120
31121 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31122 MVT ExtVT = MVT::v8i32;
31123 SDValue Z = DAG.getConstant(0, dl, VT);
31124 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31125 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31126 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31127 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31128 ALo = DAG.getBitcast(ExtVT, ALo);
31129 AHi = DAG.getBitcast(ExtVT, AHi);
31130 RLo = DAG.getBitcast(ExtVT, RLo);
31131 RHi = DAG.getBitcast(ExtVT, RHi);
31132 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31133 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31134 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31135 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31136 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31137 }
31138
31139 if (VT == MVT::v8i16) {
31140 // If we have a constant shift amount, the non-SSE41 path is best as
31141 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31142 bool UseSSE41 = Subtarget.hasSSE41() &&
31143 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31144
31145 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31146 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31147 // the sign bit.
31148 if (UseSSE41) {
31149 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31150 V0 = DAG.getBitcast(ExtVT, V0);
31151 V1 = DAG.getBitcast(ExtVT, V1);
31152 Sel = DAG.getBitcast(ExtVT, Sel);
31153 return DAG.getBitcast(
31154 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31155 }
31156 // On pre-SSE41 targets we splat the sign bit - a negative value will
31157 // set all bits of the lanes to true and VSELECT uses that in
31158 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31159 SDValue C =
31160 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31161 return DAG.getSelect(dl, VT, C, V0, V1);
31162 };
31163
31164 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31165 if (UseSSE41) {
31166 // On SSE41 targets we need to replicate the shift mask in both
31167 // bytes for PBLENDVB.
31168 Amt = DAG.getNode(
31169 ISD::OR, dl, VT,
31170 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31171 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31172 } else {
31173 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31174 }
31175
31176 // r = VSELECT(r, shift(r, 8), a);
31177 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31178 R = SignBitSelect(Amt, M, R);
31179
31180 // a += a
31181 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31182
31183 // r = VSELECT(r, shift(r, 4), a);
31184 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31185 R = SignBitSelect(Amt, M, R);
31186
31187 // a += a
31188 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31189
31190 // r = VSELECT(r, shift(r, 2), a);
31191 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31192 R = SignBitSelect(Amt, M, R);
31193
31194 // a += a
31195 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31196
31197 // return VSELECT(r, shift(r, 1), a);
31198 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31199 R = SignBitSelect(Amt, M, R);
31200 return R;
31201 }
31202
31203 // Decompose 256-bit shifts into 128-bit shifts.
31204 if (VT.is256BitVector())
31205 return splitVectorIntBinary(Op, DAG, dl);
31206
31207 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31208 return splitVectorIntBinary(Op, DAG, dl);
31209
31210 return SDValue();
31211}
31212
31213 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31214 SelectionDAG &DAG) {
31215 MVT VT = Op.getSimpleValueType();
31216 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31217 "Unexpected funnel shift opcode!");
31218
31219 SDLoc DL(Op);
31220 SDValue Op0 = Op.getOperand(0);
31221 SDValue Op1 = Op.getOperand(1);
31222 SDValue Amt = Op.getOperand(2);
31223 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31224 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31225
31226 if (VT.isVector()) {
31227 APInt APIntShiftAmt;
31228 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31229 unsigned NumElts = VT.getVectorNumElements();
31230
31231 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31232 if (IsFSHR)
31233 std::swap(Op0, Op1);
31234
31235 if (IsCstSplat) {
31236 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31237 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31238 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31239 {Op0, Op1, Imm}, DAG, Subtarget);
31240 }
31241 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31242 {Op0, Op1, Amt}, DAG, Subtarget);
31243 }
31244 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31245 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31246 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31247 "Unexpected funnel shift type!");
31248
31249 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31250 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
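// e.g. vXi8 fshl(x = 0x12, y = 0x34, z = 4): unpacking forms the i16 value
// 0x1234, shifting left by 4 gives 0x2340, and the high byte 0x23 is the low
// nibble of x joined with the high nibble of y, i.e. fshl's result.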
31251 if (IsCstSplat) {
31252 // TODO: Can't use generic expansion as UNDEF amt elements can be
31253 // converted to other values when folded to shift amounts, losing the
31254 // splat.
31255 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31256 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31257 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31258 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31259 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31260
31261 if (EltSizeInBits == 8 &&
31262 (Subtarget.hasXOP() ||
31263 (useVPTERNLOG(Subtarget, VT) &&
31264 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31265 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31266 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31267 // the original vector width to handle cases where we split.
31268 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31269 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31270 SDValue ShX =
31271 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31272 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31273 SDValue ShY =
31274 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31275 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31276 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31277 DAG.getConstant(MaskX, DL, VT));
31278 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31279 DAG.getConstant(MaskY, DL, VT));
31280 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31281 }
31282
31283 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31284 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31285 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31286 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31287 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31288 }
31289
31290 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31291 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31292 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31293
31294 // Constant vXi16 funnel shifts can be efficiently handled by default.
31295 if (IsCst && EltSizeInBits == 16)
31296 return SDValue();
31297
31298 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31299 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31300 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31301
31302 // Split 256-bit integers on XOP/pre-AVX2 targets.
31303 // Split 512-bit integers on non 512-bit BWI targets.
31304 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31305 !Subtarget.hasAVX2())) ||
31306 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31307 EltSizeInBits < 32)) {
31308 // Pre-mask the amount modulo using the wider vector.
31309 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31310 return splitVectorOp(Op, DAG, DL);
31311 }
31312
31313 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31314 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31315 int ScalarAmtIdx = -1;
31316 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31317 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31318 if (EltSizeInBits == 16)
31319 return SDValue();
31320
31321 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31322 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31323 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31324 ScalarAmtIdx, Subtarget, DAG);
31325 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31326 ScalarAmtIdx, Subtarget, DAG);
31327 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31328 }
31329 }
31330
31331 MVT WideSVT = MVT::getIntegerVT(
31332 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31333 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31334
31335 // If per-element shifts are legal, fallback to generic expansion.
31336 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31337 return SDValue();
31338
31339 // Attempt to fold as:
31340 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31341 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31342 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31343 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31344 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31345 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31346 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31347 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31348 EltSizeInBits, DAG);
31349 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31350 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31351 if (!IsFSHR)
31352 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31353 EltSizeInBits, DAG);
31354 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31355 }
31356
31357 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31358 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31359 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31360 SDValue Z = DAG.getConstant(0, DL, VT);
31361 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31362 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31363 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31364 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31365 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31366 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31367 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31368 }
31369
31370 // Fallback to generic expansion.
31371 return SDValue();
31372 }
31373 assert(
31374 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31375 "Unexpected funnel shift type!");
31376
31377 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31378 bool OptForSize = DAG.shouldOptForSize();
31379 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31380
31381 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31382 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
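// e.g. i8 fshl(0x80, 0x01, z = 1): ((0x80 << 8) | 0x01) << 1 = 0x10002, and
// (0x10002 >> 8) truncated to i8 is 0x00, matching (0x80 << 1) | (0x01 >> 7).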
31383 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31384 !isa<ConstantSDNode>(Amt)) {
31385 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31386 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31387 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31388 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31389 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31390 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31391 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31392 if (IsFSHR) {
31393 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31394 } else {
31395 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31396 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31397 }
31398 return DAG.getZExtOrTrunc(Res, DL, VT);
31399 }
31400
31401 if (VT == MVT::i8 || ExpandFunnel)
31402 return SDValue();
31403
31404 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31405 if (VT == MVT::i16) {
31406 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31407 DAG.getConstant(15, DL, Amt.getValueType()));
31408 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31409 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31410 }
31411
31412 return Op;
31413}
31414
31415static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31416 SelectionDAG &DAG) {
31417 MVT VT = Op.getSimpleValueType();
31418 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31419
31420 SDLoc DL(Op);
31421 SDValue R = Op.getOperand(0);
31422 SDValue Amt = Op.getOperand(1);
31423 unsigned Opcode = Op.getOpcode();
31424 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31425 int NumElts = VT.getVectorNumElements();
31426 bool IsROTL = Opcode == ISD::ROTL;
31427
31428 // Check for constant splat rotation amount.
31429 APInt CstSplatValue;
31430 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31431
31432 // Check for splat rotate by zero.
31433 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31434 return R;
31435
31436 // AVX512 implicitly uses modulo rotation amounts.
31437 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31438 // Attempt to rotate by immediate.
31439 if (IsCstSplat) {
31440 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31441 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31442 return DAG.getNode(RotOpc, DL, VT, R,
31443 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31444 }
31445
31446 // Else, fall back on VPROLV/VPRORV.
31447 return Op;
31448 }
31449
31450 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31451 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31452 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31453 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31454 }
31455
31456 SDValue Z = DAG.getConstant(0, DL, VT);
31457
31458 if (!IsROTL) {
31459 // If the ISD::ROTR amount is constant, we're always better converting to
31460 // ISD::ROTL.
31461 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31462 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31463
31464 // XOP targets always prefer ISD::ROTL.
31465 if (Subtarget.hasXOP())
31466 return DAG.getNode(ISD::ROTL, DL, VT, R,
31467 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31468 }
31469
31470 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31471 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31472 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31473 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31474 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31475 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31476 DAG.getTargetConstant(0, DL, MVT::i8));
31477 }
31478
31479 // Split 256-bit integers on XOP/pre-AVX2 targets.
31480 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31481 return splitVectorIntBinary(Op, DAG, DL);
31482
31483 // XOP has 128-bit vector variable + immediate rotates.
31484 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31485 // XOP implicitly uses modulo rotation amounts.
31486 if (Subtarget.hasXOP()) {
31487 assert(IsROTL && "Only ROTL expected");
31488 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31489
31490 // Attempt to rotate by immediate.
31491 if (IsCstSplat) {
31492 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31493 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31494 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31495 }
31496
31497 // Use general rotate by variable (per-element).
31498 return Op;
31499 }
31500
31501 // Rotate by a uniform constant - expand back to shifts.
31502 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31503 // to other values when folded to shift amounts, losing the splat.
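// For example, a splat rotl by 3 on vXi16 elements expands to (x << 3) | (x >> 13).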
31504 if (IsCstSplat) {
31505 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31506 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31507 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31508 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31509 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31510 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31511 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31512 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31513 }
31514
31515 // Split 512-bit integers on non 512-bit BWI targets.
31516 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31517 return splitVectorIntBinary(Op, DAG, DL);
31518
31519 assert(
31520 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31521 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31522 Subtarget.hasAVX2()) ||
31523 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31524 "Only vXi32/vXi16/vXi8 vector rotates supported");
31525
31526 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31527 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31528
31529 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31530 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31531
31532 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31533 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31534 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
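// The doubled element turns a plain shift into a rotate: e.g. for bw=8, x=0xB1, y=3,
// the unpacked 16-bit lane 0xB1B1 shifted left by 3 gives 0x8D88, and packing back
// the high byte yields 0x8D == rotl(0xB1, 3).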
31535 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31536 int BaseRotAmtIdx = -1;
31537 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31538 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31539 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31540 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31541 }
31542 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31543 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31544 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31545 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31546 BaseRotAmtIdx, Subtarget, DAG);
31547 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31548 BaseRotAmtIdx, Subtarget, DAG);
31549 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31550 }
31551 }
31552
31553 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31554 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31555
31556 // Attempt to fold as unpack(x,x) << zext(y):
31557 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31558 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31559 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31560 if (!(ConstantAmt && EltSizeInBits != 8) &&
31561 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31562 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31563 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31564 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31565 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31566 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31567 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31568 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31569 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31570 }
31571
31572 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31573 // the amount bit.
31574 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31575 if (EltSizeInBits == 8) {
31576 MVT WideVT =
31577 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31578
31579 // Attempt to fold as:
31580 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31581 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31582 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31583 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31584 // If we're rotating by constant, just use default promotion.
31585 if (ConstantAmt)
31586 return SDValue();
31587 // See if we can perform this by widening to vXi16 or vXi32.
31588 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31589 R = DAG.getNode(
31590 ISD::OR, DL, WideVT, R,
31591 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31592 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31593 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31594 if (IsROTL)
31595 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31596 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31597 }
31598
31599 // We don't need ModuloAmt here as we just peek at individual bits.
31600 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31601 if (Subtarget.hasSSE41()) {
31602 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31603 // on the sign bit.
31604 V0 = DAG.getBitcast(VT, V0);
31605 V1 = DAG.getBitcast(VT, V1);
31606 Sel = DAG.getBitcast(VT, Sel);
31607 return DAG.getBitcast(SelVT,
31608 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31609 }
31610 // On pre-SSE41 targets we test for the sign bit by comparing to
31611 // zero - a negative value will set all bits of the lanes to true
31612 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31613 SDValue Z = DAG.getConstant(0, DL, SelVT);
31614 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31615 return DAG.getSelect(DL, SelVT, C, V0, V1);
31616 };
31617
31618 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31619 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31620 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31621 IsROTL = true;
31622 }
31623
31624 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31625 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31626
31627 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31628 // We can safely do this using i16 shifts as we're only interested in
31629 // the 3 lower bits of each byte.
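// After the shift by 5, bit 2 of the original amount sits in each byte's sign bit,
// so the first blend below selects the rotate-by-4 result exactly when that bit was
// set; each subsequent "a += a" promotes the next lower amount bit into the sign bit
// for the rotate-by-2 and rotate-by-1 stages.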
31630 Amt = DAG.getBitcast(ExtVT, Amt);
31631 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31632 Amt = DAG.getBitcast(VT, Amt);
31633
31634 // r = VSELECT(r, rot(r, 4), a);
31635 SDValue M;
31636 M = DAG.getNode(
31637 ISD::OR, DL, VT,
31638 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31639 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31640 R = SignBitSelect(VT, Amt, M, R);
31641
31642 // a += a
31643 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31644
31645 // r = VSELECT(r, rot(r, 2), a);
31646 M = DAG.getNode(
31647 ISD::OR, DL, VT,
31648 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31649 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31650 R = SignBitSelect(VT, Amt, M, R);
31651
31652 // a += a
31653 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31654
31655 // return VSELECT(r, rot(r, 1), a);
31656 M = DAG.getNode(
31657 ISD::OR, DL, VT,
31658 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31659 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31660 return SignBitSelect(VT, Amt, M, R);
31661 }
31662
31663 bool IsSplatAmt = DAG.isSplatValue(Amt);
31664 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31665 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31666
31667 // Fallback for splats + all supported variable shifts.
31668 // Fallback for non-constants AVX2 vXi16 as well.
31669 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31670 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31671 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31672 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31673 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31674 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31675 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31676 }
31677
31678 // Everything below assumes ISD::ROTL.
31679 if (!IsROTL) {
31680 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31681 IsROTL = true;
31682 }
31683
31684 // ISD::ROT* uses modulo rotate amounts.
31685 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31686
31687 assert(IsROTL && "Only ROTL supported");
31688
31689 // As with shifts, attempt to convert the rotation amount to a multiplication
31690 // factor; otherwise fall back to general expansion.
31691 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31692 if (!Scale)
31693 return SDValue();
31694
31695 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
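// For a rotate amount y, the low product x * (1 << y) keeps the left-shifted bits
// while MULHU recovers the bits that wrapped into the upper half, so OR-ing the two
// yields rotl(x, y).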
31696 if (EltSizeInBits == 16) {
31697 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31698 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31699 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31700 }
31701
31702 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31703 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31704 // that can then be OR'd with the lower 32-bits.
31705 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31706 static const int OddMask[] = {1, 1, 3, 3};
31707 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31708 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31709
31710 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31711 DAG.getBitcast(MVT::v2i64, R),
31712 DAG.getBitcast(MVT::v2i64, Scale));
31713 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31714 DAG.getBitcast(MVT::v2i64, R13),
31715 DAG.getBitcast(MVT::v2i64, Scale13));
31716 Res02 = DAG.getBitcast(VT, Res02);
31717 Res13 = DAG.getBitcast(VT, Res13);
31718
31719 return DAG.getNode(ISD::OR, DL, VT,
31720 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31721 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31722}
31723
31724/// Returns true if the operand type is exactly twice the native width, and
31725/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31726/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31727/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31728bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31729 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31730
31731 if (OpWidth == 64)
31732 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31733 if (OpWidth == 128)
31734 return Subtarget.canUseCMPXCHG16B();
31735
31736 return false;
31737}
31738
31739 TargetLowering::AtomicExpansionKind
31740 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31741 Type *MemType = SI->getValueOperand()->getType();
31742
31743 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31744 !Subtarget.useSoftFloat()) {
31745 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31746 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31747 return AtomicExpansionKind::None;
31748
31749 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31750 Subtarget.hasAVX())
31751 return AtomicExpansionKind::None;
31752 }
31753
31754 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31755 : AtomicExpansionKind::None;
31756 }
31757
31758// Note: this turns large loads into lock cmpxchg8b/16b.
31759 TargetLowering::AtomicExpansionKind
31760 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31761 Type *MemType = LI->getType();
31762
31763 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31764 !Subtarget.useSoftFloat()) {
31765 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31766 // can use movq to do the load. If we have X87 we can load into an 80-bit
31767 // X87 register and store it to a stack temporary.
31768 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31769 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31770 return AtomicExpansionKind::None;
31771
31772 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31773 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31774 Subtarget.hasAVX())
31775 return AtomicExpansionKind::None;
31776 }
31777
31778 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31779 : AtomicExpansionKind::None;
31780 }
31781
31782 enum BitTestKind : unsigned {
31783 UndefBit,
31784 ConstantBit,
31785 NotConstantBit,
31786 ShiftBit,
31787 NotShiftBit
31788 };
31789
31790static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31791 using namespace llvm::PatternMatch;
31792 BitTestKind BTK = UndefBit;
31793 if (auto *C = dyn_cast<ConstantInt>(V)) {
31794 // Check if V is a power of 2 or NOT power of 2.
31795 if (isPowerOf2_64(C->getZExtValue()))
31796 BTK = ConstantBit;
31797 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31798 BTK = NotConstantBit;
31799 return {V, BTK};
31800 }
31801
31802 // Check if V is some power of 2 pattern known to be non-zero
31803 if (auto *I = dyn_cast<Instruction>(V)) {
31804 bool Not = false;
31805 // Check if we have a NOT
31806 Value *PeekI;
31807 if (match(I, m_Not(m_Value(PeekI))) ||
31808 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31809 Not = true;
31810 I = dyn_cast<Instruction>(PeekI);
31811
31812 // If I is constant, it will fold and we can evaluate later. If it's an
31813 // argument or something of that nature, we can't analyze.
31814 if (I == nullptr)
31815 return {nullptr, UndefBit};
31816 }
31817 // We can only use 1 << X without more sophisticated analysis. C << X where
31818 // C is a power of 2 but not 1 can result in zero which cannot be translated
31819 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
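// e.g. for i8, (2 << 7) wraps to 0, so only a shifted constant of exactly 1 is
// accepted here.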
31820 if (I->getOpcode() == Instruction::Shl) {
31821 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31822 // -X` and some other provable power of 2 patterns that we can use CTZ on
31823 // may be profitable.
31824 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31825 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31826 // be provably a non-zero power of 2.
31827 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31828 // transformable to bittest.
31829 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31830 if (!ShiftVal)
31831 return {nullptr, UndefBit};
31832 if (ShiftVal->equalsInt(1))
31833 BTK = Not ? NotShiftBit : ShiftBit;
31834
31835 if (BTK == UndefBit)
31836 return {nullptr, UndefBit};
31837
31838 Value *BitV = I->getOperand(1);
31839
31840 // Read past a shiftmask instruction to find count
31841 Value *AndOp;
31842 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31843 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31844 BitV = AndOp;
31845
31846 return {BitV, BTK};
31847 }
31848 }
31849 return {nullptr, UndefBit};
31850}
31851
31852 TargetLowering::AtomicExpansionKind
31853 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31854 using namespace llvm::PatternMatch;
31855 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31856 // prefix to a normal instruction for these operations.
31857 if (AI->use_empty())
31858 return AtomicExpansionKind::None;
31859
31860 if (AI->getOperation() == AtomicRMWInst::Xor) {
31861 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31862 // preferable to both `cmpxchg` and `btc`.
31863 if (match(AI->getOperand(1), m_SignMask()))
31864 return AtomicExpansionKind::None;
31865 }
31866
31867 // If the atomicrmw's result is used by a single-bit AND, we may use a
31868 // bts/btr/btc instruction for these operations.
31869 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31870 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31871 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31872 // detect it.
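// For illustration, a pattern such as:
//   %old = atomicrmw or ptr %p, i32 8 seq_cst
//   %bit = and i32 %old, 8
// can be selected as a single "lock bts" that sets bit 3 and captures its previous
// value.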
31873 Instruction *I = AI->user_back();
31874 auto BitChange = FindSingleBitChange(AI->getValOperand());
31875 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31876 I->getOpcode() != Instruction::And ||
31877 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31878 AI->getParent() != I->getParent())
31879 return AtomicExpansionKind::CmpXChg;
31880
31881 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31882
31883 // This is a redundant AND, it should get cleaned up elsewhere.
31884 if (AI == I->getOperand(OtherIdx))
31885 return AtomicExpansionKind::CmpXChg;
31886
31887 // The following instruction must be an AND with a single bit.
31888 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31889 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31890 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31891 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31892 return AtomicExpansionKind::CmpXChg;
31893 }
31894 if (AI->getOperation() == AtomicRMWInst::And) {
31895 return ~C1->getValue() == C2->getValue()
31896 ? AtomicExpansionKind::BitTestIntrinsic
31897 : AtomicExpansionKind::CmpXChg;
31898 }
31899 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31900 : AtomicExpansionKind::CmpXChg;
31901 }
31902
31903 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31904
31905 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31906 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31907 return AtomicExpansionKind::CmpXChg;
31908
31909 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31910
31911 // If shift amounts are not the same we can't use BitTestIntrinsic.
31912 if (BitChange.first != BitTested.first)
31913 return AtomicExpansionKind::CmpXChg;
31914
31915 // For an atomic AND, the operand must be a mask with all bits set except one,
31916 // and the user must test exactly the single bit left unset in that mask.
31917 if (AI->getOperation() == AtomicRMWInst::And)
31918 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31919 ? AtomicExpansionKind::BitTestIntrinsic
31920 : AtomicExpansionKind::CmpXChg;
31921
31922 // For an atomic XOR/OR, we must be setting/toggling and testing the same bit.
31923 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31924 ? AtomicExpansionKind::BitTestIntrinsic
31925 : AtomicExpansionKind::CmpXChg;
31926 }
31927
31928void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31929 IRBuilder<> Builder(AI);
31930 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31931 Intrinsic::ID IID_C;
31932 Intrinsic::ID IID_I;
31933 switch (AI->getOperation()) {
31934 default:
31935 llvm_unreachable("Unknown atomic operation");
31936 case AtomicRMWInst::Or:
31937 IID_C = Intrinsic::x86_atomic_bts;
31938 IID_I = Intrinsic::x86_atomic_bts_rm;
31939 break;
31940 case AtomicRMWInst::Xor:
31941 IID_C = Intrinsic::x86_atomic_btc;
31942 IID_I = Intrinsic::x86_atomic_btc_rm;
31943 break;
31944 case AtomicRMWInst::And:
31945 IID_C = Intrinsic::x86_atomic_btr;
31946 IID_I = Intrinsic::x86_atomic_btr_rm;
31947 break;
31948 }
31949 Instruction *I = AI->user_back();
31950 LLVMContext &Ctx = AI->getContext();
31951 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31952 PointerType::getUnqual(Ctx));
31953 Value *Result = nullptr;
31954 auto BitTested = FindSingleBitChange(AI->getValOperand());
31955 assert(BitTested.first != nullptr);
31956
31957 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31958 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31959
31960 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31961 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31962 {Addr, Builder.getInt8(Imm)});
31963 } else {
31964 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31965
31966 Value *SI = BitTested.first;
31967 assert(SI != nullptr);
31968
31969 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31970 // to mask it.
31971 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31972 Value *BitPos =
31973 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31974 // Todo(1): In many cases it may be provable that SI is less than
31975 // ShiftBits in which case this mask is unnecessary
31976 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31977 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31978 // favor of just a raw BT{S|R|C}.
31979
31980 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31981 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31982
31983 // If the result is only used for zero/non-zero status then we don't need to
31984 // shift the value back. Otherwise do so.
31985 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31986 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31987 if (ICmp->isEquality()) {
31988 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31989 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31990 if (C0 || C1) {
31991 assert(C0 == nullptr || C1 == nullptr);
31992 if ((C0 ? C0 : C1)->isZero())
31993 continue;
31994 }
31995 }
31996 }
31997 Result = Builder.CreateShl(Result, BitPos);
31998 break;
31999 }
32000 }
32001
32002 I->replaceAllUsesWith(Result);
32003 I->eraseFromParent();
32004 AI->eraseFromParent();
32005}
32006
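// Detect an atomicrmw whose only (transitive) use is a flag-style comparison, e.g.
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %cmp = icmp eq i32 %old, %v
// which can become a "lock sub" whose EFLAGS result is tested directly (see
// emitCmpArithAtomicRMWIntrinsic below).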
32007 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32008 using namespace llvm::PatternMatch;
32009 if (!AI->hasOneUse())
32010 return false;
32011
32012 Value *Op = AI->getOperand(1);
32013 CmpPredicate Pred;
32014 Instruction *I = AI->user_back();
32015 AtomicRMWInst::BinOp Opc = AI->getOperation();
32016 if (Opc == AtomicRMWInst::Add) {
32017 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32018 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32019 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32020 if (match(I->user_back(),
32021 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32022 return true;
32023 if (match(I->user_back(),
32024 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32025 return true;
32026 }
32027 return false;
32028 }
32029 if (Opc == AtomicRMWInst::Sub) {
32030 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32031 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32032 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32033 if (match(I->user_back(),
32034 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32035 return true;
32036 if (match(I->user_back(),
32037 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32038 return true;
32039 }
32040 return false;
32041 }
32042 if ((Opc == AtomicRMWInst::Or &&
32043 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32044 (Opc == AtomicRMWInst::And &&
32045 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32046 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32047 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32048 Pred == CmpInst::ICMP_SLT;
32049 if (match(I->user_back(),
32050 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32051 return true;
32052 return false;
32053 }
32054 if (Opc == AtomicRMWInst::Xor) {
32055 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32056 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32057 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32058 if (match(I->user_back(),
32059 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32060 return true;
32061 if (match(I->user_back(),
32062 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32063 return true;
32064 }
32065 return false;
32066 }
32067
32068 return false;
32069}
32070
32071void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32072 AtomicRMWInst *AI) const {
32073 IRBuilder<> Builder(AI);
32074 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32075 Instruction *TempI = nullptr;
32076 LLVMContext &Ctx = AI->getContext();
32077 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32078 if (!ICI) {
32079 TempI = AI->user_back();
32080 assert(TempI->hasOneUse() && "Must have one use");
32081 ICI = cast<ICmpInst>(TempI->user_back());
32082 }
32083 X86::CondCode CC = X86::COND_INVALID;
32084 ICmpInst::Predicate Pred = ICI->getPredicate();
32085 switch (Pred) {
32086 default:
32087 llvm_unreachable("Not supported Pred");
32088 case CmpInst::ICMP_EQ:
32089 CC = X86::COND_E;
32090 break;
32091 case CmpInst::ICMP_NE:
32092 CC = X86::COND_NE;
32093 break;
32094 case CmpInst::ICMP_SLT:
32095 CC = X86::COND_S;
32096 break;
32097 case CmpInst::ICMP_SGT:
32098 CC = X86::COND_NS;
32099 break;
32100 }
32101 Intrinsic::ID IID;
32102 switch (AI->getOperation()) {
32103 default:
32104 llvm_unreachable("Unknown atomic operation");
32105 case AtomicRMWInst::Add:
32106 IID = Intrinsic::x86_atomic_add_cc;
32107 break;
32108 case AtomicRMWInst::Sub:
32109 IID = Intrinsic::x86_atomic_sub_cc;
32110 break;
32111 case AtomicRMWInst::Or:
32112 IID = Intrinsic::x86_atomic_or_cc;
32113 break;
32114 case AtomicRMWInst::And:
32115 IID = Intrinsic::x86_atomic_and_cc;
32116 break;
32117 case AtomicRMWInst::Xor:
32118 IID = Intrinsic::x86_atomic_xor_cc;
32119 break;
32120 }
32121 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32122 PointerType::getUnqual(Ctx));
32123 Value *Call = Builder.CreateIntrinsic(
32124 IID, AI->getType(),
32125 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32126 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32127 ICI->replaceAllUsesWith(Result);
32128 ICI->eraseFromParent();
32129 if (TempI)
32130 TempI->eraseFromParent();
32131 AI->eraseFromParent();
32132}
32133
32134 TargetLowering::AtomicExpansionKind
32135 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32136 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32137 Type *MemType = AI->getType();
32138
32139 // If the operand is too big, we must see if cmpxchg8/16b is available
32140 // and default to library calls otherwise.
32141 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32142 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32143 : AtomicExpansionKind::None;
32144 }
32145
32146 AtomicRMWInst::BinOp Op = AI->getOperation();
32147 switch (Op) {
32148 case AtomicRMWInst::Xchg:
32149 return AtomicExpansionKind::None;
32150 case AtomicRMWInst::Add:
32151 case AtomicRMWInst::Sub:
32152 if (shouldExpandCmpArithRMWInIR(AI))
32153 return AtomicExpansionKind::CmpArithIntrinsic;
32154 // It's better to use xadd, xsub or xchg for these in other cases.
32155 return AtomicExpansionKind::None;
32156 case AtomicRMWInst::Or:
32157 case AtomicRMWInst::And:
32158 case AtomicRMWInst::Xor:
32159 if (shouldExpandCmpArithRMWInIR(AI))
32160 return AtomicExpansionKind::CmpArithIntrinsic;
32161 return shouldExpandLogicAtomicRMWInIR(AI);
32162 case AtomicRMWInst::Nand:
32163 case AtomicRMWInst::Max:
32164 case AtomicRMWInst::Min:
32175 default:
32176 // These always require a non-trivial set of data operations on x86. We must
32177 // use a cmpxchg loop.
32178 return AtomicExpansionKind::CmpXChg;
32179 }
32180}
32181
32182LoadInst *
32183X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32184 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32185 Type *MemType = AI->getType();
32186 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32187 // there is no benefit in turning such RMWs into loads, and it is actually
32188 // harmful as it introduces a mfence.
32189 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32190 return nullptr;
32191
32192 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32193 // lowering available in lowerAtomicArith.
32194 // TODO: push more cases through this path.
32195 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32196 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32197 AI->use_empty())
32198 return nullptr;
32199
32200 IRBuilder<> Builder(AI);
32201 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32202 auto SSID = AI->getSyncScopeID();
32203 // We must restrict the ordering to avoid generating loads with Release or
32204 // ReleaseAcquire orderings.
32205 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32206
32207 // Before the load we need a fence. Here is an example lifted from
32208 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32209 // is required:
32210 // Thread 0:
32211 // x.store(1, relaxed);
32212 // r1 = y.fetch_add(0, release);
32213 // Thread 1:
32214 // y.fetch_add(42, acquire);
32215 // r2 = x.load(relaxed);
32216 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32217 // lowered to just a load without a fence. A mfence flushes the store buffer,
32218 // making the optimization clearly correct.
32219 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32220 // otherwise, we might be able to be more aggressive on relaxed idempotent
32221 // rmw. In practice, they do not look useful, so we don't try to be
32222 // especially clever.
32223
32224 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32225 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32226 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32227
32228 // Finally we can emit the atomic load.
32229 LoadInst *Loaded = Builder.CreateAlignedLoad(
32230 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32231 Loaded->setAtomic(Order, SSID);
32232 AI->replaceAllUsesWith(Loaded);
32233 AI->eraseFromParent();
32234 return Loaded;
32235}
32236
32237/// Emit a locked operation on a stack location which does not change any
32238/// memory location, but does involve a lock prefix. Location is chosen to be
32239/// a) very likely accessed only by a single thread to minimize cache traffic,
32240/// and b) definitely dereferenceable. Returns the new Chain result.
32241 static SDValue emitLockedStackOp(SelectionDAG &DAG,
32242 const X86Subtarget &Subtarget, SDValue Chain,
32243 const SDLoc &DL) {
32244 // Implementation notes:
32245 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32246 // operations issued by the current processor. As such, the location
32247 // referenced is not relevant for the ordering properties of the instruction.
32248 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32249 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32250 // 2) Using an immediate operand appears to be the best encoding choice
32251 // here since it doesn't require an extra register.
32252 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32253 // is small enough it might just be measurement noise.)
32254 // 4) When choosing offsets, there are several contributing factors:
32255 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32256 // line aligned stack object to improve this case.)
32257 // b) To minimize our chances of introducing a false dependence, we prefer
32258 // to offset the stack usage from TOS slightly.
32259 // c) To minimize concerns about cross thread stack usage - in particular,
32260 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32261 // captures state in the TOS frame and accesses it from many threads -
32262 // we want to use an offset such that the offset is in a distinct cache
32263 // line from the TOS frame.
32264 //
32265 // For a general discussion of the tradeoffs and benchmark results, see:
32266 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32267
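// In effect this emits an instruction like "lock orl $0, -64(%rsp)" (offset 0 when
// there is no red zone, and %esp on 32-bit targets): the LOCK prefix supplies the
// ordering barrier while OR-ing with 0 leaves the stack slot unchanged.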
32268 auto &MF = DAG.getMachineFunction();
32269 auto &TFL = *Subtarget.getFrameLowering();
32270 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32271
32272 if (Subtarget.is64Bit()) {
32273 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32274 SDValue Ops[] = {
32275 DAG.getRegister(X86::RSP, MVT::i64), // Base
32276 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32277 DAG.getRegister(0, MVT::i64), // Index
32278 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32279 DAG.getRegister(0, MVT::i16), // Segment.
32280 Zero,
32281 Chain};
32282 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32283 MVT::Other, Ops);
32284 return SDValue(Res, 1);
32285 }
32286
32287 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32288 SDValue Ops[] = {
32289 DAG.getRegister(X86::ESP, MVT::i32), // Base
32290 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32291 DAG.getRegister(0, MVT::i32), // Index
32292 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32293 DAG.getRegister(0, MVT::i16), // Segment.
32294 Zero,
32295 Chain
32296 };
32297 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32298 MVT::Other, Ops);
32299 return SDValue(Res, 1);
32300}
32301
32302 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32303 SelectionDAG &DAG) {
32304 SDLoc dl(Op);
32305 AtomicOrdering FenceOrdering =
32306 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32307 SyncScope::ID FenceSSID =
32308 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32309
32310 // The only fence that needs an instruction is a sequentially-consistent
32311 // cross-thread fence.
32312 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32313 FenceSSID == SyncScope::System) {
32314 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32315 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32316
32317 SDValue Chain = Op.getOperand(0);
32318 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32319 }
32320
32321 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32322 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32323}
32324
32325 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32326 SelectionDAG &DAG) {
32327 MVT T = Op.getSimpleValueType();
32328 SDLoc DL(Op);
32329 unsigned Reg = 0;
32330 unsigned size = 0;
32331 switch(T.SimpleTy) {
32332 default: llvm_unreachable("Invalid value type!");
32333 case MVT::i8: Reg = X86::AL; size = 1; break;
32334 case MVT::i16: Reg = X86::AX; size = 2; break;
32335 case MVT::i32: Reg = X86::EAX; size = 4; break;
32336 case MVT::i64:
32337 assert(Subtarget.is64Bit() && "Node not type legal!");
32338 Reg = X86::RAX; size = 8;
32339 break;
32340 }
32341 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32342 Op.getOperand(2), SDValue());
32343 SDValue Ops[] = { cpIn.getValue(0),
32344 Op.getOperand(1),
32345 Op.getOperand(3),
32346 DAG.getTargetConstant(size, DL, MVT::i8),
32347 cpIn.getValue(1) };
32348 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32349 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32350 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32351 Ops, T, MMO);
32352
32353 SDValue cpOut =
32354 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32355 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32356 MVT::i32, cpOut.getValue(2));
32357 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32358
32359 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32360 cpOut, Success, EFLAGS.getValue(1));
32361}
32362
32363// Create MOVMSKB, taking into account whether we need to split for AVX1.
32364 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32365 const X86Subtarget &Subtarget) {
32366 MVT InVT = V.getSimpleValueType();
32367
32368 if (InVT == MVT::v64i8) {
32369 SDValue Lo, Hi;
32370 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32371 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32372 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32373 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32374 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32375 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32376 DAG.getConstant(32, DL, MVT::i8));
32377 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32378 }
32379 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32380 SDValue Lo, Hi;
32381 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32382 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32383 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32384 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32385 DAG.getConstant(16, DL, MVT::i8));
32386 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32387 }
32388
32389 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32390}
32391
32392static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32393 SelectionDAG &DAG) {
32394 SDValue Src = Op.getOperand(0);
32395 MVT SrcVT = Src.getSimpleValueType();
32396 MVT DstVT = Op.getSimpleValueType();
32397
32398 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32399 // half to v32i1 and concatenating the result.
32400 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32401 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32402 assert(Subtarget.hasBWI() && "Expected BWI target");
32403 SDLoc dl(Op);
32404 SDValue Lo, Hi;
32405 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32406 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32407 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32408 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32409 }
32410
32411 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32412 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32413 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32414 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32415 SDLoc DL(Op);
32416 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32417 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32418 return DAG.getZExtOrTrunc(V, DL, DstVT);
32419 }
32420
32421 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32422 SrcVT == MVT::i64) && "Unexpected VT!");
32423
32424 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32425 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32426 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32427 // This conversion needs to be expanded.
32428 return SDValue();
32429
32430 SDLoc dl(Op);
32431 if (SrcVT.isVector()) {
32432 // Widen the input vector in the case of MVT::v2i32.
32433 // Example: from MVT::v2i32 to MVT::v4i32.
32434 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32435 SrcVT.getVectorNumElements() * 2);
32436 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32437 DAG.getUNDEF(SrcVT));
32438 } else {
32439 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32440 "Unexpected source type in LowerBITCAST");
32441 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32442 }
32443
32444 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32445 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32446
32447 if (DstVT == MVT::x86mmx)
32448 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32449
32450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32451 DAG.getVectorIdxConstant(0, dl));
32452}
32453
32454/// Compute the horizontal sum of bytes in V for the elements of VT.
32455///
32456/// Requires V to be a byte vector and VT to be an integer vector type with
32457/// wider elements than V's type. The width of the elements of VT determines
32458/// how many bytes of V are summed horizontally to produce each element of the
32459/// result.
32460 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32461 const X86Subtarget &Subtarget,
32462 SelectionDAG &DAG) {
32463 SDLoc DL(V);
32464 MVT ByteVecVT = V.getSimpleValueType();
32465 MVT EltVT = VT.getVectorElementType();
32466 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32467 "Expected value to have byte element type.");
32468 assert(EltVT != MVT::i8 &&
32469 "Horizontal byte sum only makes sense for wider elements!");
32470 unsigned VecSize = VT.getSizeInBits();
32471 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32472
32473 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32474 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32475 if (EltVT == MVT::i64) {
32476 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32477 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32478 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32479 return DAG.getBitcast(VT, V);
32480 }
32481
32482 if (EltVT == MVT::i32) {
32483 // We unpack the low half and high half into i32s interleaved with zeros so
32484 // that we can use PSADBW to horizontally sum them. The most useful part of
32485 // this is that it lines up the results of two PSADBW instructions to be
32486 // two v2i64 vectors which concatenated are the 4 population counts. We can
32487 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32488 SDValue Zeros = DAG.getConstant(0, DL, VT);
32489 SDValue V32 = DAG.getBitcast(VT, V);
32490 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32491 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32492
32493 // Do the horizontal sums into two v2i64s.
32494 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32495 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32496 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32497 DAG.getBitcast(ByteVecVT, Low), Zeros);
32498 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32499 DAG.getBitcast(ByteVecVT, High), Zeros);
32500
32501 // Merge them together.
32502 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32503 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32504 DAG.getBitcast(ShortVecVT, Low),
32505 DAG.getBitcast(ShortVecVT, High));
32506
32507 return DAG.getBitcast(VT, V);
32508 }
32509
32510 // The only element type left is i16.
32511 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32512
32513 // To obtain pop count for each i16 element starting from the pop count for
32514 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32515 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32516 // directly supported.
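// e.g. for an i16 lane holding byte counts a (low) and b (high): the i16 shift left
// by 8 places a in the high byte, the i8 add produces a+b there, and the final i16
// shift right by 8 leaves a+b as the lane's value.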
32517 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32518 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32519 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32520 DAG.getBitcast(ByteVecVT, V));
32521 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32522}
32523
32524 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32525 const X86Subtarget &Subtarget,
32526 SelectionDAG &DAG) {
32527 MVT VT = Op.getSimpleValueType();
32528 MVT EltVT = VT.getVectorElementType();
32529 int NumElts = VT.getVectorNumElements();
32530 (void)EltVT;
32531 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32532
32533 // Implement a lookup table in register by using an algorithm based on:
32534 // http://wm.ite.pl/articles/sse-popcount.html
32535 //
32536 // The general idea is that every lower byte nibble in the input vector is an
32537 // index into an in-register pre-computed pop count table. We then split up
32538 // the input vector into two new ones: (1) a vector with only the shifted-right
32539 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32540 // masked out higher ones) for each byte. PSHUFB is used separately with both
32541 // to index the in-register table. Next, both are added and the result is an
32542 // i8 vector where each element contains the pop count for its input byte.
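// e.g. for the input byte 0xB6 the two PSHUFB lookups return LUT[0xB] = 3 and
// LUT[0x6] = 2, and the final add produces popcount(0xB6) = 5.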
32543 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32544 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32545 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32546 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32547
32548 SmallVector<SDValue, 64> LUTVec;
32549 for (int i = 0; i < NumElts; ++i)
32550 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32551 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32552 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32553
32554 // High nibbles
32555 SDValue FourV = DAG.getConstant(4, DL, VT);
32556 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32557
32558 // Low nibbles
32559 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32560
32561 // The input vector is used as the shuffle mask that index elements into the
32562 // LUT. After counting low and high nibbles, add the vector to obtain the
32563 // final pop count per i8 element.
32564 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32565 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32566 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32567}
32568
32569// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32570// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32571 static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32572 const X86Subtarget &Subtarget,
32573 SelectionDAG &DAG) {
32574 MVT VT = Op.getSimpleValueType();
32575 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32576 "Unknown CTPOP type to handle");
32577 SDValue Op0 = Op.getOperand(0);
32578
32579 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32580 if (Subtarget.hasVPOPCNTDQ()) {
32581 unsigned NumElems = VT.getVectorNumElements();
32582 assert((VT.getVectorElementType() == MVT::i8 ||
32583 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32584 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32585 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32586 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32587 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32588 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32589 }
32590 }
32591
32592 // Decompose 256-bit ops into smaller 128-bit ops.
32593 if (VT.is256BitVector() && !Subtarget.hasInt256())
32594 return splitVectorIntUnary(Op, DAG, DL);
32595
32596 // Decompose 512-bit ops into smaller 256-bit ops.
32597 if (VT.is512BitVector() && !Subtarget.hasBWI())
32598 return splitVectorIntUnary(Op, DAG, DL);
32599
32600 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32601 if (VT.getScalarType() != MVT::i8) {
32602 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32603 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32604 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32605 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32606 }
32607
32608 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32609 if (!Subtarget.hasSSSE3())
32610 return SDValue();
32611
32612 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32613}
32614
32615static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32616 SelectionDAG &DAG) {
32617 MVT VT = N.getSimpleValueType();
32618 SDValue Op = N.getOperand(0);
32619 SDLoc DL(N);
32620
32621 if (VT.isScalarInteger()) {
32622 // Compute the lower/upper bounds of the active bits of the value,
32623 // allowing us to shift the active bits down if necessary to fit into the
32624 // special cases below.
32625 KnownBits Known = DAG.computeKnownBits(Op);
32626 if (Known.isConstant())
32627 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32628 unsigned LZ = Known.countMinLeadingZeros();
32629 unsigned TZ = Known.countMinTrailingZeros();
32630 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32631 unsigned ActiveBits = Known.getBitWidth() - LZ;
32632 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32633
32634 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
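// e.g. x=0b11: 3 - 1 = 2; x=0b10: 2 - 1 = 1; x=0b01: 1 - 0 = 1.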
32635 if (ShiftedActiveBits <= 2) {
32636 if (ActiveBits > 2)
32637 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32638 DAG.getShiftAmountConstant(TZ, VT, DL));
32639 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32640 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32641 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32642 DAG.getShiftAmountConstant(1, VT, DL)));
32643 return DAG.getZExtOrTrunc(Op, DL, VT);
32644 }
32645
32646 // i3 CTPOP - perform LUT into i32 integer.
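// The constant 0b1110100110010100 packs popcount(x) for x = 7..0 into 2-bit fields,
// so shifting it right by 2*x and masking with 0x3 yields popcount(x).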
32647 if (ShiftedActiveBits <= 3) {
32648 if (ActiveBits > 3)
32649 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32650 DAG.getShiftAmountConstant(TZ, VT, DL));
32651 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32652 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32653 DAG.getShiftAmountConstant(1, VT, DL));
32654 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32655 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32656 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32657 DAG.getConstant(0x3, DL, MVT::i32));
32658 return DAG.getZExtOrTrunc(Op, DL, VT);
32659 }
32660
32661 // i4 CTPOP - perform LUT into i64 integer.
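// The constant 0x4332322132212110 packs popcount(x) for x = 15..0 into 4-bit
// nibbles, so shifting it right by 4*x and masking with 0x7 yields popcount(x).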
32662 if (ShiftedActiveBits <= 4 &&
32663 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32664 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32665 if (ActiveBits > 4)
32666 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32667 DAG.getShiftAmountConstant(TZ, VT, DL));
32668 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32669 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32670 DAG.getConstant(4, DL, MVT::i32));
32671 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32672 DAG.getShiftAmountOperand(MVT::i64, Op));
32673 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32674 DAG.getConstant(0x7, DL, MVT::i64));
32675 return DAG.getZExtOrTrunc(Op, DL, VT);
32676 }
32677
32678 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
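// Multiplying by 0x08040201 places copies of x at bit offsets 0, 9, 18 and 27;
// after the >> 3 and the AND with 0x11111111 each nibble holds one distinct bit of
// x, and the multiply by 0x11111111 sums those nibbles into the top nibble, which
// the final >> 28 extracts.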
32679 if (ShiftedActiveBits <= 8) {
32680 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32681 if (ActiveBits > 8)
32682 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32683 DAG.getShiftAmountConstant(TZ, VT, DL));
32684 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32685 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32686 DAG.getConstant(0x08040201U, DL, MVT::i32));
32687 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32688 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32689 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32690 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32691 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32692 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32693 return DAG.getZExtOrTrunc(Op, DL, VT);
32694 }
32695
32696 return SDValue(); // fallback to generic expansion.
32697 }
32698
32699 assert(VT.isVector() &&
32700 "We only do custom lowering for vector population count.");
32701 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32702}
32703
32704 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32705 MVT VT = Op.getSimpleValueType();
32706 SDValue In = Op.getOperand(0);
32707 SDLoc DL(Op);
32708
32709 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32710 // perform the BITREVERSE.
32711 if (!VT.isVector()) {
32712 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32713 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32714 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32715 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32716 DAG.getVectorIdxConstant(0, DL));
32717 }
32718
32719 int NumElts = VT.getVectorNumElements();
32720 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32721
32722 // Decompose 256-bit ops into smaller 128-bit ops.
32723 if (VT.is256BitVector())
32724 return splitVectorIntUnary(Op, DAG, DL);
32725
32726 assert(VT.is128BitVector() &&
32727 "Only 128-bit vector bitreverse lowering supported.");
32728
32729 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32730 // perform the BSWAP in the shuffle.
32731 // It's best to shuffle using the second operand as this will implicitly allow
32732 // memory folding for multiple vectors.
32733 SmallVector<SDValue, 16> MaskElts;
32734 for (int i = 0; i != NumElts; ++i) {
32735 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32736 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32737 int PermuteByte = SourceByte | (2 << 5);
32738 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32739 }
32740 }
32741
32742 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32743 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32744 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32745 Res, Mask);
32746 return DAG.getBitcast(VT, Res);
32747}
32748
32749 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32750 SelectionDAG &DAG) {
32751 MVT VT = Op.getSimpleValueType();
32752
32753 if (Subtarget.hasXOP() && !VT.is512BitVector())
32754 return LowerBITREVERSE_XOP(Op, DAG);
32755
32756 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32757 "SSSE3 or GFNI required for BITREVERSE");
32758
32759 SDValue In = Op.getOperand(0);
32760 SDLoc DL(Op);
32761
32762 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32763 if (VT.is512BitVector() && !Subtarget.hasBWI())
32764 return splitVectorIntUnary(Op, DAG, DL);
32765
32766 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32767 if (VT.is256BitVector() && !Subtarget.hasInt256())
32768 return splitVectorIntUnary(Op, DAG, DL);
32769
32770 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32771 if (!VT.isVector()) {
32772 assert(
32773 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32774 "Only tested for i8/i16/i32/i64");
32775 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32776 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32777 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32778 DAG.getBitcast(MVT::v16i8, Res));
32779 Res =
32780 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32781 DAG.getVectorIdxConstant(0, DL));
32782 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32783 }
32784
32785 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32786
32787 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32788 if (VT.getScalarType() != MVT::i8) {
32789 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32790 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32791 Res = DAG.getBitcast(ByteVT, Res);
32792 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32793 return DAG.getBitcast(VT, Res);
32794 }
32795 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32796 "Only byte vector BITREVERSE supported");
32797
32798 unsigned NumElts = VT.getVectorNumElements();
32799
32800 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32801 if (Subtarget.hasGFNI()) {
32802 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32803 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32804 DAG.getTargetConstant(0, DL, MVT::i8));
32805 }
32806
32807 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32808 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32809 // 0-15 value (moved to the other nibble).
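// e.g. for the byte 0x1F: LoLUT[0xF] = 0xF0, HiLUT[0x1] = 0x08, and the final OR
// gives 0xF8 == bitreverse(0x1F).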
32810 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32811 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32812 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32813
32814 const int LoLUT[16] = {
32815 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32816 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32817 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32818 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32819 const int HiLUT[16] = {
32820 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32821 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32822 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32823 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
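  // Worked example: for the byte 0x1E (0b00011110) the low nibble 0xE maps to
  // LoLUT[0xE] = 0x70 and the high nibble 0x1 maps to HiLUT[0x1] = 0x08;
  // OR-ing the two PSHUFB results gives 0x78 (0b01111000), the reversed byte.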
32824
32825 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32826 for (unsigned i = 0; i < NumElts; ++i) {
32827 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32828 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32829 }
32830
32831 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32832 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32833 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32834 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32835 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32836}
32837
32838static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32839 SelectionDAG &DAG) {
32840 SDLoc DL(Op);
32841 SDValue X = Op.getOperand(0);
32842 MVT VT = Op.getSimpleValueType();
32843
32844 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32845 if (VT == MVT::i8 ||
32846 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32847 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32848 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32849 DAG.getConstant(0, DL, MVT::i8));
32850 // Copy the inverse of the parity flag into a register with setcc.
32851 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32852 // Extend to the original type.
32853 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32854 }
32855
32856 // If we have POPCNT, use the default expansion.
32857 if (Subtarget.hasPOPCNT())
32858 return SDValue();
32859
32860 if (VT == MVT::i64) {
32862 // Xor the high and low 32-bit halves together using a 32-bit operation.
32862 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32863 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32864 DAG.getConstant(32, DL, MVT::i8)));
32865 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32866 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32867 }
32868
32869 if (VT != MVT::i16) {
32870 // Xor the high and low 16-bits together using a 32-bit operation.
32871 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32872 DAG.getConstant(16, DL, MVT::i8));
32873 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32874 } else {
32875 // If the input is 16-bits, we need to extend to use an i32 shift below.
32876 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32877 }
32878
32879 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32880 // This should allow an h-reg to be used to save a shift.
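  // For a 32-bit input the result is roughly:
  //   x ^= x >> 16;       // fold the upper half into the lower half
  //   xor  al, ah         // 8-bit XOR of the remaining two bytes sets PF
  //   setnp result        // PF is set for an even popcount, so NP == parity
  // (illustrative sketch only; the exact instructions are picked later by
  // instruction selection).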
32881 SDValue Hi = DAG.getNode(
32882 ISD::TRUNCATE, DL, MVT::i8,
32883 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32884 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32885 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32886 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32887
32888 // Copy the inverse of the parity flag into a register with setcc.
32889 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32890 // Extend to the original type.
32891 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32892}
32893
32894static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32895 const X86Subtarget &Subtarget) {
32896 unsigned NewOpc = 0;
32897 switch (N->getOpcode()) {
32898 case ISD::ATOMIC_LOAD_ADD:
32899 NewOpc = X86ISD::LADD;
32900 break;
32901 case ISD::ATOMIC_LOAD_SUB:
32902 NewOpc = X86ISD::LSUB;
32903 break;
32904 case ISD::ATOMIC_LOAD_OR:
32905 NewOpc = X86ISD::LOR;
32906 break;
32907 case ISD::ATOMIC_LOAD_XOR:
32908 NewOpc = X86ISD::LXOR;
32909 break;
32910 case ISD::ATOMIC_LOAD_AND:
32911 NewOpc = X86ISD::LAND;
32912 break;
32913 default:
32914 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32915 }
32916
32917 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32918
32919 return DAG.getMemIntrinsicNode(
32920 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32921 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32922 /*MemVT=*/N->getSimpleValueType(0), MMO);
32923}
32924
32925/// Lower atomic_load_ops into LOCK-prefixed operations.
32926static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32927 const X86Subtarget &Subtarget) {
32928 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32929 SDValue Chain = N->getOperand(0);
32930 SDValue LHS = N->getOperand(1);
32931 SDValue RHS = N->getOperand(2);
32932 unsigned Opc = N->getOpcode();
32933 MVT VT = N->getSimpleValueType(0);
32934 SDLoc DL(N);
32935
32936 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32937 // can only be lowered when the result is unused. They should have already
32938 // been transformed into a cmpxchg loop in AtomicExpand.
32939 if (N->hasAnyUseOfValue(0)) {
32940 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32941 // select LXADD if LOCK_SUB can't be selected.
32942 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32943 // can use LXADD as opposed to cmpxchg.
32944 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32945 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32946 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32947 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32948
32949 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32950 "Used AtomicRMW ops other than Add should have been expanded!");
32951 return N;
32952 }
32953
32954 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32955 // The core idea here is that since the memory location isn't actually
32956 // changing, all we need is a lowering for the *ordering* impacts of the
32957 // atomicrmw. As such, we can choose a different operation and memory
32958 // location to minimize impact on other code.
32959 // The above holds unless the node is marked volatile in which
32960 // case it needs to be preserved according to the langref.
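  // For example, "atomicrmw or ptr %p, i32 0 seq_cst" does not change memory,
  // so only its ordering effect has to be honoured: a locked op on a dead
  // stack slot (below) or, for weaker orderings, a plain compiler barrier.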
32961 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32962 // On X86, the only ordering which actually requires an instruction is
32963 // seq_cst that isn't SingleThread; everything else just needs to be preserved
32964 // during codegen and then dropped. Note that we expect (but don't assume)
32965 // that orderings other than seq_cst and acq_rel have been canonicalized to
32966 // a store or load.
32967 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32968 AN->getSyncScopeID() == SyncScope::System) {
32969 // Prefer a locked operation against a stack location to minimize cache
32970 // traffic. This assumes that stack locations are very likely to be
32971 // accessed only by the owning thread.
32972 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32973 assert(!N->hasAnyUseOfValue(0));
32974 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32975 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32976 DAG.getUNDEF(VT), NewChain);
32977 }
32978 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32979 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32980 assert(!N->hasAnyUseOfValue(0));
32981 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32982 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32983 DAG.getUNDEF(VT), NewChain);
32984 }
32985
32986 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32987 // RAUW the chain, but don't worry about the result, as it's unused.
32988 assert(!N->hasAnyUseOfValue(0));
32989 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32990 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32991 DAG.getUNDEF(VT), LockOp.getValue(1));
32992}
32993
32994static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32995 const X86Subtarget &Subtarget) {
32996 auto *Node = cast<AtomicSDNode>(Op.getNode());
32997 SDLoc dl(Node);
32998 EVT VT = Node->getMemoryVT();
32999
33000 bool IsSeqCst =
33001 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33002 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33003
33004 // If this store is not sequentially consistent and the type is legal
33005 // we can just keep it.
33006 if (!IsSeqCst && IsTypeLegal)
33007 return Op;
33008
33009 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
33010 !DAG.getMachineFunction().getFunction().hasFnAttribute(
33011 Attribute::NoImplicitFloat)) {
33012 SDValue Chain;
33013 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33014 // vector store.
33015 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33016 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33017 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33018 Node->getMemOperand());
33019 }
33020
33021 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33022 // is enabled.
33023 if (VT == MVT::i64) {
33024 if (Subtarget.hasSSE1()) {
33025 SDValue SclToVec =
33026 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33027 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33028 SclToVec = DAG.getBitcast(StVT, SclToVec);
33029 SDVTList Tys = DAG.getVTList(MVT::Other);
33030 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33031 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33032 MVT::i64, Node->getMemOperand());
33033 } else if (Subtarget.hasX87()) {
33034 // First load this into an 80-bit X87 register using a stack temporary.
33035 // This will put the whole integer into the significand.
33036 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33037 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33038 MachinePointerInfo MPI =
33039 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33040 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33041 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33042 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33043 SDValue LdOps[] = {Chain, StackPtr};
33044 SDValue Value = DAG.getMemIntrinsicNode(
33045 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33046 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33047 Chain = Value.getValue(1);
33048
33049 // Now use an FIST to do the atomic store.
33050 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33051 Chain =
33052 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33053 StoreOps, MVT::i64, Node->getMemOperand());
33054 }
33055 }
33056
33057 if (Chain) {
33058 // If this is a sequentially consistent store, also emit an appropriate
33059 // barrier.
33060 if (IsSeqCst)
33061 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33062
33063 return Chain;
33064 }
33065 }
33066
33067 // Convert seq_cst store -> xchg
33068 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33069 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
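  // XCHG with a memory operand is implicitly locked, so the resulting swap
  // both performs the store and provides the seq_cst fence.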
33070 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33071 Node->getOperand(0), Node->getOperand(2),
33072 Node->getOperand(1), Node->getMemOperand());
33073 return Swap.getValue(1);
33074}
33075
33076static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33077 SDNode *N = Op.getNode();
33078 MVT VT = N->getSimpleValueType(0);
33079 unsigned Opc = Op.getOpcode();
33080
33081 // Let legalize expand this if it isn't a legal type yet.
33082 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33083 return SDValue();
33084
33085 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33086 SDLoc DL(N);
33087
33088 // Set the carry flag.
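  // Adding all-ones (-1) to the incoming carry value produces a hardware
  // carry-out exactly when that value is nonzero, which rematerializes CF for
  // the ADC/SBB emitted below.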
33089 SDValue Carry = Op.getOperand(2);
33090 EVT CarryVT = Carry.getValueType();
33091 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33092 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33093
33094 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33095 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33096 Op.getOperand(0), Op.getOperand(1),
33097 Carry.getValue(1));
33098
33099 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33100 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33101 Sum.getValue(1), DL, DAG);
33102 if (N->getValueType(1) == MVT::i1)
33103 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33104
33105 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33106}
33107
33108static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33109 SelectionDAG &DAG) {
33110 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33111
33112 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33113 // which returns the values as { float, float } (in XMM0) or
33114 // { double, double } (which is returned in XMM0, XMM1).
33115 SDLoc dl(Op);
33116 SDValue Arg = Op.getOperand(0);
33117 EVT ArgVT = Arg.getValueType();
33118 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33119
33120 TargetLowering::ArgListTy Args;
33121 Args.emplace_back(Arg, ArgTy);
33122
33123 bool isF64 = ArgVT == MVT::f64;
33124 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33125 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33126 // the results are returned via SRet in memory.
33127 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33128 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33129 const char *LibcallName = TLI.getLibcallName(LC);
33130 SDValue Callee =
33131 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33132
33133 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33134 : (Type *)FixedVectorType::get(ArgTy, 4);
33135
33136 TargetLowering::CallLoweringInfo CLI(DAG);
33137 CLI.setDebugLoc(dl)
33138 .setChain(DAG.getEntryNode())
33139 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33140
33141 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33142
33143 if (isF64)
33144 // Returned in xmm0 and xmm1.
33145 return CallResult.first;
33146
33147 // Returned in bits 0:31 and 32:63 of xmm0.
33148 SDValue SinVal =
33149 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33150 DAG.getVectorIdxConstant(0, dl));
33151 SDValue CosVal =
33152 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33153 DAG.getVectorIdxConstant(1, dl));
33154 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33155 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33156}
33157
33158/// Widen a vector input to a vector of NVT. The
33159/// input vector must have the same element type as NVT.
33160static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33161 bool FillWithZeroes = false) {
33162 // Check if InOp already has the right width.
33163 MVT InVT = InOp.getSimpleValueType();
33164 if (InVT == NVT)
33165 return InOp;
33166
33167 if (InOp.isUndef())
33168 return DAG.getUNDEF(NVT);
33169
33170 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33171 "input and widen element type must match");
33172
33173 unsigned InNumElts = InVT.getVectorNumElements();
33174 unsigned WidenNumElts = NVT.getVectorNumElements();
33175 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33176 "Unexpected request for vector widening");
33177
33178 SDLoc dl(InOp);
33179 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33180 SDValue N1 = InOp.getOperand(1);
33181 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33182 N1.isUndef()) {
33183 InOp = InOp.getOperand(0);
33184 InVT = InOp.getSimpleValueType();
33185 InNumElts = InVT.getVectorNumElements();
33186 }
33187 }
33188 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33189 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33190 EVT EltVT = InOp.getOperand(0).getValueType();
33191 SDValue FillVal =
33192 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33193 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_begin() + InNumElts);
33194 Ops.append(WidenNumElts - InNumElts, FillVal);
33195 return DAG.getBuildVector(NVT, dl, Ops);
33196 }
33197 SDValue FillVal =
33198 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33199 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33200 DAG.getVectorIdxConstant(0, dl));
33201}
33202
33203static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33204 SelectionDAG &DAG) {
33205 assert(Subtarget.hasAVX512() &&
33206 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33207
33208 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33209 SDValue Src = N->getValue();
33210 MVT VT = Src.getSimpleValueType();
33211 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33212 SDLoc dl(Op);
33213
33214 SDValue Scale = N->getScale();
33215 SDValue Index = N->getIndex();
33216 SDValue Mask = N->getMask();
33217 SDValue Chain = N->getChain();
33218 SDValue BasePtr = N->getBasePtr();
33219
33220 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33221 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33222 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33223 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33225 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33226 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33227 SDVTList VTs = DAG.getVTList(MVT::Other);
33228 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33229 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33230 N->getMemoryVT(), N->getMemOperand());
33231 }
33232 return SDValue();
33233 }
33234
33235 MVT IndexVT = Index.getSimpleValueType();
33236
33237 // If the index is v2i32, we're being called by type legalization and we
33238 // should just let the default handling take care of it.
33239 if (IndexVT == MVT::v2i32)
33240 return SDValue();
33241
33242 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33243 // need to widen until one is.
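  // For example, v4i32 data with a v4i64 index widens by a factor of
  // min(512/128, 512/256) = 2, giving v8i32 data, a 512-bit v8i64 index and
  // a v8i1 mask.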
33244 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33245 !Index.getSimpleValueType().is512BitVector()) {
33246 // Determine how much we need to widen by to get a 512-bit type.
33247 unsigned Factor = std::min(512/VT.getSizeInBits(),
33248 512/IndexVT.getSizeInBits());
33249 unsigned NumElts = VT.getVectorNumElements() * Factor;
33250
33251 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33252 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33253 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33254
33255 Src = ExtendToType(Src, VT, DAG);
33256 Index = ExtendToType(Index, IndexVT, DAG);
33257 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33258 }
33259
33260 SDVTList VTs = DAG.getVTList(MVT::Other);
33261 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33262 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33263 N->getMemoryVT(), N->getMemOperand());
33264}
33265
33266static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33267 SelectionDAG &DAG) {
33268
33269 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33270 MVT VT = Op.getSimpleValueType();
33271 MVT ScalarVT = VT.getScalarType();
33272 SDValue Mask = N->getMask();
33273 MVT MaskVT = Mask.getSimpleValueType();
33274 SDValue PassThru = N->getPassThru();
33275 SDLoc dl(Op);
33276
33277 // Handle AVX masked loads which don't support passthru other than 0.
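  // For a non-zero passthru, re-issue the load with a zero passthru and then
  // blend the loaded value with the original passthru under the same mask
  // (the VSELECT below).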
33278 if (MaskVT.getVectorElementType() != MVT::i1) {
33279 // We also allow undef in the isel pattern.
33280 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33281 return Op;
33282
33283 SDValue NewLoad = DAG.getMaskedLoad(
33284 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33285 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33286 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33287 N->isExpandingLoad());
33288 // Emit a blend.
33289 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33290 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33291 }
33292
33293 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33294 "Expanding masked load is supported on AVX-512 target only!");
33295
33296 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33297 "Expanding masked load is supported for 32 and 64-bit types only!");
33298
33299 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33300 "Cannot lower masked load op.");
33301
33302 assert((ScalarVT.getSizeInBits() >= 32 ||
33303 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33304 ScalarVT == MVT::f16))) &&
33305 "Unsupported masked load op.");
33306
33307 // This operation is legal for targets with VLX, but without
33308 // VLX the vector should be widened to 512 bits.
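  // e.g. a v8i32 masked load on AVX-512 without VLX is performed as a v16i32
  // masked load with a widened v16i1 mask; the low v8i32 half is extracted
  // afterwards.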
33309 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33310 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33311 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33312
33313 // Mask element has to be i1.
33314 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33315 "Unexpected mask type");
33316
33317 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33318
33319 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33320 SDValue NewLoad = DAG.getMaskedLoad(
33321 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33322 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33323 N->getExtensionType(), N->isExpandingLoad());
33324
33325 SDValue Extract =
33326 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33327 DAG.getVectorIdxConstant(0, dl));
33328 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33329 return DAG.getMergeValues(RetOps, dl);
33330}
33331
33332static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33333 SelectionDAG &DAG) {
33334 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33335 SDValue DataToStore = N->getValue();
33336 MVT VT = DataToStore.getSimpleValueType();
33337 MVT ScalarVT = VT.getScalarType();
33338 SDValue Mask = N->getMask();
33339 SDLoc dl(Op);
33340
33341 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33342 "Compressing masked store is supported on AVX-512 targets only!");
33343
33344 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33345 "Compressing masked store is supported for 32 and 64-bit types only!");
33346
33347 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33348 "Cannot lower masked store op.");
33349
33350 assert((ScalarVT.getSizeInBits() >= 32 ||
33351 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33352 ScalarVT == MVT::f16))) &&
33353 "Unsupported masked store op.");
33354
33355 // This operation is legal for targets with VLX, but without
33356 // VLX the vector should be widened to 512 bits.
33357 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33358 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33359
33360 // Mask element has to be i1.
33361 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33362 "Unexpected mask type");
33363
33364 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33365
33366 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33367 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33368 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33369 N->getOffset(), Mask, N->getMemoryVT(),
33370 N->getMemOperand(), N->getAddressingMode(),
33371 N->isTruncatingStore(), N->isCompressingStore());
33372}
33373
33374static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33375 SelectionDAG &DAG) {
33376 assert(Subtarget.hasAVX2() &&
33377 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33378
33379 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33380 SDLoc dl(Op);
33381 MVT VT = Op.getSimpleValueType();
33382 SDValue Index = N->getIndex();
33383 SDValue Mask = N->getMask();
33384 SDValue PassThru = N->getPassThru();
33385 MVT IndexVT = Index.getSimpleValueType();
33386
33387 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33388
33389 // If the index is v2i32, we're being called by type legalization.
33390 if (IndexVT == MVT::v2i32)
33391 return SDValue();
33392
33393 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33394 // need to widen until one is.
33395 MVT OrigVT = VT;
33396 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33397 !IndexVT.is512BitVector()) {
33398 // Determine how much we need to widen by to get a 512-bit type.
33399 unsigned Factor = std::min(512/VT.getSizeInBits(),
33400 512/IndexVT.getSizeInBits());
33401
33402 unsigned NumElts = VT.getVectorNumElements() * Factor;
33403
33404 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33405 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33406 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33407
33408 PassThru = ExtendToType(PassThru, VT, DAG);
33409 Index = ExtendToType(Index, IndexVT, DAG);
33410 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33411 }
33412
33413 // Break dependency on the data register.
33414 if (PassThru.isUndef())
33415 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33416
33417 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33418 N->getScale() };
33419 SDValue NewGather = DAG.getMemIntrinsicNode(
33420 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33421 N->getMemOperand());
33422 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33423 DAG.getVectorIdxConstant(0, dl));
33424 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33425}
33426
33427static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33428 SDLoc dl(Op);
33429 SDValue Src = Op.getOperand(0);
33430 MVT DstVT = Op.getSimpleValueType();
33431
33432 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33433 unsigned SrcAS = N->getSrcAddressSpace();
33434
33435 assert(SrcAS != N->getDestAddressSpace() &&
33436 "addrspacecast must be between different address spaces");
33437
33438 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33439 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33440 } else if (DstVT == MVT::i64) {
33441 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33442 } else if (DstVT == MVT::i32) {
33443 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33444 } else {
33445 report_fatal_error("Bad address space in addrspacecast");
33446 }
33447 return Op;
33448}
33449
33450SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33451 SelectionDAG &DAG) const {
33452 // TODO: Eventually, the lowering of these nodes should be informed by or
33453 // deferred to the GC strategy for the function in which they appear. For
33454 // now, however, they must be lowered to something. Since they are logically
33455 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33456 // require special handling for these nodes), lower them as literal NOOPs for
33457 // the time being.
33458 SmallVector<SDValue, 2> Ops;
33459 Ops.push_back(Op.getOperand(0));
33460 if (Op->getGluedNode())
33461 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33462
33463 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33464 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33465}
33466
33467// Custom split CVTPS2PH with wide types.
33468static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33469 SDLoc dl(Op);
33470 EVT VT = Op.getValueType();
33471 SDValue Lo, Hi;
33472 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33473 EVT LoVT, HiVT;
33474 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33475 SDValue RC = Op.getOperand(1);
33476 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33477 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33478 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33479}
33480
33481static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33482 SelectionDAG &DAG) {
33483 unsigned IsData = Op.getConstantOperandVal(4);
33484
33485 // We don't support non-data prefetch without PREFETCHI.
33486 // Just preserve the chain.
33487 if (!IsData && !Subtarget.hasPREFETCHI())
33488 return Op.getOperand(0);
33489
33490 return Op;
33491}
33492
33493static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33494 SDNode *N = Op.getNode();
33495 SDValue Operand = N->getOperand(0);
33496 EVT VT = Operand.getValueType();
33497 SDLoc dl(N);
33498
33499 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33500
33501 // TODO: Fix crash for bf16 when generating strict_fmul, as it
33502 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33503 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33504 // promote this operator's result!
33505 SDValue Chain = DAG.getEntryNode();
33506 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33507 {Chain, Operand, One});
33508 return StrictFmul;
33509}
33510
33511static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33512 unsigned OpNo) {
33513 const APInt Operand(32, OpNo);
33514 std::string OpNoStr = llvm::toString(Operand, 10, false);
33515 std::string Str(" $");
33516
33517 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33518 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33519
33520 auto I = StringRef::npos;
33521 for (auto &AsmStr : AsmStrs) {
33522 // Match the OpNo string exactly so that we do not match a mere
33523 // sub-string, e.g. "$12" contains "$1".
33524 if (AsmStr.ends_with(OpNoStr1))
33525 I = AsmStr.size() - OpNoStr1.size();
33526
33527 // Get the index of operand in AsmStr.
33528 if (I == StringRef::npos)
33529 I = AsmStr.find(OpNoStr1 + ",");
33530 if (I == StringRef::npos)
33531 I = AsmStr.find(OpNoStr2);
33532
33533 if (I == StringRef::npos)
33534 continue;
33535
33536 assert(I > 0 && "Unexpected inline asm string!");
33537 // Remove the operand string and label (if it exists).
33538 // For example:
33539 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33540 // ==>
33541 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33542 // ==>
33543 // "call dword ptr "
33544 auto TmpStr = AsmStr.substr(0, I);
33545 I = TmpStr.rfind(':');
33546 if (I != StringRef::npos)
33547 TmpStr = TmpStr.substr(I + 1);
33548 return TmpStr.take_while(llvm::isAlpha);
33549 }
33550
33551 return StringRef();
33552}
33553
33554bool X86TargetLowering::isInlineAsmTargetBranch(
33555 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33556 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33557 // changed from indirect TargetLowering::C_Memory to direct
33558 // TargetLowering::C_Address.
33559 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33560 // location.
33561 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33562 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33563}
33564
33565static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33566 SDValue Mask) {
33567 EVT Ty = MVT::i8;
33568 auto V = DAG.getBitcast(MVT::i1, Mask);
33569 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33570 auto Zero = DAG.getConstant(0, DL, Ty);
33571 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33572 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33573 return SDValue(CmpZero.getNode(), 1);
33574}
33575
33576SDValue X86TargetLowering::visitMaskedLoad(
33577 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33578 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33579 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33580 // ->
33581 // _, flags = SUB 0, mask
33582 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33583 // bit_cast_to_vector<res>
33584 EVT VTy = PassThru.getValueType();
33585 EVT Ty = VTy.getVectorElementType();
33586 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33587 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33588 : DAG.getBitcast(Ty, PassThru);
33589 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33590 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33591 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33592 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33593 return DAG.getBitcast(VTy, NewLoad);
33594}
33595
33596SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33597 SDValue Chain,
33598 MachineMemOperand *MMO, SDValue Ptr,
33599 SDValue Val, SDValue Mask) const {
33600 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33601 // ->
33602 // _, flags = SUB 0, mask
33603 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33604 EVT Ty = Val.getValueType().getVectorElementType();
33605 SDVTList Tys = DAG.getVTList(MVT::Other);
33606 auto ScalarVal = DAG.getBitcast(Ty, Val);
33607 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33608 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33609 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33610 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33611}
33612
33613/// Provide custom lowering hooks for some operations.
33614SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33615 switch (Op.getOpcode()) {
33616 // clang-format off
33617 default: llvm_unreachable("Should not custom lower this!");
33618 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33619 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33620 return LowerCMP_SWAP(Op, Subtarget, DAG);
33621 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33622 case ISD::ATOMIC_LOAD_ADD:
33623 case ISD::ATOMIC_LOAD_SUB:
33624 case ISD::ATOMIC_LOAD_OR:
33625 case ISD::ATOMIC_LOAD_XOR:
33626 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33627 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33628 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33629 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33630 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33631 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33632 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33633 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33634 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33635 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33636 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33637 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33638 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33639 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33640 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33641 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33642 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33643 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33644 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33645 case ISD::SHL_PARTS:
33646 case ISD::SRA_PARTS:
33647 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33648 case ISD::FSHL:
33649 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33650 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33651 case ISD::STRICT_SINT_TO_FP:
33652 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33653 case ISD::STRICT_UINT_TO_FP:
33654 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33655 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33656 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33657 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33658 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33659 case ISD::ZERO_EXTEND_VECTOR_INREG:
33660 case ISD::SIGN_EXTEND_VECTOR_INREG:
33661 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33662 case ISD::FP_TO_SINT:
33663 case ISD::STRICT_FP_TO_SINT:
33664 case ISD::FP_TO_UINT:
33665 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33666 case ISD::FP_TO_SINT_SAT:
33667 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33668 case ISD::FP_EXTEND:
33669 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33670 case ISD::FP_ROUND:
33671 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33672 case ISD::FP16_TO_FP:
33673 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33674 case ISD::FP_TO_FP16:
33675 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33676 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33677 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33678 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33679 case ISD::FADD:
33680 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33681 case ISD::FROUND: return LowerFROUND(Op, DAG);
33682 case ISD::FABS:
33683 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33684 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33685 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33686 case ISD::LRINT:
33687 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33688 case ISD::SETCC:
33689 case ISD::STRICT_FSETCC:
33690 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33691 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33692 case ISD::SELECT: return LowerSELECT(Op, DAG);
33693 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33694 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33695 case ISD::VASTART: return LowerVASTART(Op, DAG);
33696 case ISD::VAARG: return LowerVAARG(Op, DAG);
33697 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33698 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33699 case ISD::INTRINSIC_VOID:
33700 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33701 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33702 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33703 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33704 case ISD::FRAME_TO_ARGS_OFFSET:
33705 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33706 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33707 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33708 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33709 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33710 case ISD::EH_SJLJ_SETUP_DISPATCH:
33711 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33712 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33713 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33714 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33715 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33716 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33717 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33718 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33719 case ISD::CTLZ:
33720 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33721 case ISD::CTTZ:
33722 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33723 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33724 case ISD::MULHS:
33725 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33726 case ISD::ROTL:
33727 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33728 case ISD::SRA:
33729 case ISD::SRL:
33730 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33731 case ISD::SADDO:
33732 case ISD::UADDO:
33733 case ISD::SSUBO:
33734 case ISD::USUBO: return LowerXALUO(Op, DAG);
33735 case ISD::SMULO:
33736 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33737 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33738 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33739 case ISD::SADDO_CARRY:
33740 case ISD::SSUBO_CARRY:
33741 case ISD::UADDO_CARRY:
33742 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33743 case ISD::ADD:
33744 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33745 case ISD::UADDSAT:
33746 case ISD::SADDSAT:
33747 case ISD::USUBSAT:
33748 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33749 case ISD::SMAX:
33750 case ISD::SMIN:
33751 case ISD::UMAX:
33752 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33753 case ISD::FMINIMUM:
33754 case ISD::FMAXIMUM:
33755 case ISD::FMINIMUMNUM:
33756 case ISD::FMAXIMUMNUM:
33757 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33758 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33759 case ISD::ABDS:
33760 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33761 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33762 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33763 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33764 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33765 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33766 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33767 case ISD::GC_TRANSITION_START:
33768 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33769 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33770 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33771 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33772 // clang-format on
33773 }
33774}
33775
33776/// Replace a node with an illegal result type with a new node built out of
33777/// custom code.
33778void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33779 SmallVectorImpl<SDValue> &Results,
33780 SelectionDAG &DAG) const {
33781 SDLoc dl(N);
33782 unsigned Opc = N->getOpcode();
33783 switch (Opc) {
33784 default:
33785#ifndef NDEBUG
33786 dbgs() << "ReplaceNodeResults: ";
33787 N->dump(&DAG);
33788#endif
33789 llvm_unreachable("Do not know how to custom type legalize this operation!");
33790 case X86ISD::CVTPH2PS: {
33791 EVT VT = N->getValueType(0);
33792 SDValue Lo, Hi;
33793 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33794 EVT LoVT, HiVT;
33795 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33796 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33797 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33798 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33799 Results.push_back(Res);
33800 return;
33801 }
33802 case X86ISD::STRICT_CVTPH2PS: {
33803 EVT VT = N->getValueType(0);
33804 SDValue Lo, Hi;
33805 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33806 EVT LoVT, HiVT;
33807 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33808 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33809 {N->getOperand(0), Lo});
33810 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33811 {N->getOperand(0), Hi});
33812 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33813 Lo.getValue(1), Hi.getValue(1));
33814 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33815 Results.push_back(Res);
33816 Results.push_back(Chain);
33817 return;
33818 }
33819 case X86ISD::CVTPS2PH:
33820 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33821 return;
33822 case ISD::CTPOP: {
33823 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33824 // If we have at most 32 active bits, then perform as i32 CTPOP.
33825 // TODO: Perform this in generic legalizer?
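    // e.g. if the i64 operand has LZ + TZ >= 32 known zero bits, shift out the
    // trailing zeros, truncate to i32, take an i32 CTPOP and zero-extend the
    // count back to i64.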
33826 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33827 unsigned LZ = Known.countMinLeadingZeros();
33828 unsigned TZ = Known.countMinTrailingZeros();
33829 if ((LZ + TZ) >= 32) {
33830 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33831 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33832 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33833 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33834 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33835 Results.push_back(Op);
33836 return;
33837 }
33838 // Use a v2i64 if possible.
33839 bool NoImplicitFloatOps =
33840 DAG.getMachineFunction().getFunction().hasFnAttribute(
33841 Attribute::NoImplicitFloat);
33842 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33843 SDValue Wide =
33844 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33845 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33846 // Bit count should fit in 32-bits, extract it as that and then zero
33847 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33848 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33849 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33850 DAG.getVectorIdxConstant(0, dl));
33851 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33852 Results.push_back(Wide);
33853 }
33854 return;
33855 }
33856 case ISD::MUL: {
33857 EVT VT = N->getValueType(0);
33858 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33859 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33860 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33861 // elements are needed.
33862 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33863 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33864 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33865 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33866 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33867 unsigned NumConcats = 16 / VT.getVectorNumElements();
33868 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33869 ConcatOps[0] = Res;
33870 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33871 Results.push_back(Res);
33872 return;
33873 }
33874 case ISD::SMULO:
33875 case ISD::UMULO: {
33876 EVT VT = N->getValueType(0);
33877 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33878 VT == MVT::v2i32 && "Unexpected VT!");
33879 bool IsSigned = Opc == ISD::SMULO;
33880 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33881 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33882 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33883 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33884 // Extract the high 32 bits from each result using PSHUFD.
33885 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33886 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33887 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33888 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33889 DAG.getVectorIdxConstant(0, dl));
33890
33891 // Truncate the low bits of the result. This will become PSHUFD.
33892 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33893
33894 SDValue HiCmp;
33895 if (IsSigned) {
33896 // SMULO overflows if the high bits don't match the sign of the low.
33897 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33898 } else {
33899 // UMULO overflows if the high bits are non-zero.
33900 HiCmp = DAG.getConstant(0, dl, VT);
33901 }
33902 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33903
33904 // Widen the result by padding with undef.
33905 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33906 DAG.getUNDEF(VT));
33907 Results.push_back(Res);
33908 Results.push_back(Ovf);
33909 return;
33910 }
33911 case X86ISD::VPMADDWD: {
33912 // Legalize types for X86ISD::VPMADDWD by widening.
33913 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33914
33915 EVT VT = N->getValueType(0);
33916 EVT InVT = N->getOperand(0).getValueType();
33917 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33918 "Expected a VT that divides into 128 bits.");
33919 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33920 "Unexpected type action!");
33921 unsigned NumConcat = 128 / InVT.getSizeInBits();
33922
33923 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33924 InVT.getVectorElementType(),
33925 NumConcat * InVT.getVectorNumElements());
33926 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33927 VT.getVectorElementType(),
33928 NumConcat * VT.getVectorNumElements());
33929
33930 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33931 Ops[0] = N->getOperand(0);
33932 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33933 Ops[0] = N->getOperand(1);
33934 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33935
33936 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33937 Results.push_back(Res);
33938 return;
33939 }
33940 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33941 case X86ISD::FMINC:
33942 case X86ISD::FMIN:
33943 case X86ISD::FMAXC:
33944 case X86ISD::FMAX:
33945 case X86ISD::STRICT_FMIN:
33946 case X86ISD::STRICT_FMAX: {
33947 EVT VT = N->getValueType(0);
33948 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33949 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33950 SDValue UNDEF = DAG.getUNDEF(VT);
33951 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33952 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33953 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33954 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33955 SDValue Res;
33956 if (IsStrict)
33957 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33958 {N->getOperand(0), LHS, RHS});
33959 else
33960 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33961 Results.push_back(Res);
33962 if (IsStrict)
33963 Results.push_back(Res.getValue(1));
33964 return;
33965 }
33966 case ISD::SDIV:
33967 case ISD::UDIV:
33968 case ISD::SREM:
33969 case ISD::UREM: {
33970 EVT VT = N->getValueType(0);
33971 if (VT.isVector()) {
33972 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33973 "Unexpected type action!");
33974 // If this RHS is a constant splat vector we can widen this and let
33975 // division/remainder by constant optimize it.
33976 // TODO: Can we do something for non-splat?
33977 APInt SplatVal;
33978 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33979 unsigned NumConcats = 128 / VT.getSizeInBits();
33980 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33981 Ops0[0] = N->getOperand(0);
33982 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33983 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33984 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33985 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33986 Results.push_back(Res);
33987 }
33988 return;
33989 }
33990
33991 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33992 Results.push_back(V);
33993 return;
33994 }
33995 case ISD::TRUNCATE: {
33996 MVT VT = N->getSimpleValueType(0);
33997 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33998 return;
33999
34000 // The generic legalizer will try to widen the input type to the same
34001 // number of elements as the widened result type. But this isn't always
34002 // the best thing, so do some custom legalization to avoid some cases.
34003 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34004 SDValue In = N->getOperand(0);
34005 EVT InVT = In.getValueType();
34006 EVT InEltVT = InVT.getVectorElementType();
34007 EVT EltVT = VT.getVectorElementType();
34008 unsigned MinElts = VT.getVectorNumElements();
34009 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34010 unsigned InBits = InVT.getSizeInBits();
34011
34012 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
34013 unsigned PackOpcode;
34014 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34015 Subtarget, N->getFlags())) {
34016 if (SDValue Res =
34017 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34018 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34019 Results.push_back(Res);
34020 return;
34021 }
34022 }
34023
34024 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34025 // 128-bit and smaller inputs should avoid the truncate altogether and
34026 // use a shuffle instead.
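      // e.g. a v4i32 -> v4i8 truncate becomes a v16i8 shuffle that selects
      // bytes {0, 4, 8, 12} of the bitcast input and leaves the rest undef.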
34027 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34028 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34029 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34030 for (unsigned I = 0; I < MinElts; ++I)
34031 TruncMask[I] = Scale * I;
34032 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34033 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34034 "Illegal vector type in truncation");
34035 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34036 Results.push_back(
34037 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34038 return;
34039 }
34040 }
34041
34042 // With AVX512 there are some cases that can use a target specific
34043 // truncate node to go from 256/512 to less than 128 with zeros in the
34044 // upper elements of the 128 bit result.
34045 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34046 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34047 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34048 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34049 return;
34050 }
34051 // There's one case we can widen to 512 bits and use VTRUNC.
34052 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34053 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34054 DAG.getUNDEF(MVT::v4i64));
34055 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34056 return;
34057 }
34058 }
34059 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34060 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34061 isTypeLegal(MVT::v4i64)) {
34062 // Input needs to be split and the output needs to be widened. Let's use two
34063 // VTRUNCs, and shuffle their results together into the wider type.
34064 SDValue Lo, Hi;
34065 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34066
34067 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34068 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34069 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34070 { 0, 1, 2, 3, 16, 17, 18, 19,
34071 -1, -1, -1, -1, -1, -1, -1, -1 });
34072 Results.push_back(Res);
34073 return;
34074 }
34075
34076 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34077 // this via type legalization.
34078 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34079 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34080 (!Subtarget.hasSSSE3() ||
34081 (!isTypeLegal(InVT) &&
34082 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34083 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34084 InEltVT.getSizeInBits() * WidenNumElts);
34085 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34086 return;
34087 }
34088
34089 return;
34090 }
34091 case ISD::ANY_EXTEND:
34092 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34093 // It's intended to custom handle the input type.
34094 assert(N->getValueType(0) == MVT::v8i8 &&
34095 "Do not know how to legalize this Node");
34096 return;
34097 case ISD::SIGN_EXTEND:
34098 case ISD::ZERO_EXTEND: {
34099 EVT VT = N->getValueType(0);
34100 SDValue In = N->getOperand(0);
34101 EVT InVT = In.getValueType();
34102 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34103 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34104 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34105 "Unexpected type action!");
34106 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34107 // Custom split this so we can extend i8/i16->i32 invec. This is better
34108 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
34109 // followed by an extend from i32 to i64 using pcmpgt. By custom splitting
34110 // we allow the sra from the extend to i32 to be shared by the split.
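      // Concretely: sign-extend once to v4i32, compute the sign bits with a
      // compare against zero, interleave value/sign lanes with unpckl/unpckh,
      // and bitcast the two halves to v2i64 before concatenating them below.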
34111 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34112
34113 // Fill a vector with sign bits for each element.
34114 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34115 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34116
34117 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34118 // to v2i64.
34119 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34120 {0, 4, 1, 5});
34121 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34122 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34123 {2, 6, 3, 7});
34124 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34125
34126 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34127 Results.push_back(Res);
34128 return;
34129 }
34130
34131 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34132 if (!InVT.is128BitVector()) {
34133 // Not a 128 bit vector, but maybe type legalization will promote
34134 // it to 128 bits.
34135 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34136 return;
34137 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34138 if (!InVT.is128BitVector())
34139 return;
34140
34141 // Promote the input to 128 bits. Type legalization will turn this into
34142 // zext_inreg/sext_inreg.
34143 In = DAG.getNode(Opc, dl, InVT, In);
34144 }
34145
34146 // Perform custom splitting instead of the two stage extend we would get
34147 // by default.
34148 EVT LoVT, HiVT;
34149 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34150 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34151
34152 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34153
34154 // We need to shift the input over by half the number of elements.
34155 unsigned NumElts = InVT.getVectorNumElements();
34156 unsigned HalfNumElts = NumElts / 2;
34157 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34158 for (unsigned i = 0; i != HalfNumElts; ++i)
34159 ShufMask[i] = i + HalfNumElts;
34160
34161 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34162 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34163
34164 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34165 Results.push_back(Res);
34166 }
34167 return;
34168 }
34169 case ISD::FP_TO_SINT_SAT:
34170 case ISD::FP_TO_UINT_SAT: {
34171 if (!Subtarget.hasAVX10_2())
34172 return;
34173
34174 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34175 EVT VT = N->getValueType(0);
34176 SDValue Op = N->getOperand(0);
34177 EVT OpVT = Op.getValueType();
34178 SDValue Res;
34179
34180 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34181 if (IsSigned)
34182 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34183 else
34184 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34185 Results.push_back(Res);
34186 }
34187 return;
34188 }
34189 case ISD::FP_TO_SINT:
34190 case ISD::STRICT_FP_TO_SINT:
34191 case ISD::FP_TO_UINT:
34192 case ISD::STRICT_FP_TO_UINT: {
34193 bool IsStrict = N->isStrictFPOpcode();
34194 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34195 EVT VT = N->getValueType(0);
34196 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34197 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34198 EVT SrcVT = Src.getValueType();
34199
34200 SDValue Res;
34201 if (isSoftF16(SrcVT, Subtarget)) {
34202 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34203 if (IsStrict) {
34204 Res =
34205 DAG.getNode(Opc, dl, {VT, MVT::Other},
34206 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34207 {NVT, MVT::Other}, {Chain, Src})});
34208 Chain = Res.getValue(1);
34209 } else {
34210 Res =
34211 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34212 }
34213 Results.push_back(Res);
34214 if (IsStrict)
34215 Results.push_back(Chain);
34216
34217 return;
34218 }
34219
34220 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34221 SrcVT.getVectorElementType() == MVT::f16) {
34222 EVT EleVT = VT.getVectorElementType();
34223 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34224
34225 if (SrcVT != MVT::v8f16) {
34226 SDValue Tmp =
34227 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34228 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34229 Ops[0] = Src;
34230 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34231 }
34232
34233 if (IsStrict) {
34234 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34235 Res =
34236 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34237 Chain = Res.getValue(1);
34238 } else {
34239 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34240 Res = DAG.getNode(Opc, dl, ResVT, Src);
34241 }
34242
34243 // TODO: Need to add exception check code for strict FP.
34244 if (EleVT.getSizeInBits() < 16) {
34245 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34246 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34247
34248 // Now widen to 128 bits.
34249 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34250 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34251 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34252 ConcatOps[0] = Res;
34253 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34254 }
34255
34256 Results.push_back(Res);
34257 if (IsStrict)
34258 Results.push_back(Chain);
34259
34260 return;
34261 }
34262
34263 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34265 "Unexpected type action!");
34266
34267 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34268 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34269 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34270 VT.getVectorNumElements());
34271 SDValue Res;
34272 SDValue Chain;
34273 if (IsStrict) {
34274 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34275 {N->getOperand(0), Src});
34276 Chain = Res.getValue(1);
34277 } else
34278 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34279
34280 // Preserve what we know about the size of the original result. If the
34281 // result is v2i32, we have to manually widen the assert.
34282 if (PromoteVT == MVT::v2i32)
34283 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34284 DAG.getUNDEF(MVT::v2i32));
34285
34286 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34287 Res.getValueType(), Res,
34288 DAG.getValueType(VT.getVectorElementType()));
34289
34290 if (PromoteVT == MVT::v2i32)
34291 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34292 DAG.getVectorIdxConstant(0, dl));
34293
34294 // Truncate back to the original width.
34295 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34296
34297 // Now widen to 128 bits.
34298 unsigned NumConcats = 128 / VT.getSizeInBits();
34299 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34300 VT.getVectorNumElements() * NumConcats);
34301 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34302 ConcatOps[0] = Res;
34303 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34304 Results.push_back(Res);
34305 if (IsStrict)
34306 Results.push_back(Chain);
34307 return;
34308 }
34309
34310
34311 if (VT == MVT::v2i32) {
34312 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34313 "Strict unsigned conversion requires AVX512");
34314 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34316 "Unexpected type action!");
34317 if (Src.getValueType() == MVT::v2f64) {
34318 if (!IsSigned && !Subtarget.hasAVX512()) {
34319 SDValue Res =
34320 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34321 Results.push_back(Res);
34322 return;
34323 }
34324
34325 if (IsStrict)
34326 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34327 else
34328 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34329
34330 // If we have VLX we can emit a target specific FP_TO_UINT node.
34331 if (!IsSigned && !Subtarget.hasVLX()) {
34332 // Otherwise we can defer to the generic legalizer which will widen
34333 // the input as well. This will be further widened during op
34334 // legalization to v8i32<-v8f64.
34335 // For strict nodes we'll need to widen ourselves.
34336 // FIXME: Fix the type legalizer to safely widen strict nodes?
34337 if (!IsStrict)
34338 return;
34339 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34340 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34341 Opc = N->getOpcode();
34342 }
34343 SDValue Res;
34344 SDValue Chain;
34345 if (IsStrict) {
34346 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34347 {N->getOperand(0), Src});
34348 Chain = Res.getValue(1);
34349 } else {
34350 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34351 }
34352 Results.push_back(Res);
34353 if (IsStrict)
34354 Results.push_back(Chain);
34355 return;
34356 }
34357
34358 // Custom widen strict v2f32->v2i32 by padding with zeros.
34359 // FIXME: Should generic type legalizer do this?
34360 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34361 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34362 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34363 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34364 {N->getOperand(0), Src});
34365 Results.push_back(Res);
34366 Results.push_back(Res.getValue(1));
34367 return;
34368 }
34369
34370 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34371 // so early out here.
34372 return;
34373 }
34374
34375 assert(!VT.isVector() && "Vectors should have been handled above!");
34376
34377 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34378 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34379 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34380 assert(!Subtarget.is64Bit() && "i64 should be legal");
34381 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34382 // If we use a 128-bit result we might need to use a target specific node.
34383 unsigned SrcElts =
34384 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34385 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34386 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34387 if (NumElts != SrcElts) {
34388 if (IsStrict)
34389 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34390 else
34391 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34392 }
34393
34394 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34395 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34396 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34397 ZeroIdx);
34398 SDValue Chain;
34399 if (IsStrict) {
34400 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34401 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34402 Chain = Res.getValue(1);
34403 } else
34404 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34405 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34406 Results.push_back(Res);
34407 if (IsStrict)
34408 Results.push_back(Chain);
34409 return;
34410 }
34411
34412 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34413 SDValue Chain;
34414 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34415 Results.push_back(V);
34416 if (IsStrict)
34417 Results.push_back(Chain);
34418 return;
34419 }
34420
34421 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34422 Results.push_back(V);
34423 if (IsStrict)
34424 Results.push_back(Chain);
34425 }
34426 return;
34427 }
34428 case ISD::LRINT:
34429 if (N->getValueType(0) == MVT::v2i32) {
34430 SDValue Src = N->getOperand(0);
34431 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34432 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34433 DAG.getUNDEF(MVT::v2f16));
34434 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34435 DAG.getUNDEF(MVT::v4f16));
34436 } else if (Src.getValueType() != MVT::v2f64) {
34437 return;
34438 }
34439 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34440 return;
34441 }
34442 [[fallthrough]];
34443 case ISD::LLRINT: {
34444 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34445 Results.push_back(V);
34446 return;
34447 }
34448
34449 case ISD::SINT_TO_FP:
34450 case ISD::STRICT_SINT_TO_FP:
34451 case ISD::UINT_TO_FP:
34452 case ISD::STRICT_UINT_TO_FP: {
34453 bool IsStrict = N->isStrictFPOpcode();
34454 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34455 EVT VT = N->getValueType(0);
34456 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34457 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34458 Subtarget.hasVLX()) {
34459 if (Src.getValueType().getVectorElementType() == MVT::i16)
34460 return;
34461
34462 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34463 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34464 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34465 : DAG.getUNDEF(MVT::v2i32));
34466 if (IsStrict) {
34467 unsigned Opc =
34468 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34469 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34470 {N->getOperand(0), Src});
34471 Results.push_back(Res);
34472 Results.push_back(Res.getValue(1));
34473 } else {
34474 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34475 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34476 }
34477 return;
34478 }
34479 if (VT != MVT::v2f32)
34480 return;
34481 EVT SrcVT = Src.getValueType();
34482 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34483 if (IsStrict) {
34484 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34485 : X86ISD::STRICT_CVTUI2P;
34486 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34487 {N->getOperand(0), Src});
34488 Results.push_back(Res);
34489 Results.push_back(Res.getValue(1));
34490 } else {
34491 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34492 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34493 }
34494 return;
34495 }
34496 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34497 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34498 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34499 SDValue One = DAG.getConstant(1, dl, SrcVT);
34500 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34501 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34502 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34503 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34504 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34505 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34506 for (int i = 0; i != 2; ++i) {
34507 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34508 SignSrc, DAG.getVectorIdxConstant(i, dl));
34509 if (IsStrict)
34510 SignCvts[i] =
34511 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34512 {N->getOperand(0), Elt});
34513 else
34514 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34515 };
34516 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34517 SDValue Slow, Chain;
34518 if (IsStrict) {
34519 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34520 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34521 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34522 {Chain, SignCvt, SignCvt});
34523 Chain = Slow.getValue(1);
34524 } else {
34525 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34526 }
34527 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34528 IsNeg =
34529 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34530 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34531 Results.push_back(Cvt);
34532 if (IsStrict)
34533 Results.push_back(Chain);
34534 return;
34535 }
34536
34537 if (SrcVT != MVT::v2i32)
34538 return;
34539
34540 if (IsSigned || Subtarget.hasAVX512()) {
34541 if (!IsStrict)
34542 return;
34543
34544 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34545 // FIXME: Should generic type legalizer do this?
34546 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34547 DAG.getConstant(0, dl, MVT::v2i32));
34548 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34549 {N->getOperand(0), Src});
34550 Results.push_back(Res);
34551 Results.push_back(Res.getValue(1));
34552 return;
34553 }
34554
34555 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34556 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34557 SDValue VBias = DAG.getConstantFP(
34558 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
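// 0x4330000000000000 is the IEEE-754 encoding of 2^52. OR-ing the
// zero-extended 32-bit value into the low mantissa bits of 2^52 produces the
// exact double 2^52 + x, so the subtraction below recovers x converted to
// f64 with no rounding.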
34559 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34560 DAG.getBitcast(MVT::v2i64, VBias));
34561 Or = DAG.getBitcast(MVT::v2f64, Or);
34562 if (IsStrict) {
34563 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34564 {N->getOperand(0), Or, VBias});
34565 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34566 {MVT::v4f32, MVT::Other},
34567 {Sub.getValue(1), Sub});
34568 Results.push_back(Res);
34569 Results.push_back(Res.getValue(1));
34570 } else {
34571 // TODO: Are there any fast-math-flags to propagate here?
34572 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34573 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34574 }
34575 return;
34576 }
34577 case ISD::STRICT_FP_ROUND:
34578 case ISD::FP_ROUND: {
34579 bool IsStrict = N->isStrictFPOpcode();
34580 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34581 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34582 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34583 EVT SrcVT = Src.getValueType();
34584 EVT VT = N->getValueType(0);
34585 SDValue V;
34586 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34587 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34588 : DAG.getUNDEF(MVT::v2f32);
34589 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34590 }
34591 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34592 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34593 if (SrcVT.getVectorElementType() != MVT::f32)
34594 return;
34595
34596 if (IsStrict)
34597 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34598 {Chain, Src, Rnd});
34599 else
34600 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34601
34602 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34603 if (IsStrict)
34604 Results.push_back(V.getValue(1));
34605 return;
34606 }
34607 if (!isTypeLegal(Src.getValueType()))
34608 return;
34609 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34610 if (IsStrict)
34611 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34612 {Chain, Src});
34613 else
34614 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34615 Results.push_back(V);
34616 if (IsStrict)
34617 Results.push_back(V.getValue(1));
34618 return;
34619 }
34620 case ISD::FP_EXTEND:
34621 case ISD::STRICT_FP_EXTEND: {
34622 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34623 // No other ValueType for FP_EXTEND should reach this point.
34624 assert(N->getValueType(0) == MVT::v2f32 &&
34625 "Do not know how to legalize this Node");
34626 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34627 return;
34628 bool IsStrict = N->isStrictFPOpcode();
34629 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34630 if (Src.getValueType().getVectorElementType() != MVT::f16)
34631 return;
34632 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34633 : DAG.getUNDEF(MVT::v2f16);
34634 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34635 if (IsStrict)
34636 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34637 {N->getOperand(0), V});
34638 else
34639 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34640 Results.push_back(V);
34641 if (IsStrict)
34642 Results.push_back(V.getValue(1));
34643 return;
34644 }
34645 case ISD::INTRINSIC_W_CHAIN: {
34646 unsigned IntNo = N->getConstantOperandVal(1);
34647 switch (IntNo) {
34648 default : llvm_unreachable("Do not know how to custom type "
34649 "legalize this intrinsic operation!");
34650 case Intrinsic::x86_rdtsc:
34651 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34652 Results);
34653 case Intrinsic::x86_rdtscp:
34654 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34655 Results);
34656 case Intrinsic::x86_rdpmc:
34657 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34658 Results);
34659 return;
34660 case Intrinsic::x86_rdpru:
34661 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34662 Results);
34663 return;
34664 case Intrinsic::x86_xgetbv:
34665 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34666 Results);
34667 return;
34668 }
34669 }
34670 case ISD::READCYCLECOUNTER: {
34671 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34672 }
34673 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34674 EVT T = N->getValueType(0);
34675 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34676 bool Regs64bit = T == MVT::i128;
34677 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34678 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34679 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
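// CMPXCHG8B compares EDX:EAX with the memory operand; on a match it stores
// ECX:EBX and sets ZF, otherwise it loads the memory value into EDX:EAX and
// clears ZF. CMPXCHG16B behaves the same with RDX:RAX / RCX:RBX. The
// CopyToReg/CopyFromReg chains below wire up those fixed registers.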
34680 SDValue cpInL, cpInH;
34681 std::tie(cpInL, cpInH) =
34682 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34683 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34684 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34685 cpInH =
34686 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34687 cpInH, cpInL.getValue(1));
34688 SDValue swapInL, swapInH;
34689 std::tie(swapInL, swapInH) =
34690 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34691 swapInH =
34692 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34693 swapInH, cpInH.getValue(1));
34694
34695 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34696 // until later. So we keep the RBX input in a vreg and use a custom
34697 // inserter.
34698 // Since RBX will be a reserved register the register allocator will not
34699 // make sure its value will be properly saved and restored around this
34700 // live-range.
34701 SDValue Result;
34702 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34703 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34704 if (Regs64bit) {
34705 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34706 swapInH.getValue(1)};
34707 Result =
34708 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34709 } else {
34710 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34711 swapInH.getValue(1));
34712 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34713 swapInL.getValue(1)};
34714 Result =
34715 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34716 }
34717
34718 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34719 Regs64bit ? X86::RAX : X86::EAX,
34720 HalfT, Result.getValue(1));
34721 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34722 Regs64bit ? X86::RDX : X86::EDX,
34723 HalfT, cpOutL.getValue(2));
34724 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34725
34726 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34727 MVT::i32, cpOutH.getValue(2));
34728 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34729 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34730
34731 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34732 Results.push_back(Success);
34733 Results.push_back(EFLAGS.getValue(1));
34734 return;
34735 }
34736 case ISD::ATOMIC_LOAD: {
34737 assert(
34738 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34739 "Unexpected VT!");
34740 bool NoImplicitFloatOps =
34741 DAG.getMachineFunction().getFunction().hasFnAttribute(
34742 Attribute::NoImplicitFloat);
34743 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34744 auto *Node = cast<AtomicSDNode>(N);
34745
34746 if (N->getValueType(0) == MVT::i128) {
34747 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34748 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34749 Node->getBasePtr(), Node->getMemOperand());
34750 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34751 DAG.getVectorIdxConstant(0, dl));
34752 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34753 DAG.getVectorIdxConstant(1, dl));
34754 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34755 {ResL, ResH}));
34756 Results.push_back(Ld.getValue(1));
34757 return;
34758 }
34759 break;
34760 }
34761 if (Subtarget.hasSSE1()) {
34762 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34763 // Then extract the lower 64-bits.
34764 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34765 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34766 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34767 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34768 MVT::i64, Node->getMemOperand());
34769 if (Subtarget.hasSSE2()) {
34770 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34771 DAG.getVectorIdxConstant(0, dl));
34772 Results.push_back(Res);
34773 Results.push_back(Ld.getValue(1));
34774 return;
34775 }
34776 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34777 // then casts to i64. This avoids a 128-bit stack temporary being
34778 // created by type legalization if we were to cast v4f32->v2i64.
34779 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34780 DAG.getVectorIdxConstant(0, dl));
34781 Res = DAG.getBitcast(MVT::i64, Res);
34782 Results.push_back(Res);
34783 Results.push_back(Ld.getValue(1));
34784 return;
34785 }
34786 if (Subtarget.hasX87()) {
34787 // First load this into an 80-bit X87 register. This will put the whole
34788 // integer into the significand.
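// The x87 80-bit format has a 64-bit significand, so a full i64 loaded with
// FILD is represented exactly and can be stored back without rounding.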
34789 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34790 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34791 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34792 dl, Tys, Ops, MVT::i64,
34793 Node->getMemOperand());
34794 SDValue Chain = Result.getValue(1);
34795
34796 // Now store the X87 register to a stack temporary and convert to i64.
34797 // This store is not atomic and doesn't need to be.
34798 // FIXME: We don't need a stack temporary if the result of the load
34799 // is already being stored. We could just directly store there.
34800 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34801 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34802 MachinePointerInfo MPI =
34803 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34804 SDValue StoreOps[] = { Chain, Result, StackPtr };
34805 Chain = DAG.getMemIntrinsicNode(
34806 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34807 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34808
34809 // Finally load the value back from the stack temporary and return it.
34810 // This load is not atomic and doesn't need to be.
34811 // This load will be further type legalized.
34812 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34813 Results.push_back(Result);
34814 Results.push_back(Result.getValue(1));
34815 return;
34816 }
34817 }
34818 // TODO: Use MOVLPS when SSE1 is available?
34819 // Delegate to generic TypeLegalization. Situations we can really handle
34820 // should have already been dealt with by AtomicExpandPass.cpp.
34821 break;
34822 }
34823 case ISD::ATOMIC_SWAP:
34824 case ISD::ATOMIC_LOAD_ADD:
34825 case ISD::ATOMIC_LOAD_SUB:
34826 case ISD::ATOMIC_LOAD_AND:
34827 case ISD::ATOMIC_LOAD_OR:
34828 case ISD::ATOMIC_LOAD_XOR:
34829 case ISD::ATOMIC_LOAD_NAND:
34830 case ISD::ATOMIC_LOAD_MIN:
34831 case ISD::ATOMIC_LOAD_MAX:
34832 case ISD::ATOMIC_LOAD_UMIN:
34833 case ISD::ATOMIC_LOAD_UMAX:
34834 // Delegate to generic TypeLegalization. Situations we can really handle
34835 // should have already been dealt with by AtomicExpandPass.cpp.
34836 break;
34837
34838 case ISD::BITCAST: {
34839 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34840 EVT DstVT = N->getValueType(0);
34841 EVT SrcVT = N->getOperand(0).getValueType();
34842
34843 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34844 // we can split using the k-register rather than memory.
34845 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34846 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34847 SDValue Lo, Hi;
34848 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34849 Lo = DAG.getBitcast(MVT::i32, Lo);
34850 Hi = DAG.getBitcast(MVT::i32, Hi);
34851 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34852 Results.push_back(Res);
34853 return;
34854 }
34855
34856 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34857 // FIXME: Use v4f32 for SSE1?
34858 assert(Subtarget.hasSSE2() && "Requires SSE2");
34859 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34860 "Unexpected type action!");
34861 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34862 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34863 N->getOperand(0));
34864 Res = DAG.getBitcast(WideVT, Res);
34865 Results.push_back(Res);
34866 return;
34867 }
34868
34869 return;
34870 }
34871 case ISD::MGATHER: {
34872 EVT VT = N->getValueType(0);
34873 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34874 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34875 auto *Gather = cast<MaskedGatherSDNode>(N);
34876 SDValue Index = Gather->getIndex();
34877 if (Index.getValueType() != MVT::v2i64)
34878 return;
34880 "Unexpected type action!");
34881 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34882 SDValue Mask = Gather->getMask();
34883 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34884 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34885 Gather->getPassThru(),
34886 DAG.getUNDEF(VT));
34887 if (!Subtarget.hasVLX()) {
34888 // We need to widen the mask, but the instruction will only use 2
34889 // of its elements. So we can use undef.
34890 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34891 DAG.getUNDEF(MVT::v2i1));
34892 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34893 }
34894 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34895 Gather->getBasePtr(), Index, Gather->getScale() };
34896 SDValue Res = DAG.getMemIntrinsicNode(
34897 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34898 Gather->getMemoryVT(), Gather->getMemOperand());
34899 Results.push_back(Res);
34900 Results.push_back(Res.getValue(1));
34901 return;
34902 }
34903 return;
34904 }
34905 case ISD::LOAD: {
34906 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34907 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34908 // cast since type legalization will try to use an i64 load.
34909 MVT VT = N->getSimpleValueType(0);
34910 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34912 "Unexpected type action!");
34913 if (!ISD::isNON_EXTLoad(N))
34914 return;
34915 auto *Ld = cast<LoadSDNode>(N);
34916 if (Subtarget.hasSSE2()) {
34917 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34918 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34919 Ld->getPointerInfo(), Ld->getBaseAlign(),
34920 Ld->getMemOperand()->getFlags());
34921 SDValue Chain = Res.getValue(1);
34922 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34923 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34924 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34925 Res = DAG.getBitcast(WideVT, Res);
34926 Results.push_back(Res);
34927 Results.push_back(Chain);
34928 return;
34929 }
34930 assert(Subtarget.hasSSE1() && "Expected SSE");
34931 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34932 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34933 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34934 MVT::i64, Ld->getMemOperand());
34935 Results.push_back(Res);
34936 Results.push_back(Res.getValue(1));
34937 return;
34938 }
34939 case ISD::ADDRSPACECAST: {
34940 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34941 Results.push_back(V);
34942 return;
34943 }
34944 case ISD::BITREVERSE: {
34945 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34946 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34947 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34948 // We'll need to move the scalar in two i32 pieces.
34949 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34950 return;
34951 }
34952 case ISD::EXTRACT_VECTOR_ELT: {
34953 // f16 = extract vXf16 %vec, i64 %idx
34954 assert(N->getSimpleValueType(0) == MVT::f16 &&
34955 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34956 assert(Subtarget.hasFP16() && "Expected FP16");
34957 SDValue VecOp = N->getOperand(0);
34958 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34959 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34960 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34961 N->getOperand(1));
34962 Split = DAG.getBitcast(MVT::f16, Split);
34963 Results.push_back(Split);
34964 return;
34965 }
34966 }
34967}
34968
34969const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34970 switch ((X86ISD::NodeType)Opcode) {
34971 case X86ISD::FIRST_NUMBER: break;
34972#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34973 NODE_NAME_CASE(BSF)
34974 NODE_NAME_CASE(BSR)
34975 NODE_NAME_CASE(FSHL)
34976 NODE_NAME_CASE(FSHR)
34977 NODE_NAME_CASE(FAND)
34978 NODE_NAME_CASE(FANDN)
34979 NODE_NAME_CASE(FOR)
34980 NODE_NAME_CASE(FXOR)
34981 NODE_NAME_CASE(FILD)
34982 NODE_NAME_CASE(FIST)
34983 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34984 NODE_NAME_CASE(FLD)
34985 NODE_NAME_CASE(FST)
34986 NODE_NAME_CASE(CALL)
34987 NODE_NAME_CASE(CALL_RVMARKER)
34988 NODE_NAME_CASE(IMP_CALL)
34989 NODE_NAME_CASE(BT)
34990 NODE_NAME_CASE(CMP)
34991 NODE_NAME_CASE(FCMP)
34992 NODE_NAME_CASE(STRICT_FCMP)
34993 NODE_NAME_CASE(STRICT_FCMPS)
34994 NODE_NAME_CASE(COMI)
34995 NODE_NAME_CASE(UCOMI)
34996 NODE_NAME_CASE(COMX)
34997 NODE_NAME_CASE(UCOMX)
34998 NODE_NAME_CASE(CMPM)
34999 NODE_NAME_CASE(CMPMM)
35000 NODE_NAME_CASE(STRICT_CMPM)
35001 NODE_NAME_CASE(CMPMM_SAE)
35002 NODE_NAME_CASE(SETCC)
35003 NODE_NAME_CASE(SETCC_CARRY)
35004 NODE_NAME_CASE(FSETCC)
35005 NODE_NAME_CASE(FSETCCM)
35006 NODE_NAME_CASE(FSETCCM_SAE)
35007 NODE_NAME_CASE(CMOV)
35008 NODE_NAME_CASE(BRCOND)
35009 NODE_NAME_CASE(RET_GLUE)
35010 NODE_NAME_CASE(IRET)
35011 NODE_NAME_CASE(REP_STOS)
35012 NODE_NAME_CASE(REP_MOVS)
35013 NODE_NAME_CASE(GlobalBaseReg)
35014 NODE_NAME_CASE(Wrapper)
35015 NODE_NAME_CASE(WrapperRIP)
35016 NODE_NAME_CASE(MOVQ2DQ)
35017 NODE_NAME_CASE(MOVDQ2Q)
35018 NODE_NAME_CASE(MMX_MOVD2W)
35019 NODE_NAME_CASE(MMX_MOVW2D)
35020 NODE_NAME_CASE(PEXTRB)
35021 NODE_NAME_CASE(PEXTRW)
35022 NODE_NAME_CASE(INSERTPS)
35023 NODE_NAME_CASE(PINSRB)
35024 NODE_NAME_CASE(PINSRW)
35025 NODE_NAME_CASE(PSHUFB)
35026 NODE_NAME_CASE(ANDNP)
35027 NODE_NAME_CASE(BLENDI)
35028 NODE_NAME_CASE(BLENDV)
35029 NODE_NAME_CASE(HADD)
35030 NODE_NAME_CASE(HSUB)
35031 NODE_NAME_CASE(FHADD)
35032 NODE_NAME_CASE(FHSUB)
35033 NODE_NAME_CASE(CONFLICT)
35034 NODE_NAME_CASE(FMAX)
35035 NODE_NAME_CASE(FMAXS)
35036 NODE_NAME_CASE(FMAX_SAE)
35037 NODE_NAME_CASE(FMAXS_SAE)
35038 NODE_NAME_CASE(STRICT_FMAX)
35039 NODE_NAME_CASE(FMIN)
35040 NODE_NAME_CASE(FMINS)
35041 NODE_NAME_CASE(FMIN_SAE)
35042 NODE_NAME_CASE(FMINS_SAE)
35043 NODE_NAME_CASE(STRICT_FMIN)
35044 NODE_NAME_CASE(FMAXC)
35045 NODE_NAME_CASE(FMINC)
35046 NODE_NAME_CASE(FRSQRT)
35047 NODE_NAME_CASE(FRCP)
35048 NODE_NAME_CASE(EXTRQI)
35049 NODE_NAME_CASE(INSERTQI)
35050 NODE_NAME_CASE(TLSADDR)
35051 NODE_NAME_CASE(TLSBASEADDR)
35052 NODE_NAME_CASE(TLSCALL)
35053 NODE_NAME_CASE(TLSDESC)
35054 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35055 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35056 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35057 NODE_NAME_CASE(EH_RETURN)
35058 NODE_NAME_CASE(TC_RETURN)
35059 NODE_NAME_CASE(FNSTCW16m)
35060 NODE_NAME_CASE(FLDCW16m)
35061 NODE_NAME_CASE(FNSTENVm)
35062 NODE_NAME_CASE(FLDENVm)
35063 NODE_NAME_CASE(LCMPXCHG_DAG)
35064 NODE_NAME_CASE(LCMPXCHG8_DAG)
35065 NODE_NAME_CASE(LCMPXCHG16_DAG)
35066 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35067 NODE_NAME_CASE(LADD)
35068 NODE_NAME_CASE(LSUB)
35069 NODE_NAME_CASE(LOR)
35070 NODE_NAME_CASE(LXOR)
35071 NODE_NAME_CASE(LAND)
35072 NODE_NAME_CASE(LBTS)
35073 NODE_NAME_CASE(LBTC)
35074 NODE_NAME_CASE(LBTR)
35075 NODE_NAME_CASE(LBTS_RM)
35076 NODE_NAME_CASE(LBTC_RM)
35077 NODE_NAME_CASE(LBTR_RM)
35078 NODE_NAME_CASE(AADD)
35079 NODE_NAME_CASE(AOR)
35080 NODE_NAME_CASE(AXOR)
35081 NODE_NAME_CASE(AAND)
35082 NODE_NAME_CASE(VZEXT_MOVL)
35083 NODE_NAME_CASE(VZEXT_LOAD)
35084 NODE_NAME_CASE(VEXTRACT_STORE)
35085 NODE_NAME_CASE(VTRUNC)
35086 NODE_NAME_CASE(VTRUNCS)
35087 NODE_NAME_CASE(VTRUNCUS)
35088 NODE_NAME_CASE(VMTRUNC)
35089 NODE_NAME_CASE(VMTRUNCS)
35090 NODE_NAME_CASE(VMTRUNCUS)
35091 NODE_NAME_CASE(VTRUNCSTORES)
35092 NODE_NAME_CASE(VTRUNCSTOREUS)
35093 NODE_NAME_CASE(VMTRUNCSTORES)
35094 NODE_NAME_CASE(VMTRUNCSTOREUS)
35095 NODE_NAME_CASE(VFPEXT)
35096 NODE_NAME_CASE(STRICT_VFPEXT)
35097 NODE_NAME_CASE(VFPEXT_SAE)
35098 NODE_NAME_CASE(VFPEXTS)
35099 NODE_NAME_CASE(VFPEXTS_SAE)
35100 NODE_NAME_CASE(VFPROUND)
35101 NODE_NAME_CASE(VFPROUND2)
35102 NODE_NAME_CASE(VFPROUND2_RND)
35103 NODE_NAME_CASE(STRICT_VFPROUND)
35104 NODE_NAME_CASE(VMFPROUND)
35105 NODE_NAME_CASE(VFPROUND_RND)
35106 NODE_NAME_CASE(VFPROUNDS)
35107 NODE_NAME_CASE(VFPROUNDS_RND)
35108 NODE_NAME_CASE(VSHLDQ)
35109 NODE_NAME_CASE(VSRLDQ)
35110 NODE_NAME_CASE(VSHL)
35111 NODE_NAME_CASE(VSRL)
35112 NODE_NAME_CASE(VSRA)
35113 NODE_NAME_CASE(VSHLI)
35114 NODE_NAME_CASE(VSRLI)
35115 NODE_NAME_CASE(VSRAI)
35116 NODE_NAME_CASE(VSHLV)
35117 NODE_NAME_CASE(VSRLV)
35118 NODE_NAME_CASE(VSRAV)
35119 NODE_NAME_CASE(VROTLI)
35120 NODE_NAME_CASE(VROTRI)
35121 NODE_NAME_CASE(VPPERM)
35122 NODE_NAME_CASE(CMPP)
35123 NODE_NAME_CASE(STRICT_CMPP)
35124 NODE_NAME_CASE(PCMPEQ)
35125 NODE_NAME_CASE(PCMPGT)
35126 NODE_NAME_CASE(PHMINPOS)
35127 NODE_NAME_CASE(ADD)
35128 NODE_NAME_CASE(SUB)
35129 NODE_NAME_CASE(ADC)
35130 NODE_NAME_CASE(SBB)
35131 NODE_NAME_CASE(SMUL)
35132 NODE_NAME_CASE(UMUL)
35133 NODE_NAME_CASE(OR)
35134 NODE_NAME_CASE(XOR)
35135 NODE_NAME_CASE(AND)
35136 NODE_NAME_CASE(BEXTR)
35137 NODE_NAME_CASE(BEXTRI)
35138 NODE_NAME_CASE(BZHI)
35139 NODE_NAME_CASE(PDEP)
35140 NODE_NAME_CASE(PEXT)
35141 NODE_NAME_CASE(MUL_IMM)
35142 NODE_NAME_CASE(MOVMSK)
35143 NODE_NAME_CASE(PTEST)
35144 NODE_NAME_CASE(TESTP)
35145 NODE_NAME_CASE(KORTEST)
35146 NODE_NAME_CASE(KTEST)
35147 NODE_NAME_CASE(KADD)
35148 NODE_NAME_CASE(KSHIFTL)
35149 NODE_NAME_CASE(KSHIFTR)
35150 NODE_NAME_CASE(PACKSS)
35151 NODE_NAME_CASE(PACKUS)
35152 NODE_NAME_CASE(PALIGNR)
35153 NODE_NAME_CASE(VALIGN)
35154 NODE_NAME_CASE(VSHLD)
35155 NODE_NAME_CASE(VSHRD)
35156 NODE_NAME_CASE(VSHLDV)
35157 NODE_NAME_CASE(VSHRDV)
35158 NODE_NAME_CASE(PSHUFD)
35159 NODE_NAME_CASE(PSHUFHW)
35160 NODE_NAME_CASE(PSHUFLW)
35161 NODE_NAME_CASE(SHUFP)
35162 NODE_NAME_CASE(SHUF128)
35163 NODE_NAME_CASE(MOVLHPS)
35164 NODE_NAME_CASE(MOVHLPS)
35165 NODE_NAME_CASE(MOVDDUP)
35166 NODE_NAME_CASE(MOVSHDUP)
35167 NODE_NAME_CASE(MOVSLDUP)
35168 NODE_NAME_CASE(MOVSD)
35169 NODE_NAME_CASE(MOVSS)
35170 NODE_NAME_CASE(MOVSH)
35171 NODE_NAME_CASE(UNPCKL)
35172 NODE_NAME_CASE(UNPCKH)
35173 NODE_NAME_CASE(VBROADCAST)
35174 NODE_NAME_CASE(VBROADCAST_LOAD)
35175 NODE_NAME_CASE(VBROADCASTM)
35176 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35177 NODE_NAME_CASE(VPERMILPV)
35178 NODE_NAME_CASE(VPERMILPI)
35179 NODE_NAME_CASE(VPERM2X128)
35180 NODE_NAME_CASE(VPERMV)
35181 NODE_NAME_CASE(VPERMV3)
35182 NODE_NAME_CASE(VPERMI)
35183 NODE_NAME_CASE(VPTERNLOG)
35184 NODE_NAME_CASE(FP_TO_SINT_SAT)
35185 NODE_NAME_CASE(FP_TO_UINT_SAT)
35186 NODE_NAME_CASE(VFIXUPIMM)
35187 NODE_NAME_CASE(VFIXUPIMM_SAE)
35188 NODE_NAME_CASE(VFIXUPIMMS)
35189 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35190 NODE_NAME_CASE(VRANGE)
35191 NODE_NAME_CASE(VRANGE_SAE)
35192 NODE_NAME_CASE(VRANGES)
35193 NODE_NAME_CASE(VRANGES_SAE)
35194 NODE_NAME_CASE(PMULUDQ)
35195 NODE_NAME_CASE(PMULDQ)
35196 NODE_NAME_CASE(PSADBW)
35197 NODE_NAME_CASE(DBPSADBW)
35198 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35199 NODE_NAME_CASE(VAARG_64)
35200 NODE_NAME_CASE(VAARG_X32)
35201 NODE_NAME_CASE(DYN_ALLOCA)
35202 NODE_NAME_CASE(MFENCE)
35203 NODE_NAME_CASE(SEG_ALLOCA)
35204 NODE_NAME_CASE(PROBED_ALLOCA)
35205 NODE_NAME_CASE(RDRAND)
35206 NODE_NAME_CASE(RDSEED)
35207 NODE_NAME_CASE(RDPKRU)
35208 NODE_NAME_CASE(WRPKRU)
35209 NODE_NAME_CASE(VPMADDUBSW)
35210 NODE_NAME_CASE(VPMADDWD)
35211 NODE_NAME_CASE(VPSHA)
35212 NODE_NAME_CASE(VPSHL)
35213 NODE_NAME_CASE(VPCOM)
35214 NODE_NAME_CASE(VPCOMU)
35215 NODE_NAME_CASE(VPERMIL2)
35216 NODE_NAME_CASE(FMSUB)
35217 NODE_NAME_CASE(STRICT_FMSUB)
35218 NODE_NAME_CASE(FNMADD)
35219 NODE_NAME_CASE(STRICT_FNMADD)
35220 NODE_NAME_CASE(FNMSUB)
35221 NODE_NAME_CASE(STRICT_FNMSUB)
35222 NODE_NAME_CASE(FMADDSUB)
35223 NODE_NAME_CASE(FMSUBADD)
35224 NODE_NAME_CASE(FMADD_RND)
35225 NODE_NAME_CASE(FNMADD_RND)
35226 NODE_NAME_CASE(FMSUB_RND)
35227 NODE_NAME_CASE(FNMSUB_RND)
35228 NODE_NAME_CASE(FMADDSUB_RND)
35229 NODE_NAME_CASE(FMSUBADD_RND)
35230 NODE_NAME_CASE(VFMADDC)
35231 NODE_NAME_CASE(VFMADDC_RND)
35232 NODE_NAME_CASE(VFCMADDC)
35233 NODE_NAME_CASE(VFCMADDC_RND)
35234 NODE_NAME_CASE(VFMULC)
35235 NODE_NAME_CASE(VFMULC_RND)
35236 NODE_NAME_CASE(VFCMULC)
35237 NODE_NAME_CASE(VFCMULC_RND)
35238 NODE_NAME_CASE(VFMULCSH)
35239 NODE_NAME_CASE(VFMULCSH_RND)
35240 NODE_NAME_CASE(VFCMULCSH)
35241 NODE_NAME_CASE(VFCMULCSH_RND)
35242 NODE_NAME_CASE(VFMADDCSH)
35243 NODE_NAME_CASE(VFMADDCSH_RND)
35244 NODE_NAME_CASE(VFCMADDCSH)
35245 NODE_NAME_CASE(VFCMADDCSH_RND)
35246 NODE_NAME_CASE(VPMADD52H)
35247 NODE_NAME_CASE(VPMADD52L)
35248 NODE_NAME_CASE(VRNDSCALE)
35249 NODE_NAME_CASE(STRICT_VRNDSCALE)
35250 NODE_NAME_CASE(VRNDSCALE_SAE)
35251 NODE_NAME_CASE(VRNDSCALES)
35252 NODE_NAME_CASE(VRNDSCALES_SAE)
35253 NODE_NAME_CASE(VREDUCE)
35254 NODE_NAME_CASE(VREDUCE_SAE)
35255 NODE_NAME_CASE(VREDUCES)
35256 NODE_NAME_CASE(VREDUCES_SAE)
35257 NODE_NAME_CASE(VGETMANT)
35258 NODE_NAME_CASE(VGETMANT_SAE)
35259 NODE_NAME_CASE(VGETMANTS)
35260 NODE_NAME_CASE(VGETMANTS_SAE)
35261 NODE_NAME_CASE(PCMPESTR)
35262 NODE_NAME_CASE(PCMPISTR)
35263 NODE_NAME_CASE(XTEST)
35264 NODE_NAME_CASE(COMPRESS)
35265 NODE_NAME_CASE(EXPAND)
35266 NODE_NAME_CASE(SELECTS)
35267 NODE_NAME_CASE(ADDSUB)
35268 NODE_NAME_CASE(RCP14)
35269 NODE_NAME_CASE(RCP14S)
35270 NODE_NAME_CASE(RSQRT14)
35271 NODE_NAME_CASE(RSQRT14S)
35272 NODE_NAME_CASE(FADD_RND)
35273 NODE_NAME_CASE(FADDS)
35274 NODE_NAME_CASE(FADDS_RND)
35275 NODE_NAME_CASE(FSUB_RND)
35276 NODE_NAME_CASE(FSUBS)
35277 NODE_NAME_CASE(FSUBS_RND)
35278 NODE_NAME_CASE(FMUL_RND)
35279 NODE_NAME_CASE(FMULS)
35280 NODE_NAME_CASE(FMULS_RND)
35281 NODE_NAME_CASE(FDIV_RND)
35282 NODE_NAME_CASE(FDIVS)
35283 NODE_NAME_CASE(FDIVS_RND)
35284 NODE_NAME_CASE(FSQRT_RND)
35285 NODE_NAME_CASE(FSQRTS)
35286 NODE_NAME_CASE(FSQRTS_RND)
35287 NODE_NAME_CASE(FGETEXP)
35288 NODE_NAME_CASE(FGETEXP_SAE)
35289 NODE_NAME_CASE(FGETEXPS)
35290 NODE_NAME_CASE(FGETEXPS_SAE)
35291 NODE_NAME_CASE(SCALEF)
35292 NODE_NAME_CASE(SCALEF_RND)
35293 NODE_NAME_CASE(SCALEFS)
35294 NODE_NAME_CASE(SCALEFS_RND)
35295 NODE_NAME_CASE(MULHRS)
35296 NODE_NAME_CASE(SINT_TO_FP_RND)
35297 NODE_NAME_CASE(UINT_TO_FP_RND)
35298 NODE_NAME_CASE(CVTTP2SI)
35299 NODE_NAME_CASE(CVTTP2UI)
35300 NODE_NAME_CASE(STRICT_CVTTP2SI)
35301 NODE_NAME_CASE(STRICT_CVTTP2UI)
35302 NODE_NAME_CASE(MCVTTP2SI)
35303 NODE_NAME_CASE(MCVTTP2UI)
35304 NODE_NAME_CASE(CVTTP2SI_SAE)
35305 NODE_NAME_CASE(CVTTP2UI_SAE)
35306 NODE_NAME_CASE(CVTTS2SI)
35307 NODE_NAME_CASE(CVTTS2UI)
35308 NODE_NAME_CASE(CVTTS2SI_SAE)
35309 NODE_NAME_CASE(CVTTS2UI_SAE)
35310 NODE_NAME_CASE(CVTSI2P)
35311 NODE_NAME_CASE(CVTUI2P)
35312 NODE_NAME_CASE(STRICT_CVTSI2P)
35313 NODE_NAME_CASE(STRICT_CVTUI2P)
35314 NODE_NAME_CASE(MCVTSI2P)
35315 NODE_NAME_CASE(MCVTUI2P)
35316 NODE_NAME_CASE(VFPCLASS)
35317 NODE_NAME_CASE(VFPCLASSS)
35318 NODE_NAME_CASE(MULTISHIFT)
35319 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35320 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35321 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35322 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35323 NODE_NAME_CASE(CVTPS2PH)
35324 NODE_NAME_CASE(STRICT_CVTPS2PH)
35325 NODE_NAME_CASE(CVTPS2PH_SAE)
35326 NODE_NAME_CASE(MCVTPS2PH)
35327 NODE_NAME_CASE(MCVTPS2PH_SAE)
35328 NODE_NAME_CASE(CVTPH2PS)
35329 NODE_NAME_CASE(STRICT_CVTPH2PS)
35330 NODE_NAME_CASE(CVTPH2PS_SAE)
35331 NODE_NAME_CASE(CVTP2SI)
35332 NODE_NAME_CASE(CVTP2UI)
35333 NODE_NAME_CASE(MCVTP2SI)
35334 NODE_NAME_CASE(MCVTP2UI)
35335 NODE_NAME_CASE(CVTP2SI_RND)
35336 NODE_NAME_CASE(CVTP2UI_RND)
35337 NODE_NAME_CASE(CVTS2SI)
35338 NODE_NAME_CASE(CVTS2UI)
35339 NODE_NAME_CASE(CVTS2SI_RND)
35340 NODE_NAME_CASE(CVTS2UI_RND)
35341 NODE_NAME_CASE(CVTNEPS2BF16)
35342 NODE_NAME_CASE(MCVTNEPS2BF16)
35343 NODE_NAME_CASE(DPBF16PS)
35344 NODE_NAME_CASE(DPFP16PS)
35345 NODE_NAME_CASE(MPSADBW)
35346 NODE_NAME_CASE(LWPINS)
35347 NODE_NAME_CASE(MGATHER)
35348 NODE_NAME_CASE(MSCATTER)
35349 NODE_NAME_CASE(VPDPBUSD)
35350 NODE_NAME_CASE(VPDPBUSDS)
35351 NODE_NAME_CASE(VPDPWSSD)
35352 NODE_NAME_CASE(VPDPWSSDS)
35353 NODE_NAME_CASE(VPSHUFBITQMB)
35354 NODE_NAME_CASE(GF2P8MULB)
35355 NODE_NAME_CASE(GF2P8AFFINEQB)
35356 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35357 NODE_NAME_CASE(NT_CALL)
35358 NODE_NAME_CASE(NT_BRIND)
35359 NODE_NAME_CASE(UMWAIT)
35360 NODE_NAME_CASE(TPAUSE)
35361 NODE_NAME_CASE(ENQCMD)
35362 NODE_NAME_CASE(ENQCMDS)
35363 NODE_NAME_CASE(VP2INTERSECT)
35364 NODE_NAME_CASE(VPDPBSUD)
35365 NODE_NAME_CASE(VPDPBSUDS)
35366 NODE_NAME_CASE(VPDPBUUD)
35367 NODE_NAME_CASE(VPDPBUUDS)
35368 NODE_NAME_CASE(VPDPBSSD)
35369 NODE_NAME_CASE(VPDPBSSDS)
35370 NODE_NAME_CASE(VPDPWSUD)
35371 NODE_NAME_CASE(VPDPWSUDS)
35372 NODE_NAME_CASE(VPDPWUSD)
35373 NODE_NAME_CASE(VPDPWUSDS)
35374 NODE_NAME_CASE(VPDPWUUD)
35375 NODE_NAME_CASE(VPDPWUUDS)
35376 NODE_NAME_CASE(VMINMAX)
35377 NODE_NAME_CASE(VMINMAX_SAE)
35378 NODE_NAME_CASE(VMINMAXS)
35379 NODE_NAME_CASE(VMINMAXS_SAE)
35380 NODE_NAME_CASE(CVTP2IBS)
35381 NODE_NAME_CASE(CVTP2IUBS)
35382 NODE_NAME_CASE(CVTP2IBS_RND)
35383 NODE_NAME_CASE(CVTP2IUBS_RND)
35384 NODE_NAME_CASE(CVTTP2IBS)
35385 NODE_NAME_CASE(CVTTP2IUBS)
35386 NODE_NAME_CASE(CVTTP2IBS_SAE)
35387 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35388 NODE_NAME_CASE(VCVT2PH2BF8)
35389 NODE_NAME_CASE(VCVT2PH2BF8S)
35390 NODE_NAME_CASE(VCVT2PH2HF8)
35391 NODE_NAME_CASE(VCVT2PH2HF8S)
35392 NODE_NAME_CASE(VCVTBIASPH2BF8)
35393 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35394 NODE_NAME_CASE(VCVTBIASPH2HF8)
35395 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35396 NODE_NAME_CASE(VCVTPH2BF8)
35397 NODE_NAME_CASE(VCVTPH2BF8S)
35398 NODE_NAME_CASE(VCVTPH2HF8)
35399 NODE_NAME_CASE(VCVTPH2HF8S)
35400 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35401 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35402 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35403 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35404 NODE_NAME_CASE(VMCVTPH2BF8)
35405 NODE_NAME_CASE(VMCVTPH2BF8S)
35406 NODE_NAME_CASE(VMCVTPH2HF8)
35407 NODE_NAME_CASE(VMCVTPH2HF8S)
35408 NODE_NAME_CASE(VCVTHF82PH)
35409 NODE_NAME_CASE(AESENC128KL)
35410 NODE_NAME_CASE(AESDEC128KL)
35411 NODE_NAME_CASE(AESENC256KL)
35412 NODE_NAME_CASE(AESDEC256KL)
35413 NODE_NAME_CASE(AESENCWIDE128KL)
35414 NODE_NAME_CASE(AESDECWIDE128KL)
35415 NODE_NAME_CASE(AESENCWIDE256KL)
35416 NODE_NAME_CASE(AESDECWIDE256KL)
35417 NODE_NAME_CASE(CMPCCXADD)
35418 NODE_NAME_CASE(TESTUI)
35419 NODE_NAME_CASE(FP80_ADD)
35420 NODE_NAME_CASE(STRICT_FP80_ADD)
35421 NODE_NAME_CASE(CCMP)
35422 NODE_NAME_CASE(CTEST)
35423 NODE_NAME_CASE(CLOAD)
35424 NODE_NAME_CASE(CSTORE)
35425 NODE_NAME_CASE(CVTTS2SIS)
35426 NODE_NAME_CASE(CVTTS2UIS)
35427 NODE_NAME_CASE(CVTTS2SIS_SAE)
35428 NODE_NAME_CASE(CVTTS2UIS_SAE)
35429 NODE_NAME_CASE(CVTTP2SIS)
35430 NODE_NAME_CASE(MCVTTP2SIS)
35431 NODE_NAME_CASE(CVTTP2UIS_SAE)
35432 NODE_NAME_CASE(CVTTP2SIS_SAE)
35433 NODE_NAME_CASE(CVTTP2UIS)
35434 NODE_NAME_CASE(MCVTTP2UIS)
35435 NODE_NAME_CASE(POP_FROM_X87_REG)
35436 }
35437 return nullptr;
35438#undef NODE_NAME_CASE
35439}
35440
35441/// Return true if the addressing mode represented by AM is legal for this
35442/// target, for a load/store of the specified type.
35443 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35444 const AddrMode &AM, Type *Ty,
35445 unsigned AS,
35446 Instruction *I) const {
35447 // X86 supports extremely general addressing modes.
35448 CodeModel::Model M = getTargetMachine().getCodeModel();
35449
35450 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35451 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35452 return false;
35453
35454 if (AM.BaseGV) {
35455 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35456
35457 // If a reference to this global requires an extra load, we can't fold it.
35458 if (isGlobalStubReference(GVFlags))
35459 return false;
35460
35461 // If BaseGV requires a register for the PIC base, we cannot also have a
35462 // BaseReg specified.
35463 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35464 return false;
35465
35466 // If lower 4G is not available, then we must use rip-relative addressing.
35467 if ((M != CodeModel::Small || isPositionIndependent()) &&
35468 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35469 return false;
35470 }
35471
35472 switch (AM.Scale) {
35473 case 0:
35474 case 1:
35475 case 2:
35476 case 4:
35477 case 8:
35478 // These scales always work.
35479 break;
35480 case 3:
35481 case 5:
35482 case 9:
35483 // These scales are formed with basereg+scalereg. Only accept if there is
35484 // no basereg yet.
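// e.g. %reg*3 is actually encoded as %reg + %reg*2, which consumes the
// base register slot.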
35485 if (AM.HasBaseReg)
35486 return false;
35487 break;
35488 default: // Other stuff never works.
35489 return false;
35490 }
35491
35492 return true;
35493}
35494
35495bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35496 switch (Opcode) {
35497 // These are non-commutative binops.
35498 // TODO: Add more X86ISD opcodes once we have test coverage.
35499 case X86ISD::ANDNP:
35500 case X86ISD::PCMPGT:
35501 case X86ISD::FMAX:
35502 case X86ISD::FMIN:
35503 case X86ISD::FANDN:
35504 case X86ISD::VPSHA:
35505 case X86ISD::VPSHL:
35506 case X86ISD::VSHLV:
35507 case X86ISD::VSRLV:
35508 case X86ISD::VSRAV:
35509 return true;
35510 }
35511
35512 return TargetLoweringBase::isBinOp(Opcode);
35513}
35514
35515bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35516 switch (Opcode) {
35517 // TODO: Add more X86ISD opcodes once we have test coverage.
35518 case X86ISD::PCMPEQ:
35519 case X86ISD::PMULDQ:
35520 case X86ISD::PMULUDQ:
35521 case X86ISD::FMAXC:
35522 case X86ISD::FMINC:
35523 case X86ISD::FAND:
35524 case X86ISD::FOR:
35525 case X86ISD::FXOR:
35526 return true;
35527 }
35528
35530}
35531
35532 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35533 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35534 return false;
35535 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35536 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35537 return NumBits1 > NumBits2;
35538}
35539
35540 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35541 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35542 return false;
35543
35544 if (!isTypeLegal(EVT::getEVT(Ty1)))
35545 return false;
35546
35547 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35548
35549 // Assuming the caller doesn't have a zeroext or signext return parameter,
35550 // truncation all the way down to i1 is valid.
35551 return true;
35552}
35553
35554 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35555 return isInt<32>(Imm);
35556}
35557
35558 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35559 // Can also use sub to handle negated immediates.
35560 return isInt<32>(Imm);
35561}
35562
35563 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35564 return isInt<32>(Imm);
35565}
35566
35567 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35568 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35569 return false;
35570 unsigned NumBits1 = VT1.getSizeInBits();
35571 unsigned NumBits2 = VT2.getSizeInBits();
35572 return NumBits1 > NumBits2;
35573}
35574
35575 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35576 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35577 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35578}
35579
35580 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35581 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35582 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35583}
35584
35585 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35586 EVT VT1 = Val.getValueType();
35587 if (isZExtFree(VT1, VT2))
35588 return true;
35589
35590 if (Val.getOpcode() != ISD::LOAD)
35591 return false;
35592
35593 if (!VT1.isSimple() || !VT1.isInteger() ||
35594 !VT2.isSimple() || !VT2.isInteger())
35595 return false;
35596
35597 switch (VT1.getSimpleVT().SimpleTy) {
35598 default: break;
35599 case MVT::i8:
35600 case MVT::i16:
35601 case MVT::i32:
35602 // X86 has 8, 16, and 32-bit zero-extending loads.
35603 return true;
35604 }
35605
35606 return false;
35607}
35608
35609 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35610 if (!Subtarget.is64Bit())
35611 return false;
35612 return TargetLowering::shouldConvertPhiType(From, To);
35613}
35614
35615 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35616 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35617 return false;
35618
35619 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35620
35621 // There is no extending load for vXi1.
35622 if (SrcVT.getScalarType() == MVT::i1)
35623 return false;
35624
35625 return true;
35626}
35627
35628 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35629 EVT VT) const {
35630 if (Subtarget.useSoftFloat())
35631 return false;
35632
35633 if (!Subtarget.hasAnyFMA())
35634 return false;
35635
35636 VT = VT.getScalarType();
35637
35638 if (!VT.isSimple())
35639 return false;
35640
35641 switch (VT.getSimpleVT().SimpleTy) {
35642 case MVT::f16:
35643 return Subtarget.hasFP16();
35644 case MVT::f32:
35645 case MVT::f64:
35646 return true;
35647 default:
35648 break;
35649 }
35650
35651 return false;
35652}
35653
35654 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35655 EVT DestVT) const {
35656 // i16 instructions are longer (0x66 prefix) and potentially slower.
35657 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35658}
35659
35660 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35661 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35662 SDValue Y) const {
35663 if (SelectOpcode == ISD::SELECT) {
35664 if (VT.isVector())
35665 return false;
35666 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35667 return false;
35668 using namespace llvm::SDPatternMatch;
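// The patterns below correspond to the BMI1 bit-manipulation identities:
// BLSI = x & -x (isolate lowest set bit), BLSR = x & (x - 1) (reset lowest
// set bit), and BLSMSK = x ^ (x - 1) (mask up to and including lowest set
// bit).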
35669 // BLSI
35670 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35671 sd_match(X, m_Neg(m_Specific(Y)))))
35672 return true;
35673 // BLSR
35674 if (BinOpcode == ISD::AND &&
35675 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35676 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35677 return true;
35678 // BLSMSK
35679 if (BinOpcode == ISD::XOR &&
35680 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35681 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35682 return true;
35683
35684 return false;
35685 }
35686 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35687 // benefit. The transform may also be profitable for scalar code.
35688 if (!Subtarget.hasAVX512())
35689 return false;
35690 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35691 return false;
35692 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35693 return false;
35694
35695 return true;
35696}
35697
35698/// Targets can use this to indicate that they only support *some*
35699/// VECTOR_SHUFFLE operations, those with specific masks.
35700/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35701/// are assumed to be legal.
35702 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35703 if (!VT.isSimple())
35704 return false;
35705
35706 // Not for i1 vectors
35707 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35708 return false;
35709
35710 // Very little shuffling can be done for 64-bit vectors right now.
35711 if (VT.getSimpleVT().getSizeInBits() == 64)
35712 return false;
35713
35714 // We only care that the types being shuffled are legal. The lowering can
35715 // handle any possible shuffle mask that results.
35716 return isTypeLegal(VT.getSimpleVT());
35717}
35718
35719 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35720 EVT VT) const {
35721 // Don't convert an 'and' into a shuffle that we don't directly support.
35722 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35723 if (!Subtarget.hasAVX2())
35724 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35725 return false;
35726
35727 // Just delegate to the generic legality, clear masks aren't special.
35728 return isShuffleMaskLegal(Mask, VT);
35729}
35730
35731 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35732 // If the subtarget is using thunks, we need to not generate jump tables.
35733 if (Subtarget.useIndirectThunkBranches())
35734 return false;
35735
35736 // Otherwise, fallback on the generic logic.
35738}
35739
35740 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35741 EVT ConditionVT) const {
35742 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35743 // zero-extensions.
35744 if (ConditionVT.getSizeInBits() < 32)
35745 return MVT::i32;
35746 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35747 ConditionVT);
35748}
35749
35750//===----------------------------------------------------------------------===//
35751// X86 Scheduler Hooks
35752//===----------------------------------------------------------------------===//
35753
35754/// Utility function to emit xbegin specifying the start of an RTM region.
35755 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35756 const TargetInstrInfo *TII) {
35757 const MIMetadata MIMD(MI);
35758
35759 const BasicBlock *BB = MBB->getBasicBlock();
35760 MachineFunction::iterator I = ++MBB->getIterator();
35761
35762 // For the v = xbegin(), we generate
35763 //
35764 // thisMBB:
35765 // xbegin sinkMBB
35766 //
35767 // mainMBB:
35768 // s0 = -1
35769 //
35770 // fallBB:
35771 // eax = # XABORT_DEF
35772 // s1 = eax
35773 //
35774 // sinkMBB:
35775 // v = phi(s0/mainBB, s1/fallBB)
35776
35777 MachineBasicBlock *thisMBB = MBB;
35778 MachineFunction *MF = MBB->getParent();
35779 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35780 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35781 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35782 MF->insert(I, mainMBB);
35783 MF->insert(I, fallMBB);
35784 MF->insert(I, sinkMBB);
35785
35786 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35787 mainMBB->addLiveIn(X86::EFLAGS);
35788 fallMBB->addLiveIn(X86::EFLAGS);
35789 sinkMBB->addLiveIn(X86::EFLAGS);
35790 }
35791
35792 // Transfer the remainder of BB and its successor edges to sinkMBB.
35793 sinkMBB->splice(sinkMBB->begin(), MBB,
35794 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35795 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35796
35797 MachineRegisterInfo &MRI = MF->getRegInfo();
35798 Register DstReg = MI.getOperand(0).getReg();
35799 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35800 Register mainDstReg = MRI.createVirtualRegister(RC);
35801 Register fallDstReg = MRI.createVirtualRegister(RC);
35802
35803 // thisMBB:
35804 // xbegin fallMBB
35805 // # fallthrough to mainMBB
35806 // # abort path branches to fallMBB
35807 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35808 thisMBB->addSuccessor(mainMBB);
35809 thisMBB->addSuccessor(fallMBB);
35810
35811 // mainMBB:
35812 // mainDstReg := -1
35813 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35814 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35815 mainMBB->addSuccessor(sinkMBB);
35816
35817 // fallMBB:
35818 // ; pseudo instruction to model hardware's definition from XABORT
35819 // EAX := XABORT_DEF
35820 // fallDstReg := EAX
35821 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35822 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35823 .addReg(X86::EAX);
35824 fallMBB->addSuccessor(sinkMBB);
35825
35826 // sinkMBB:
35827 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35828 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35829 .addReg(mainDstReg).addMBB(mainMBB)
35830 .addReg(fallDstReg).addMBB(fallMBB);
35831
35832 MI.eraseFromParent();
35833 return sinkMBB;
35834}
35835
35836 MachineBasicBlock *
35837 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35838 MachineBasicBlock *MBB) const {
35839 // Emit va_arg instruction on X86-64.
35840
35841 // Operands to this pseudo-instruction:
35842 // 0 ) Output : destination address (reg)
35843 // 1-5) Input : va_list address (addr, i64mem)
35844 // 6 ) ArgSize : Size (in bytes) of vararg type
35845 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35846 // 8 ) Align : Alignment of type
35847 // 9 ) EFLAGS (implicit-def)
35848
35849 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35850 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35851
35852 Register DestReg = MI.getOperand(0).getReg();
35853 MachineOperand &Base = MI.getOperand(1);
35854 MachineOperand &Scale = MI.getOperand(2);
35855 MachineOperand &Index = MI.getOperand(3);
35856 MachineOperand &Disp = MI.getOperand(4);
35857 MachineOperand &Segment = MI.getOperand(5);
35858 unsigned ArgSize = MI.getOperand(6).getImm();
35859 unsigned ArgMode = MI.getOperand(7).getImm();
35860 Align Alignment = Align(MI.getOperand(8).getImm());
35861
35862 MachineFunction *MF = MBB->getParent();
35863
35864 // Memory Reference
35865 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35866
35867 MachineMemOperand *OldMMO = MI.memoperands().front();
35868
35869 // Clone the MMO into two separate MMOs for loading and storing
35870 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35871 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35872 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35873 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35874
35875 // Machine Information
35876 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35877 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35878 const TargetRegisterClass *AddrRegClass =
35879 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35880 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35881 const MIMetadata MIMD(MI);
35882
35883 // struct va_list {
35884 // i32 gp_offset
35885 // i32 fp_offset
35886 // i64 overflow_area (address)
35887 // i64 reg_save_area (address)
35888 // }
35889 // sizeof(va_list) = 24
35890 // alignment(va_list) = 8
35891
35892 unsigned TotalNumIntRegs = 6;
35893 unsigned TotalNumXMMRegs = 8;
35894 bool UseGPOffset = (ArgMode == 1);
35895 bool UseFPOffset = (ArgMode == 2);
35896 unsigned MaxOffset = TotalNumIntRegs * 8 +
35897 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
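// Per the x86-64 SysV ABI, gp_offset counts through 6 * 8 = 48 bytes of
// saved GP registers and fp_offset continues through 8 * 16 = 128 bytes of
// saved XMM registers, so MaxOffset is 48 or 176 depending on the argument
// class.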
35898
35899 /* Align ArgSize to a multiple of 8 */
35900 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35901 bool NeedsAlign = (Alignment > 8);
35902
35903 MachineBasicBlock *thisMBB = MBB;
35904 MachineBasicBlock *overflowMBB;
35905 MachineBasicBlock *offsetMBB;
35906 MachineBasicBlock *endMBB;
35907
35908 Register OffsetDestReg; // Argument address computed by offsetMBB
35909 Register OverflowDestReg; // Argument address computed by overflowMBB
35910 Register OffsetReg;
35911
35912 if (!UseGPOffset && !UseFPOffset) {
35913 // If we only pull from the overflow region, we don't create a branch.
35914 // We don't need to alter control flow.
35915 OffsetDestReg = Register(); // unused
35916 OverflowDestReg = DestReg;
35917
35918 offsetMBB = nullptr;
35919 overflowMBB = thisMBB;
35920 endMBB = thisMBB;
35921 } else {
35922 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35923 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35924 // If not, pull from overflow_area. (branch to overflowMBB)
35925 //
35926 // thisMBB
35927 // | .
35928 // | .
35929 // offsetMBB overflowMBB
35930 // | .
35931 // | .
35932 // endMBB
35933
35934 // Registers for the PHI in endMBB
35935 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35936 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35937
35938 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35939 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35940 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35941 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35942
35943 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35944
35945 // Insert the new basic blocks
35946 MF->insert(MBBIter, offsetMBB);
35947 MF->insert(MBBIter, overflowMBB);
35948 MF->insert(MBBIter, endMBB);
35949
35950 // Transfer the remainder of MBB and its successor edges to endMBB.
35951 endMBB->splice(endMBB->begin(), thisMBB,
35952 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35953 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35954
35955 // Make offsetMBB and overflowMBB successors of thisMBB
35956 thisMBB->addSuccessor(offsetMBB);
35957 thisMBB->addSuccessor(overflowMBB);
35958
35959 // endMBB is a successor of both offsetMBB and overflowMBB
35960 offsetMBB->addSuccessor(endMBB);
35961 overflowMBB->addSuccessor(endMBB);
35962
35963 // Load the offset value into a register
35964 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35965 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35966 .add(Base)
35967 .add(Scale)
35968 .add(Index)
35969 .addDisp(Disp, UseFPOffset ? 4 : 0)
35970 .add(Segment)
35971 .setMemRefs(LoadOnlyMMO);
35972
35973 // Check if there is enough room left to pull this argument.
35974 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35975 .addReg(OffsetReg)
35976 .addImm(MaxOffset + 8 - ArgSizeA8);
35977
35978 // Branch to "overflowMBB" if offset >= max
35979 // Fall through to "offsetMBB" otherwise
35980 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35981 .addMBB(overflowMBB).addImm(X86::COND_AE);
35982 }
35983
35984 // In offsetMBB, emit code to use the reg_save_area.
35985 if (offsetMBB) {
35986 assert(OffsetReg != 0);
35987
35988 // Read the reg_save_area address.
35989 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35990 BuildMI(
35991 offsetMBB, MIMD,
35992 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35993 RegSaveReg)
35994 .add(Base)
35995 .add(Scale)
35996 .add(Index)
35997 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35998 .add(Segment)
35999 .setMemRefs(LoadOnlyMMO);
36000
36001 if (Subtarget.isTarget64BitLP64()) {
36002 // Zero-extend the offset
36003 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36004 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36005 .addImm(0)
36006 .addReg(OffsetReg)
36007 .addImm(X86::sub_32bit);
36008
36009 // Add the offset to the reg_save_area to get the final address.
36010 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
36011 .addReg(OffsetReg64)
36012 .addReg(RegSaveReg);
36013 } else {
36014 // Add the offset to the reg_save_area to get the final address.
36015 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36016 .addReg(OffsetReg)
36017 .addReg(RegSaveReg);
36018 }
36019
36020 // Compute the offset for the next argument
36021 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36022 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36023 .addReg(OffsetReg)
36024 .addImm(UseFPOffset ? 16 : 8);
36025
36026 // Store it back into the va_list.
36027 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36028 .add(Base)
36029 .add(Scale)
36030 .add(Index)
36031 .addDisp(Disp, UseFPOffset ? 4 : 0)
36032 .add(Segment)
36033 .addReg(NextOffsetReg)
36034 .setMemRefs(StoreOnlyMMO);
36035
36036 // Jump to endMBB
36037 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36038 .addMBB(endMBB);
36039 }
36040
36041 //
36042 // Emit code to use overflow area
36043 //
36044
36045 // Load the overflow_area address into a register.
36046 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36047 BuildMI(overflowMBB, MIMD,
36048 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36049 OverflowAddrReg)
36050 .add(Base)
36051 .add(Scale)
36052 .add(Index)
36053 .addDisp(Disp, 8)
36054 .add(Segment)
36055 .setMemRefs(LoadOnlyMMO);
36056
36057 // If we need to align it, do so. Otherwise, just copy the address
36058 // to OverflowDestReg.
36059 if (NeedsAlign) {
36060 // Align the overflow address
36061 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36062
36063 // aligned_addr = (addr + (align-1)) & ~(align-1)
36064 BuildMI(
36065 overflowMBB, MIMD,
36066 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36067 TmpReg)
36068 .addReg(OverflowAddrReg)
36069 .addImm(Alignment.value() - 1);
36070
36071 BuildMI(
36072 overflowMBB, MIMD,
36073 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36074 OverflowDestReg)
36075 .addReg(TmpReg)
36076 .addImm(~(uint64_t)(Alignment.value() - 1));
36077 } else {
36078 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36079 .addReg(OverflowAddrReg);
36080 }
36081
36082 // Compute the next overflow address after this argument.
36083 // (the overflow address should be kept 8-byte aligned)
36084 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36085 BuildMI(
36086 overflowMBB, MIMD,
36087 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36088 NextAddrReg)
36089 .addReg(OverflowDestReg)
36090 .addImm(ArgSizeA8);
36091
36092 // Store the new overflow address.
36093 BuildMI(overflowMBB, MIMD,
36094 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36095 .add(Base)
36096 .add(Scale)
36097 .add(Index)
36098 .addDisp(Disp, 8)
36099 .add(Segment)
36100 .addReg(NextAddrReg)
36101 .setMemRefs(StoreOnlyMMO);
36102
36103 // If we branched, emit the PHI to the front of endMBB.
36104 if (offsetMBB) {
36105 BuildMI(*endMBB, endMBB->begin(), MIMD,
36106 TII->get(X86::PHI), DestReg)
36107 .addReg(OffsetDestReg).addMBB(offsetMBB)
36108 .addReg(OverflowDestReg).addMBB(overflowMBB);
36109 }
36110
36111 // Erase the pseudo instruction
36112 MI.eraseFromParent();
36113
36114 return endMBB;
36115}
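// For reference, the blocks built above compute roughly the following C
// logic for the SysV x86-64 va_list (an illustrative sketch only; the helper
// and variable names here are not part of this lowering):
//
//   void *next_vararg(va_list_tag *ap, unsigned size, bool use_fp,
//                     unsigned align) {
//     unsigned off = use_fp ? ap->fp_offset : ap->gp_offset;   // disp 4 / 0
//     unsigned max = 6 * 8 + (use_fp ? 8 * 16 : 0);
//     if (off < max + 8 - ((size + 7) & ~7u)) {                // offsetMBB
//       void *addr = (char *)ap->reg_save_area + off;
//       *(use_fp ? &ap->fp_offset : &ap->gp_offset) = off + (use_fp ? 16 : 8);
//       return addr;
//     }
//     char *ovfl = ap->overflow_area;                          // overflowMBB
//     if (align > 8)
//       ovfl = (char *)(((uintptr_t)ovfl + align - 1) & ~(uintptr_t)(align - 1));
//     ap->overflow_area = ovfl + ((size + 7) & ~7u);
//     return ovfl;
//   }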
36116
36117// The EFLAGS operand of SelectItr might be missing a kill marker
36118// because there were multiple uses of EFLAGS, and ISel didn't know
36119// which to mark. Figure out whether SelectItr should have had a
36120// kill marker, and set it if it should. Returns the correct kill
36121// marker value.
36122static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36123 MachineBasicBlock* BB,
36124 const TargetRegisterInfo* TRI) {
36125 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36126 return false;
36127
36128 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36129 // out. SelectMI should have a kill flag on EFLAGS.
36130 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36131 return true;
36132}
36133
36134// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36135// together with other CMOV pseudo-opcodes into a single basic-block with
36136// conditional jump around it.
36137static bool isCMOVPseudo(MachineInstr &MI) {
36138 switch (MI.getOpcode()) {
36139 case X86::CMOV_FR16:
36140 case X86::CMOV_FR16X:
36141 case X86::CMOV_FR32:
36142 case X86::CMOV_FR32X:
36143 case X86::CMOV_FR64:
36144 case X86::CMOV_FR64X:
36145 case X86::CMOV_GR8:
36146 case X86::CMOV_GR16:
36147 case X86::CMOV_GR32:
36148 case X86::CMOV_RFP32:
36149 case X86::CMOV_RFP64:
36150 case X86::CMOV_RFP80:
36151 case X86::CMOV_VR64:
36152 case X86::CMOV_VR128:
36153 case X86::CMOV_VR128X:
36154 case X86::CMOV_VR256:
36155 case X86::CMOV_VR256X:
36156 case X86::CMOV_VR512:
36157 case X86::CMOV_VK1:
36158 case X86::CMOV_VK2:
36159 case X86::CMOV_VK4:
36160 case X86::CMOV_VK8:
36161 case X86::CMOV_VK16:
36162 case X86::CMOV_VK32:
36163 case X86::CMOV_VK64:
36164 return true;
36165
36166 default:
36167 return false;
36168 }
36169}
36170
36171// Helper function, which inserts PHI functions into SinkMBB:
36172// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36173// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36174// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36175// the last PHI function inserted.
36176static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36177 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36178 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36179 MachineBasicBlock *SinkMBB) {
36180 MachineFunction *MF = TrueMBB->getParent();
36181 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36182 const MIMetadata MIMD(*MIItBegin);
36183
36184 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36185 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36186
36187 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36188
36189 // As we are creating the PHIs, we have to be careful if there is more than
36190 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36191 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36192 // That also means that PHI construction must work forward from earlier to
36193 // later, and that the code must maintain a mapping from earlier PHI's
36194 // destination registers, and the registers that went into the PHI.
36195 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36196 MachineInstrBuilder MIB;
36197
36198 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36199 Register DestReg = MIIt->getOperand(0).getReg();
36200 Register Op1Reg = MIIt->getOperand(1).getReg();
36201 Register Op2Reg = MIIt->getOperand(2).getReg();
36202
36203 // If this CMOV we are generating is the opposite condition from
36204 // the jump we generated, then we have to swap the operands for the
36205 // PHI that is going to be generated.
36206 if (MIIt->getOperand(3).getImm() == OppCC)
36207 std::swap(Op1Reg, Op2Reg);
36208
36209 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36210 Op1Reg = It->second.first;
36211
36212 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36213 Op2Reg = It->second.second;
36214
36215 MIB =
36216 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36217 .addReg(Op1Reg)
36218 .addMBB(FalseMBB)
36219 .addReg(Op2Reg)
36220 .addMBB(TrueMBB);
36221
36222 // Add this PHI to the rewrite table.
36223 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36224 }
36225
36226 return MIB;
36227}
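// Illustrative example of the rewrite table above (value names are made up):
// for a group of two CMOVs with the same condition,
//
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
//
// the loop records RegRewriteTable[%t2] = (%t1, %f1) after the first PHI, so
// the second PHI's %t2 input is replaced by the value live on each edge:
//
//   %t2 = PHI [ %t1, FalseMBB ], [ %f1, TrueMBB ]
//   %t3 = PHI [ %t1, FalseMBB ], [ %f2, TrueMBB ]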
36228
36229// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36230MachineBasicBlock *
36231X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36232 MachineInstr &SecondCascadedCMOV,
36233 MachineBasicBlock *ThisMBB) const {
36234 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36235 const MIMetadata MIMD(FirstCMOV);
36236
36237 // We lower cascaded CMOVs such as
36238 //
36239 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36240 //
36241 // to two successive branches.
36242 //
36243 // Without this, we would add a PHI between the two jumps, which ends up
36244 // creating a few copies all around. For instance, for
36245 //
36246 // (sitofp (zext (fcmp une)))
36247 //
36248 // we would generate:
36249 //
36250 // ucomiss %xmm1, %xmm0
36251 // movss <1.0f>, %xmm0
36252 // movaps %xmm0, %xmm1
36253 // jne .LBB5_2
36254 // xorps %xmm1, %xmm1
36255 // .LBB5_2:
36256 // jp .LBB5_4
36257 // movaps %xmm1, %xmm0
36258 // .LBB5_4:
36259 // retq
36260 //
36261 // because this custom-inserter would have generated:
36262 //
36263 // A
36264 // | \
36265 // | B
36266 // | /
36267 // C
36268 // | \
36269 // | D
36270 // | /
36271 // E
36272 //
36273 // A: X = ...; Y = ...
36274 // B: empty
36275 // C: Z = PHI [X, A], [Y, B]
36276 // D: empty
36277 // E: PHI [X, C], [Z, D]
36278 //
36279 // If we lower both CMOVs in a single step, we can instead generate:
36280 //
36281 // A
36282 // | \
36283 // | C
36284 // | /|
36285 // |/ |
36286 // | |
36287 // | D
36288 // | /
36289 // E
36290 //
36291 // A: X = ...; Y = ...
36292 // D: empty
36293 // E: PHI [X, A], [X, C], [Y, D]
36294 //
36295 // Which, in our sitofp/fcmp example, gives us something like:
36296 //
36297 // ucomiss %xmm1, %xmm0
36298 // movss <1.0f>, %xmm0
36299 // jne .LBB5_4
36300 // jp .LBB5_4
36301 // xorps %xmm0, %xmm0
36302 // .LBB5_4:
36303 // retq
36304 //
36305
36306 // We lower cascaded CMOV into two successive branches to the same block.
36307 // EFLAGS is used by both, so mark it as live in the second.
36308 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36309 MachineFunction *F = ThisMBB->getParent();
36310 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36311 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36312 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36313
36314 MachineFunction::iterator It = ++ThisMBB->getIterator();
36315 F->insert(It, FirstInsertedMBB);
36316 F->insert(It, SecondInsertedMBB);
36317 F->insert(It, SinkMBB);
36318
36319 // For a cascaded CMOV, we lower it to two successive branches to
36320 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36321 // the FirstInsertedMBB.
36322 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36323
36324 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36325 // live into the sink and copy blocks.
36326 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36327 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36328 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36329 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36330 SinkMBB->addLiveIn(X86::EFLAGS);
36331 }
36332
36333 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36334 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36335 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36336 ThisMBB->end());
36337 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36338
36339 // Fallthrough block for ThisMBB.
36340 ThisMBB->addSuccessor(FirstInsertedMBB);
36341 // The true block target of the first branch is always SinkMBB.
36342 ThisMBB->addSuccessor(SinkMBB);
36343 // Fallthrough block for FirstInsertedMBB.
36344 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36345 // The true block for the branch of FirstInsertedMBB.
36346 FirstInsertedMBB->addSuccessor(SinkMBB);
36347 // This is fallthrough.
36348 SecondInsertedMBB->addSuccessor(SinkMBB);
36349
36350 // Create the conditional branch instructions.
36351 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36352 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36353
36354 X86::CondCode SecondCC =
36355 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36356 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36357 .addMBB(SinkMBB)
36358 .addImm(SecondCC);
36359
36360 // SinkMBB:
36361 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36362 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36363 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36364 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36365 MachineInstrBuilder MIB =
36366 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36367 .addReg(Op1Reg)
36368 .addMBB(SecondInsertedMBB)
36369 .addReg(Op2Reg)
36370 .addMBB(ThisMBB);
36371
36372 // The edge from FirstInsertedMBB carries the same incoming value as the
36373 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
36374 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36375
36376 // Now remove the CMOVs.
36377 FirstCMOV.eraseFromParent();
36378 SecondCascadedCMOV.eraseFromParent();
36379
36380 return SinkMBB;
36381}
36382
36383MachineBasicBlock *
36384X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36385 MachineBasicBlock *ThisMBB) const {
36386 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36387 const MIMetadata MIMD(MI);
36388
36389 // To "insert" a SELECT_CC instruction, we actually have to insert the
36390 // diamond control-flow pattern. The incoming instruction knows the
36391 // destination vreg to set, the condition code register to branch on, the
36392 // true/false values to select between and a branch opcode to use.
36393
36394 // ThisMBB:
36395 // ...
36396 // TrueVal = ...
36397 // cmpTY ccX, r1, r2
36398 // bCC copy1MBB
36399 // fallthrough --> FalseMBB
36400
36401 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36402 // as described above, by inserting a BB, and then making a PHI at the join
36403 // point to select the true and false operands of the CMOV in the PHI.
36404 //
36405 // The code also handles two different cases of multiple CMOV opcodes
36406 // in a row.
36407 //
36408 // Case 1:
36409 // In this case, there are multiple CMOVs in a row, all of which are based on
36410 // the same condition setting (or the exact opposite condition setting).
36411 // In this case we can lower all the CMOVs using a single inserted BB, and
36412 // then make a number of PHIs at the join point to model the CMOVs. The only
36413 // trickiness here is that in a case like:
36414 //
36415 // t2 = CMOV cond1 t1, f1
36416 // t3 = CMOV cond1 t2, f2
36417 //
36418 // when rewriting this into PHIs, we have to perform some renaming on the
36419 // temps since you cannot have a PHI operand refer to a PHI result earlier
36420 // in the same block. The "simple" but wrong lowering would be:
36421 //
36422 // t2 = PHI t1(BB1), f1(BB2)
36423 // t3 = PHI t2(BB1), f2(BB2)
36424 //
36425 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36426 // renaming is to note that on the path through BB1, t2 is really just a
36427 // copy of t1, and do that renaming, properly generating:
36428 //
36429 // t2 = PHI t1(BB1), f1(BB2)
36430 // t3 = PHI t1(BB1), f2(BB2)
36431 //
36432 // Case 2:
36433 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36434 // function - EmitLoweredCascadedSelect.
36435
36436 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36437 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36438 MachineInstr *LastCMOV = &MI;
36439 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36440
36441 // Check for case 1, where there are multiple CMOVs with the same condition
36442 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36443 // number of jumps the most.
36444
36445 if (isCMOVPseudo(MI)) {
36446 // See if we have a string of CMOVS with the same condition. Skip over
36447 // intervening debug insts.
36448 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36449 (NextMIIt->getOperand(3).getImm() == CC ||
36450 NextMIIt->getOperand(3).getImm() == OppCC)) {
36451 LastCMOV = &*NextMIIt;
36452 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36453 }
36454 }
36455
36456 // This checks for case 2, but only if we didn't already find case 1,
36457 // as indicated by LastCMOV == &MI.
36458 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36459 NextMIIt->getOpcode() == MI.getOpcode() &&
36460 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36461 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36462 NextMIIt->getOperand(1).isKill()) {
36463 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36464 }
36465
36466 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36467 MachineFunction *F = ThisMBB->getParent();
36468 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36469 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36470
36471 MachineFunction::iterator It = ++ThisMBB->getIterator();
36472 F->insert(It, FalseMBB);
36473 F->insert(It, SinkMBB);
36474
36475 // Set the call frame size on entry to the new basic blocks.
36476 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36477 FalseMBB->setCallFrameSize(CallFrameSize);
36478 SinkMBB->setCallFrameSize(CallFrameSize);
36479
36480 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36481 // live into the sink and copy blocks.
36482 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36483 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36484 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36485 FalseMBB->addLiveIn(X86::EFLAGS);
36486 SinkMBB->addLiveIn(X86::EFLAGS);
36487 }
36488
36489 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36490 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36491 MachineBasicBlock::iterator(LastCMOV));
36492 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36493 if (MI.isDebugInstr())
36494 SinkMBB->push_back(MI.removeFromParent());
36495
36496 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36497 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36498 std::next(MachineBasicBlock::iterator(LastCMOV)),
36499 ThisMBB->end());
36500 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36501
36502 // Fallthrough block for ThisMBB.
36503 ThisMBB->addSuccessor(FalseMBB);
36504 // The true block target of the first (or only) branch is always a SinkMBB.
36505 ThisMBB->addSuccessor(SinkMBB);
36506 // Fallthrough block for FalseMBB.
36507 FalseMBB->addSuccessor(SinkMBB);
36508
36509 // Create the conditional branch instruction.
36510 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36511
36512 // SinkMBB:
36513 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36514 // ...
36515 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36516 MachineBasicBlock::iterator MIItEnd =
36517 std::next(MachineBasicBlock::iterator(LastCMOV));
36518 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36519
36520 // Now remove the CMOV(s).
36521 ThisMBB->erase(MIItBegin, MIItEnd);
36522
36523 return SinkMBB;
36524}
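// Resulting shape for a single pseudo CMOV (sketch; block names as above,
// value names illustrative and matching the SinkMBB PHI comment):
//
//   ThisMBB:
//     ... EFLAGS, %TrueValue and %FalseValue are defined ...
//     JCC_1 %SinkMBB, cc              ; taken edge selects %TrueValue
//     (fall through)
//   FalseMBB:                         ; empty, provides %FalseValue
//     (fall through)
//   SinkMBB:
//     %Result = PHI [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]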
36525
36526static unsigned getSUBriOpcode(bool IsLP64) {
36527 if (IsLP64)
36528 return X86::SUB64ri32;
36529 else
36530 return X86::SUB32ri;
36531}
36532
36533MachineBasicBlock *
36534X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36535 MachineBasicBlock *MBB) const {
36536 MachineFunction *MF = MBB->getParent();
36537 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36538 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36539 const MIMetadata MIMD(MI);
36540 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36541
36542 const unsigned ProbeSize = getStackProbeSize(*MF);
36543
36544 MachineRegisterInfo &MRI = MF->getRegInfo();
36545 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36546 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36547 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36548
36549 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36550 MF->insert(MBBIter, testMBB);
36551 MF->insert(MBBIter, blockMBB);
36552 MF->insert(MBBIter, tailMBB);
36553
36554 Register sizeVReg = MI.getOperand(1).getReg();
36555
36556 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36557
36558 Register TmpStackPtr = MRI.createVirtualRegister(
36559 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36560 Register FinalStackPtr = MRI.createVirtualRegister(
36561 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36562
36563 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36564 .addReg(physSPReg);
36565 {
36566 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36567 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36568 .addReg(TmpStackPtr)
36569 .addReg(sizeVReg);
36570 }
36571
36572 // test rsp size
36573
36574 BuildMI(testMBB, MIMD,
36575 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36576 .addReg(FinalStackPtr)
36577 .addReg(physSPReg);
36578
36579 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36580 .addMBB(tailMBB)
36581 .addImm(X86::COND_GE);
36582 testMBB->addSuccessor(blockMBB);
36583 testMBB->addSuccessor(tailMBB);
36584
36585 // Touch the block then extend it. This is done on the opposite side of
36586 // static probe where we allocate then touch, to avoid the need of probing the
36587 // tail of the static alloca. Possible scenarios are:
36588 //
36589 // + ---- <- ------------ <- ------------- <- ------------ +
36590 // | |
36591 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36592 // | |
36593 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36594 //
36595 // The property we want to enforce is to never have more than [page alloc] between two probes.
36596
36597 const unsigned XORMIOpc =
36598 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36599 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36600 .addImm(0);
36601
36602 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36603 physSPReg)
36604 .addReg(physSPReg)
36605 .addImm(ProbeSize);
36606
36607 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36608 blockMBB->addSuccessor(testMBB);
36609
36610 // Replace original instruction by the expected stack ptr
36611 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36612 MI.getOperand(0).getReg())
36613 .addReg(FinalStackPtr);
36614
36615 tailMBB->splice(tailMBB->end(), MBB,
36616 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36617 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36618 MBB->addSuccessor(testMBB);
36619
36620 // Delete the original pseudo instruction.
36621 MI.eraseFromParent();
36622
36623 // And we're done.
36624 return tailMBB;
36625}
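// Emitted shape (sketch, 64-bit case with Uses64BitFramePtr; virtual
// register names are illustrative):
//
//     %tmp   = COPY $rsp
//     %final = SUB64rr %tmp, %size            ; target stack pointer
//   testMBB:
//     CMP64rr %final, $rsp
//     JCC_1 %tailMBB, COND_GE                 ; done once $rsp <= %final
//   blockMBB:
//     XOR64mi32 [$rsp + 0], 0                 ; touch the current stack page
//     $rsp = SUB64ri32 $rsp, ProbeSize
//     JMP_1 %testMBB
//   tailMBB:
//     %result = COPY %final                   ; replaces the original pseudo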
36626
36627MachineBasicBlock *
36628X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36629 MachineBasicBlock *BB) const {
36630 MachineFunction *MF = BB->getParent();
36631 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36632 const MIMetadata MIMD(MI);
36633 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36634
36635 assert(MF->shouldSplitStack());
36636
36637 const bool Is64Bit = Subtarget.is64Bit();
36638 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36639
36640 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36641 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36642
36643 // BB:
36644 // ... [Till the alloca]
36645 // If stacklet is not large enough, jump to mallocMBB
36646 //
36647 // bumpMBB:
36648 // Allocate by subtracting from RSP
36649 // Jump to continueMBB
36650 //
36651 // mallocMBB:
36652 // Allocate by call to runtime
36653 //
36654 // continueMBB:
36655 // ...
36656 // [rest of original BB]
36657 //
36658
36659 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36660 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36661 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36662
36663 MachineRegisterInfo &MRI = MF->getRegInfo();
36664 const TargetRegisterClass *AddrRegClass =
36665 getRegClassFor(getPointerTy(MF->getDataLayout()));
36666
36667 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36668 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36669 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36670 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36671 sizeVReg = MI.getOperand(1).getReg(),
36672 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36673
36674 MachineFunction::iterator MBBIter = ++BB->getIterator();
36675
36676 MF->insert(MBBIter, bumpMBB);
36677 MF->insert(MBBIter, mallocMBB);
36678 MF->insert(MBBIter, continueMBB);
36679
36680 continueMBB->splice(continueMBB->begin(), BB,
36681 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36682 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36683
36684 // Add code to the main basic block to check if the stack limit has been hit,
36685 // and if so, jump to mallocMBB otherwise to bumpMBB.
36686 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36687 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36688 .addReg(tmpSPVReg).addReg(sizeVReg);
36689 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36690 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36691 .addReg(SPLimitVReg);
36692 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36693
36694 // bumpMBB simply decreases the stack pointer, since we know the current
36695 // stacklet has enough space.
36696 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36697 .addReg(SPLimitVReg);
36698 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36699 .addReg(SPLimitVReg);
36700 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36701
36702 // Calls into a routine in libgcc to allocate more space from the heap.
36703 const uint32_t *RegMask =
36704 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36705 if (IsLP64) {
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36707 .addReg(sizeVReg);
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36709 .addExternalSymbol("__morestack_allocate_stack_space")
36710 .addRegMask(RegMask)
36711 .addReg(X86::RDI, RegState::Implicit)
36712 .addReg(X86::RAX, RegState::ImplicitDefine);
36713 } else if (Is64Bit) {
36714 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36715 .addReg(sizeVReg);
36716 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36717 .addExternalSymbol("__morestack_allocate_stack_space")
36718 .addRegMask(RegMask)
36719 .addReg(X86::EDI, RegState::Implicit)
36720 .addReg(X86::EAX, RegState::ImplicitDefine);
36721 } else {
36722 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36723 .addImm(12);
36724 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36725 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36726 .addExternalSymbol("__morestack_allocate_stack_space")
36727 .addRegMask(RegMask)
36728 .addReg(X86::EAX, RegState::ImplicitDefine);
36729 }
36730
36731 if (!Is64Bit)
36732 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36733 .addImm(16);
36734
36735 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36736 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36737 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36738
36739 // Set up the CFG correctly.
36740 BB->addSuccessor(bumpMBB);
36741 BB->addSuccessor(mallocMBB);
36742 mallocMBB->addSuccessor(continueMBB);
36743 bumpMBB->addSuccessor(continueMBB);
36744
36745 // Take care of the PHI nodes.
36746 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36747 MI.getOperand(0).getReg())
36748 .addReg(mallocPtrVReg)
36749 .addMBB(mallocMBB)
36750 .addReg(bumpSPPtrVReg)
36751 .addMBB(bumpMBB);
36752
36753 // Delete the original pseudo instruction.
36754 MI.eraseFromParent();
36755
36756 // And we're done.
36757 return continueMBB;
36758}
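// Conceptually (C-level sketch for the LP64 case; the %fs:0x70 stack-limit
// slot and the libgcc symbol come from the code above, the rest is
// illustrative):
//
//   char *newSP = (char *)rsp - size;
//   if (stack_limit_at_fs_0x70 > newSP)                  // stacklet too small
//     result = __morestack_allocate_stack_space(size);   // mallocMBB
//   else {
//     rsp = newSP;                                        // bumpMBB
//     result = newSP;
//   }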
36759
36760MachineBasicBlock *
36761X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36762 MachineBasicBlock *BB) const {
36763 MachineFunction *MF = BB->getParent();
36764 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36765 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36766 const MIMetadata MIMD(MI);
36767
36768 assert(!isAsynchronousEHPersonality(
36769 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36770 "SEH does not use catchret!");
36771
36772 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36773 if (!Subtarget.is32Bit())
36774 return BB;
36775
36776 // C++ EH creates a new target block to hold the restore code, and wires up
36777 // the new block to the return destination with a normal JMP_4.
36778 MachineBasicBlock *RestoreMBB =
36779 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36780 assert(BB->succ_size() == 1);
36781 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36782 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36783 BB->addSuccessor(RestoreMBB);
36784 MI.getOperand(0).setMBB(RestoreMBB);
36785
36786 // Marking this as an EH pad but not a funclet entry block causes PEI to
36787 // restore stack pointers in the block.
36788 RestoreMBB->setIsEHPad(true);
36789
36790 auto RestoreMBBI = RestoreMBB->begin();
36791 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36792 return BB;
36793}
36794
36795MachineBasicBlock *
36796X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36797 MachineBasicBlock *BB) const {
36798 // This is pretty easy. We're taking the value that we received from
36799 // our load from the relocation, sticking it in either RDI (x86-64)
36800 // or EAX and doing an indirect call. The return value will then
36801 // be in the normal return register.
36802 MachineFunction *F = BB->getParent();
36803 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36804 const MIMetadata MIMD(MI);
36805
36806 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36807 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36808
36809 // Get a register mask for the lowered call.
36810 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36811 // proper register mask.
36812 const uint32_t *RegMask =
36813 Subtarget.is64Bit() ?
36814 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36815 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36816 if (Subtarget.is64Bit()) {
36817 MachineInstrBuilder MIB =
36818 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36819 .addReg(X86::RIP)
36820 .addImm(0)
36821 .addReg(0)
36822 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36823 MI.getOperand(3).getTargetFlags())
36824 .addReg(0);
36825 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36826 addDirectMem(MIB, X86::RDI);
36827 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36828 } else if (!isPositionIndependent()) {
36829 MachineInstrBuilder MIB =
36830 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36831 .addReg(0)
36832 .addImm(0)
36833 .addReg(0)
36834 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36835 MI.getOperand(3).getTargetFlags())
36836 .addReg(0);
36837 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36838 addDirectMem(MIB, X86::EAX);
36839 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36840 } else {
36841 MachineInstrBuilder MIB =
36842 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36843 .addReg(TII->getGlobalBaseReg(F))
36844 .addImm(0)
36845 .addReg(0)
36846 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36847 MI.getOperand(3).getTargetFlags())
36848 .addReg(0);
36849 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36850 addDirectMem(MIB, X86::EAX);
36851 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36852 }
36853
36854 MI.eraseFromParent(); // The pseudo instruction is gone now.
36855 return BB;
36856}
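// For the 64-bit path this amounts to the usual Darwin TLV access sequence
// (sketch, AT&T syntax; the exact relocation is whatever the operand's
// target flags specify):
//
//   movq  _var@TLVP(%rip), %rdi
//   callq *(%rdi)             ; pointer to the thread-local storage in %rax
//
// The 32-bit paths do the same through %eax, either with an absolute address
// or relative to the PIC base register.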
36857
36858static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36859 switch (RPOpc) {
36860 case X86::INDIRECT_THUNK_CALL32:
36861 return X86::CALLpcrel32;
36862 case X86::INDIRECT_THUNK_CALL64:
36863 return X86::CALL64pcrel32;
36864 case X86::INDIRECT_THUNK_TCRETURN32:
36865 return X86::TCRETURNdi;
36866 case X86::INDIRECT_THUNK_TCRETURN64:
36867 return X86::TCRETURNdi64;
36868 }
36869 llvm_unreachable("not indirect thunk opcode");
36870}
36871
36872static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36873 Register Reg) {
36874 if (Subtarget.useRetpolineExternalThunk()) {
36875 // When using an external thunk for retpolines, we pick names that match the
36876 // names GCC happens to use as well. This helps simplify the implementation
36877 // of the thunks for kernels where they have no easy ability to create
36878 // aliases and are doing non-trivial configuration of the thunk's body. For
36879 // example, the Linux kernel will do boot-time hot patching of the thunk
36880 // bodies and cannot easily export aliases of these to loaded modules.
36881 //
36882 // Note that at any point in the future, we may need to change the semantics
36883 // of how we implement retpolines and at that time will likely change the
36884 // name of the called thunk. Essentially, there is no hard guarantee that
36885 // LLVM will generate calls to specific thunks, we merely make a best-effort
36886 // attempt to help out kernels and other systems where duplicating the
36887 // thunks is costly.
36888 switch (Reg.id()) {
36889 case X86::EAX:
36890 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36891 return "__x86_indirect_thunk_eax";
36892 case X86::ECX:
36893 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36894 return "__x86_indirect_thunk_ecx";
36895 case X86::EDX:
36896 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36897 return "__x86_indirect_thunk_edx";
36898 case X86::EDI:
36899 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36900 return "__x86_indirect_thunk_edi";
36901 case X86::R11:
36902 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36903 return "__x86_indirect_thunk_r11";
36904 }
36905 llvm_unreachable("unexpected reg for external indirect thunk");
36906 }
36907
36908 if (Subtarget.useRetpolineIndirectCalls() ||
36909 Subtarget.useRetpolineIndirectBranches()) {
36910 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36911 switch (Reg.id()) {
36912 case X86::EAX:
36913 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36914 return "__llvm_retpoline_eax";
36915 case X86::ECX:
36916 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36917 return "__llvm_retpoline_ecx";
36918 case X86::EDX:
36919 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36920 return "__llvm_retpoline_edx";
36921 case X86::EDI:
36922 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36923 return "__llvm_retpoline_edi";
36924 case X86::R11:
36925 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36926 return "__llvm_retpoline_r11";
36927 }
36928 llvm_unreachable("unexpected reg for retpoline");
36929 }
36930
36931 if (Subtarget.useLVIControlFlowIntegrity()) {
36932 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36933 return "__llvm_lvi_thunk_r11";
36934 }
36935 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36936}
36937
36938MachineBasicBlock *
36939X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36940 MachineBasicBlock *BB) const {
36941 // Copy the virtual register into the R11 physical register and
36942 // call the retpoline thunk.
36943 const MIMetadata MIMD(MI);
36944 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36945 Register CalleeVReg = MI.getOperand(0).getReg();
36946 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36947
36948 // Find an available scratch register to hold the callee. On 64-bit, we can
36949 // just use R11, but we scan for uses anyway to ensure we don't generate
36950 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36951 // already a register use operand to the call to hold the callee. If none
36952 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36953 // register and ESI is the base pointer to realigned stack frames with VLAs.
36954 SmallVector<Register, 3> AvailableRegs;
36955 if (Subtarget.is64Bit())
36956 AvailableRegs.push_back(X86::R11);
36957 else
36958 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36959
36960 // Zero out any registers that are already used.
36961 for (const auto &MO : MI.operands()) {
36962 if (MO.isReg() && MO.isUse())
36963 llvm::replace(AvailableRegs, MO.getReg(), Register());
36964 }
36965
36966 // Choose the first remaining non-zero available register.
36967 Register AvailableReg;
36968 for (Register MaybeReg : AvailableRegs) {
36969 if (MaybeReg) {
36970 AvailableReg = MaybeReg;
36971 break;
36972 }
36973 }
36974 if (!AvailableReg)
36975 report_fatal_error("calling convention incompatible with retpoline, no "
36976 "available registers");
36977
36978 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36979
36980 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36981 .addReg(CalleeVReg);
36982 MI.getOperand(0).ChangeToES(Symbol);
36983 MI.setDesc(TII->get(Opc));
36984 MachineInstrBuilder(*BB->getParent(), &MI)
36985 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36986 return BB;
36987}
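// Net effect (sketch; the thunk symbol depends on which feature was selected
// in getIndirectThunkSymbol above):
//
//   INDIRECT_THUNK_CALL64 %callee
//
// becomes
//
//   $r11 = COPY %callee
//   CALL64pcrel32 &__llvm_retpoline_r11, implicit killed $r11
//
// so the indirect branch itself is confined to the thunk body.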
36988
36989/// SetJmp implies future control flow change upon calling the corresponding
36990/// LongJmp.
36991/// Instead of using the 'return' instruction, the long jump fixes the stack and
36992/// performs an indirect branch. To do so it uses the registers that were stored
36993/// in the jump buffer (when calling SetJmp).
36994/// In case the shadow stack is enabled we need to fix it as well, because some
36995/// return addresses will be skipped.
36996/// The function will save the SSP for future fixing in the function
36997/// emitLongJmpShadowStackFix.
36998/// \sa emitLongJmpShadowStackFix
36999/// \param [in] MI The temporary Machine Instruction for the builtin.
37000/// \param [in] MBB The Machine Basic Block that will be modified.
37001void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37002 MachineBasicBlock *MBB) const {
37003 const MIMetadata MIMD(MI);
37004 MachineFunction *MF = MBB->getParent();
37005 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37006 MachineRegisterInfo &MRI = MF->getRegInfo();
37007 MachineInstrBuilder MIB;
37008
37009 // Memory Reference.
37010 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37011
37012 // Initialize a register with zero.
37013 MVT PVT = getPointerTy(MF->getDataLayout());
37014 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37015 Register ZReg = MRI.createVirtualRegister(PtrRC);
37016 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37017 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37018 .addDef(ZReg)
37019 .addReg(ZReg, RegState::Undef)
37020 .addReg(ZReg, RegState::Undef);
37021
37022 // Read the current SSP Register value to the zeroed register.
37023 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37024 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37025 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37026
37027 // Write the SSP register value to offset 3 in input memory buffer.
37028 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37029 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37030 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37031 const unsigned MemOpndSlot = 1;
37032 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37033 if (i == X86::AddrDisp)
37034 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37035 else
37036 MIB.add(MI.getOperand(MemOpndSlot + i));
37037 }
37038 MIB.addReg(SSPCopyReg);
37039 MIB.setMemRefs(MMOs);
37040}
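// In effect (sketch, 64-bit case):
//
//   xorq   %z, %z
//   rdsspq %z                ; remains 0 when shadow stacks are not active
//   movq   %z, 24(buf)       ; slot 3 of the buffer (12(buf) with i32 ptrs)
//
// emitLongJmpShadowStackFix later reloads this slot to work out how many
// return addresses the longjmp skips.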
37041
37042MachineBasicBlock *
37043X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37044 MachineBasicBlock *MBB) const {
37045 const MIMetadata MIMD(MI);
37046 MachineFunction *MF = MBB->getParent();
37047 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37048 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37049 MachineRegisterInfo &MRI = MF->getRegInfo();
37050
37051 const BasicBlock *BB = MBB->getBasicBlock();
37052 MachineFunction::iterator I = ++MBB->getIterator();
37053
37054 // Memory Reference
37055 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37056
37057 unsigned MemOpndSlot = 0;
37058
37059 unsigned CurOp = 0;
37060
37061 Register DstReg = MI.getOperand(CurOp++).getReg();
37062 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37063 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37064 (void)TRI;
37065 Register mainDstReg = MRI.createVirtualRegister(RC);
37066 Register restoreDstReg = MRI.createVirtualRegister(RC);
37067
37068 MemOpndSlot = CurOp;
37069
37070 MVT PVT = getPointerTy(MF->getDataLayout());
37071 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37072 "Invalid Pointer Size!");
37073
37074 // For v = setjmp(buf), we generate
37075 //
37076 // thisMBB:
37077 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37078 // SjLjSetup restoreMBB
37079 //
37080 // mainMBB:
37081 // v_main = 0
37082 //
37083 // sinkMBB:
37084 // v = phi(main, restore)
37085 //
37086 // restoreMBB:
37087 // if base pointer being used, load it from frame
37088 // v_restore = 1
37089
37090 MachineBasicBlock *thisMBB = MBB;
37091 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37092 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37093 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37094 MF->insert(I, mainMBB);
37095 MF->insert(I, sinkMBB);
37096 MF->push_back(restoreMBB);
37097 restoreMBB->setMachineBlockAddressTaken();
37098
37099 MachineInstrBuilder MIB;
37100
37101 // Transfer the remainder of BB and its successor edges to sinkMBB.
37102 sinkMBB->splice(sinkMBB->begin(), MBB,
37103 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37104 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37105
37106 // thisMBB:
37107 unsigned PtrStoreOpc = 0;
37108 Register LabelReg;
37109 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37110 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37111 !isPositionIndependent();
37112
37113 // Prepare IP either in reg or imm.
37114 if (!UseImmLabel) {
37115 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37116 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37117 LabelReg = MRI.createVirtualRegister(PtrRC);
37118 if (Subtarget.is64Bit()) {
37119 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37120 .addReg(X86::RIP)
37121 .addImm(0)
37122 .addReg(0)
37123 .addMBB(restoreMBB)
37124 .addReg(0);
37125 } else {
37126 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37127 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37128 .addReg(XII->getGlobalBaseReg(MF))
37129 .addImm(0)
37130 .addReg(0)
37131 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37132 .addReg(0);
37133 }
37134 } else
37135 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37136 // Store IP
37137 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37138 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37139 if (i == X86::AddrDisp)
37140 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37141 else
37142 MIB.add(MI.getOperand(MemOpndSlot + i));
37143 }
37144 if (!UseImmLabel)
37145 MIB.addReg(LabelReg);
37146 else
37147 MIB.addMBB(restoreMBB);
37148 MIB.setMemRefs(MMOs);
37149
37150 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37151 emitSetJmpShadowStackFix(MI, thisMBB);
37152 }
37153
37154 // Setup
37155 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37156 .addMBB(restoreMBB);
37157
37158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37159 MIB.addRegMask(RegInfo->getNoPreservedMask());
37160 thisMBB->addSuccessor(mainMBB);
37161 thisMBB->addSuccessor(restoreMBB);
37162
37163 // mainMBB:
37164 // EAX = 0
37165 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37166 mainMBB->addSuccessor(sinkMBB);
37167
37168 // sinkMBB:
37169 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37170 .addReg(mainDstReg)
37171 .addMBB(mainMBB)
37172 .addReg(restoreDstReg)
37173 .addMBB(restoreMBB);
37174
37175 // restoreMBB:
37176 if (RegInfo->hasBasePointer(*MF)) {
37177 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37178 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37179 X86FI->setRestoreBasePointer(MF);
37180 Register FramePtr = RegInfo->getFrameRegister(*MF);
37181 Register BasePtr = RegInfo->getBaseRegister();
37182 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37183 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37184 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37185 .setMIFlag(MachineInstr::FrameSetup);
37186 }
37187 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37188 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37189 restoreMBB->addSuccessor(sinkMBB);
37190
37191 MI.eraseFromParent();
37192 return sinkMBB;
37193}
37194
37195/// Fix the shadow stack using the previously saved SSP pointer.
37196/// \sa emitSetJmpShadowStackFix
37197/// \param [in] MI The temporary Machine Instruction for the builtin.
37198/// \param [in] MBB The Machine Basic Block that will be modified.
37199/// \return The sink MBB that will perform the future indirect branch.
37200MachineBasicBlock *
37201X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37202 MachineBasicBlock *MBB) const {
37203 const MIMetadata MIMD(MI);
37204 MachineFunction *MF = MBB->getParent();
37205 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37206 MachineRegisterInfo &MRI = MF->getRegInfo();
37207
37208 // Memory Reference
37209 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37210
37211 MVT PVT = getPointerTy(MF->getDataLayout());
37212 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37213
37214 // checkSspMBB:
37215 // xor vreg1, vreg1
37216 // rdssp vreg1
37217 // test vreg1, vreg1
37218 // je sinkMBB # Jump if Shadow Stack is not supported
37219 // fallMBB:
37220 // mov buf+24/12(%rip), vreg2
37221 // sub vreg1, vreg2
37222 // jbe sinkMBB # No need to fix the Shadow Stack
37223 // fixShadowMBB:
37224 // shr 3/2, vreg2
37225 // incssp vreg2 # fix the SSP according to the lower 8 bits
37226 // shr 8, vreg2
37227 // je sinkMBB
37228 // fixShadowLoopPrepareMBB:
37229 // shl vreg2
37230 // mov 128, vreg3
37231 // fixShadowLoopMBB:
37232 // incssp vreg3
37233 // dec vreg2
37234 // jne fixShadowLoopMBB # Iterate until you finish fixing
37235 // # the Shadow Stack
37236 // sinkMBB:
37237
37238 MachineFunction::iterator I = ++MBB->getIterator();
37239 const BasicBlock *BB = MBB->getBasicBlock();
37240
37241 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37242 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37243 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37244 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37245 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37246 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37247 MF->insert(I, checkSspMBB);
37248 MF->insert(I, fallMBB);
37249 MF->insert(I, fixShadowMBB);
37250 MF->insert(I, fixShadowLoopPrepareMBB);
37251 MF->insert(I, fixShadowLoopMBB);
37252 MF->insert(I, sinkMBB);
37253
37254 // Transfer the remainder of BB and its successor edges to sinkMBB.
37255 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37256 MBB->end());
37257 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37258
37259 MBB->addSuccessor(checkSspMBB);
37260
37261 // Initialize a register with zero.
37262 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37263 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37264
37265 if (PVT == MVT::i64) {
37266 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37267 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37268 .addImm(0)
37269 .addReg(ZReg)
37270 .addImm(X86::sub_32bit);
37271 ZReg = TmpZReg;
37272 }
37273
37274 // Read the current SSP Register value to the zeroed register.
37275 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37276 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37277 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37278
37279 // Check whether the result of the SSP register is zero and jump directly
37280 // to the sink.
37281 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37282 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37283 .addReg(SSPCopyReg)
37284 .addReg(SSPCopyReg);
37285 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37286 .addMBB(sinkMBB)
37287 .addImm(X86::COND_E);
37288 checkSspMBB->addSuccessor(sinkMBB);
37289 checkSspMBB->addSuccessor(fallMBB);
37290
37291 // Reload the previously saved SSP register value.
37292 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37293 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37294 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37295 MachineInstrBuilder MIB =
37296 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37297 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37298 const MachineOperand &MO = MI.getOperand(i);
37299 if (i == X86::AddrDisp)
37300 MIB.addDisp(MO, SPPOffset);
37301 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37302 // preserve kill flags.
37303 MIB.addReg(MO.getReg());
37304 else
37305 MIB.add(MO);
37306 }
37307 MIB.setMemRefs(MMOs);
37308
37309 // Subtract the current SSP from the previous SSP.
37310 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37311 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37312 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37313 .addReg(PrevSSPReg)
37314 .addReg(SSPCopyReg);
37315
37316 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37317 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37318 .addMBB(sinkMBB)
37319 .addImm(X86::COND_BE);
37320 fallMBB->addSuccessor(sinkMBB);
37321 fallMBB->addSuccessor(fixShadowMBB);
37322
37323 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37324 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37325 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37326 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37327 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37328 .addReg(SspSubReg)
37329 .addImm(Offset);
37330
37331 // Increase SSP when looking only on the lower 8 bits of the delta.
37332 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37333 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37334
37335 // Reset the lower 8 bits.
37336 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37337 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37338 .addReg(SspFirstShrReg)
37339 .addImm(8);
37340
37341 // Jump if the result of the shift is zero.
37342 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37343 .addMBB(sinkMBB)
37344 .addImm(X86::COND_E);
37345 fixShadowMBB->addSuccessor(sinkMBB);
37346 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37347
37348 // Do a single shift left.
37349 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37350 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37351 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37352 .addReg(SspSecondShrReg)
37353 .addImm(1);
37354
37355 // Save the value 128 to a register (will be used next with incssp).
37356 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37357 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37358 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37359 .addImm(128);
37360 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37361
37362 // Since incssp only looks at the lower 8 bits, we might need to do several
37363 // iterations of incssp until we finish fixing the shadow stack.
37364 Register DecReg = MRI.createVirtualRegister(PtrRC);
37365 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37366 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37367 .addReg(SspAfterShlReg)
37368 .addMBB(fixShadowLoopPrepareMBB)
37369 .addReg(DecReg)
37370 .addMBB(fixShadowLoopMBB);
37371
37372 // Every iteration we increase the SSP by 128.
37373 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37374
37375 // Every iteration we decrement the counter by 1.
37376 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37377 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37378
37379 // Jump if the counter is not zero yet.
37380 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37381 .addMBB(fixShadowLoopMBB)
37382 .addImm(X86::COND_NE);
37383 fixShadowLoopMBB->addSuccessor(sinkMBB);
37384 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37385
37386 return sinkMBB;
37387}
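// Worked example (illustrative, 64-bit): if the saved SSP is 0x1208 bytes
// above the current SSP, the delta is 0x1208 >> 3 = 0x241 slots. The first
// INCSSPQ consumes the low 8 bits (0x41 slots); the remaining 0x200 slots
// are handled by the loop, whose counter is (0x241 >> 8) << 1 = 4 iterations
// of INCSSPQ with operand 128, i.e. 4 * 128 = 0x200 slots.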
37388
37389MachineBasicBlock *
37390X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37391 MachineBasicBlock *MBB) const {
37392 const MIMetadata MIMD(MI);
37393 MachineFunction *MF = MBB->getParent();
37394 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37395 MachineRegisterInfo &MRI = MF->getRegInfo();
37396
37397 // Memory Reference
37398 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37399
37400 MVT PVT = getPointerTy(MF->getDataLayout());
37401 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37402 "Invalid Pointer Size!");
37403
37404 const TargetRegisterClass *RC =
37405 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37406 Register Tmp = MRI.createVirtualRegister(RC);
37407 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37408 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37409 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37410 Register SP = RegInfo->getStackRegister();
37411
37412 MachineInstrBuilder MIB;
37413
37414 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37415 const int64_t SPOffset = 2 * PVT.getStoreSize();
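// Buffer layout assumed here (and matched by the reloads below): slot 0 holds
// the frame pointer, slot 1 the label to resume at, and slot 2 the stack
// pointer, each one pointer-size wide; LabelOffset and SPOffset select slots
// 1 and 2.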
37416
37417 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37418 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37419
37420 MachineBasicBlock *thisMBB = MBB;
37421
37422 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37423 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37424 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37425 }
37426
37427 // Reload FP
37428 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37429 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37430 const MachineOperand &MO = MI.getOperand(i);
37431 if (MO.isReg()) // Don't add the whole operand, we don't want to
37432 // preserve kill flags.
37433 MIB.addReg(MO.getReg());
37434 else
37435 MIB.add(MO);
37436 }
37437 MIB.setMemRefs(MMOs);
37439
37440 // Reload IP
37441 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37442 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37443 const MachineOperand &MO = MI.getOperand(i);
37444 if (i == X86::AddrDisp)
37445 MIB.addDisp(MO, LabelOffset);
37446 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37447 // preserve kill flags.
37448 MIB.addReg(MO.getReg());
37449 else
37450 MIB.add(MO);
37451 }
37452 MIB.setMemRefs(MMOs);
37453
37454 // Reload SP
37455 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37456 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37457 if (i == X86::AddrDisp)
37458 MIB.addDisp(MI.getOperand(i), SPOffset);
37459 else
37460 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37461 // the last instruction of the expansion.
37462 }
37463 MIB.setMemRefs(MMOs);
37465
37466 // Jump
37467 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37468
37469 MI.eraseFromParent();
37470 return thisMBB;
37471}
37472
37473void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37474 MachineBasicBlock *MBB,
37475 MachineBasicBlock *DispatchBB,
37476 int FI) const {
37477 const MIMetadata MIMD(MI);
37478 MachineFunction *MF = MBB->getParent();
37479 MachineRegisterInfo *MRI = &MF->getRegInfo();
37480 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37481
37482 MVT PVT = getPointerTy(MF->getDataLayout());
37483 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37484
37485 unsigned Op = 0;
37486 Register VR;
37487
37488 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37489 !isPositionIndependent();
37490
37491 if (UseImmLabel) {
37492 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37493 } else {
37494 const TargetRegisterClass *TRC =
37495 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37496 VR = MRI->createVirtualRegister(TRC);
37497 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37498
37499 if (Subtarget.is64Bit())
37500 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37501 .addReg(X86::RIP)
37502 .addImm(1)
37503 .addReg(0)
37504 .addMBB(DispatchBB)
37505 .addReg(0);
37506 else
37507 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37508 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37509 .addImm(1)
37510 .addReg(0)
37511 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37512 .addReg(0);
37513 }
37514
37515 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37516 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37517 if (UseImmLabel)
37518 MIB.addMBB(DispatchBB);
37519 else
37520 MIB.addReg(VR);
37521}
37522
37523MachineBasicBlock *
37524X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37525 MachineBasicBlock *BB) const {
37526 const MIMetadata MIMD(MI);
37527 MachineFunction *MF = BB->getParent();
37528 MachineRegisterInfo *MRI = &MF->getRegInfo();
37529 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37530 int FI = MF->getFrameInfo().getFunctionContextIndex();
37531
37532 // Get a mapping of the call site numbers to all of the landing pads they're
37533 // associated with.
37534 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37535 unsigned MaxCSNum = 0;
37536 for (auto &MBB : *MF) {
37537 if (!MBB.isEHPad())
37538 continue;
37539
37540 MCSymbol *Sym = nullptr;
37541 for (const auto &MI : MBB) {
37542 if (MI.isDebugInstr())
37543 continue;
37544
37545 assert(MI.isEHLabel() && "expected EH_LABEL");
37546 Sym = MI.getOperand(0).getMCSymbol();
37547 break;
37548 }
37549
37550 if (!MF->hasCallSiteLandingPad(Sym))
37551 continue;
37552
37553 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37554 CallSiteNumToLPad[CSI].push_back(&MBB);
37555 MaxCSNum = std::max(MaxCSNum, CSI);
37556 }
37557 }
37558
37559 // Get an ordered list of the machine basic blocks for the jump table.
37560 std::vector<MachineBasicBlock *> LPadList;
37561 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37562 LPadList.reserve(CallSiteNumToLPad.size());
37563
37564 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37565 for (auto &LP : CallSiteNumToLPad[CSI]) {
37566 LPadList.push_back(LP);
37567 InvokeBBs.insert_range(LP->predecessors());
37568 }
37569 }
37570
37571 assert(!LPadList.empty() &&
37572 "No landing pad destinations for the dispatch jump table!");
37573
37574 // Create the MBBs for the dispatch code.
37575
37576 // Shove the dispatch's address into the return slot in the function context.
37577 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37578 DispatchBB->setIsEHPad(true);
37579
37580 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37581 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37582 DispatchBB->addSuccessor(TrapBB);
37583
37584 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37585 DispatchBB->addSuccessor(DispContBB);
37586
37587 // Insert MBBs.
37588 MF->push_back(DispatchBB);
37589 MF->push_back(DispContBB);
37590 MF->push_back(TrapBB);
37591
37592 // Insert code into the entry block that creates and registers the function
37593 // context.
37594 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37595
37596 // Create the jump table and associated information
37597 unsigned JTE = getJumpTableEncoding();
37598 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37599 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37600
37601 const X86RegisterInfo &RI = TII->getRegisterInfo();
37602 // Add a register mask with no preserved registers. This results in all
37603 // registers being marked as clobbered.
37604 if (RI.hasBasePointer(*MF)) {
37605 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37606 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37607 MFI->setRestoreBasePointer(MF);
37608
37609 Register FP = RI.getFrameRegister(*MF);
37610 Register BP = RI.getBaseRegister();
37611 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37612 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37613 MFI->getRestoreBasePointerOffset())
37614 .addRegMask(RI.getNoPreservedMask());
37615 } else {
37616 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37617 .addRegMask(RI.getNoPreservedMask());
37618 }
37619
37620 // IReg is used as an index in a memory operand and therefore can't be SP
37621 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37622 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37623 Subtarget.is64Bit() ? 8 : 4);
37624 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37625 .addReg(IReg)
37626 .addImm(LPadList.size());
37627 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37628 .addMBB(TrapBB)
37629 .addImm(X86::COND_AE);
37630
37631 if (Subtarget.is64Bit()) {
37632 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37633 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37634
37635 // leaq .LJTI0_0(%rip), BReg
37636 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37637 .addReg(X86::RIP)
37638 .addImm(1)
37639 .addReg(0)
37640 .addJumpTableIndex(MJTI)
37641 .addReg(0);
37642 // movzx IReg64, IReg
37643 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37644 .addImm(0)
37645 .addReg(IReg)
37646 .addImm(X86::sub_32bit);
37647
37648 switch (JTE) {
37649 case MachineJumpTableInfo::EK_BlockAddress:
37650 // jmpq *(BReg,IReg64,8)
37651 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37652 .addReg(BReg)
37653 .addImm(8)
37654 .addReg(IReg64)
37655 .addImm(0)
37656 .addReg(0);
37657 break;
37658 case MachineJumpTableInfo::EK_LabelDifference32: {
37659 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37660 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37661 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37662
37663 // movl (BReg,IReg64,4), OReg
37664 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37665 .addReg(BReg)
37666 .addImm(4)
37667 .addReg(IReg64)
37668 .addImm(0)
37669 .addReg(0);
37670 // movsx OReg64, OReg
37671 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37672 .addReg(OReg);
37673 // addq BReg, OReg64, TReg
37674 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37675 .addReg(OReg64)
37676 .addReg(BReg);
37677 // jmpq *TReg
37678 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37679 break;
37680 }
37681 default:
37682 llvm_unreachable("Unexpected jump table encoding");
37683 }
37684 } else {
37685 // jmpl *.LJTI0_0(,IReg,4)
37686 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37687 .addReg(0)
37688 .addImm(4)
37689 .addReg(IReg)
37690 .addJumpTableIndex(MJTI)
37691 .addReg(0);
37692 }
37693
37694 // Add the jump table entries as successors to the MBB.
37695 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37696 for (auto &LP : LPadList)
37697 if (SeenMBBs.insert(LP).second)
37698 DispContBB->addSuccessor(LP);
37699
37700 // N.B. the order the invoke BBs are processed in doesn't matter here.
37701 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37702 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37703 for (MachineBasicBlock *MBB : InvokeBBs) {
37704 // Remove the landing pad successor from the invoke block and replace it
37705 // with the new dispatch block.
37706 // Keep a copy of Successors since it's modified inside the loop.
37707 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37708 MBB->succ_rend());
37709 // FIXME: Avoid quadratic complexity.
37710 for (auto *MBBS : Successors) {
37711 if (MBBS->isEHPad()) {
37712 MBB->removeSuccessor(MBBS);
37713 MBBLPads.push_back(MBBS);
37714 }
37715 }
37716
37717 MBB->addSuccessor(DispatchBB);
37718
37719 // Find the invoke call and mark all of the callee-saved registers as
37720 // 'implicit defined' so that they're spilled. This prevents later passes
37721 // from moving instructions to before the EH block, where they would never
37722 // be executed.
37723 for (auto &II : reverse(*MBB)) {
37724 if (!II.isCall())
37725 continue;
37726
37727 DenseSet<Register> DefRegs;
37728 for (auto &MOp : II.operands())
37729 if (MOp.isReg())
37730 DefRegs.insert(MOp.getReg());
37731
37732 MachineInstrBuilder MIB(*MF, &II);
37733 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37734 Register Reg = SavedRegs[RegIdx];
37735 if (!DefRegs.contains(Reg))
37736 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37737 }
37738
37739 break;
37740 }
37741 }
37742
37743 // Mark all former landing pads as non-landing pads. The dispatch is the only
37744 // landing pad now.
37745 for (auto &LP : MBBLPads)
37746 LP->setIsEHPad(false);
37747
37748 // The instruction is gone now.
37749 MI.eraseFromParent();
37750 return BB;
37751}
37752
37753MachineBasicBlock *
37754X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37755 MachineBasicBlock *BB) const {
37756 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37757 // calls may require proper stack alignment.
37758 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37759 const MIMetadata MIMD(MI);
37760 MachineFunction &MF = *BB->getParent();
37761
37762 // Emit CALLSEQ_START right before the instruction.
37763 MF.getFrameInfo().setAdjustsStack(true);
37764 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37765 MachineInstrBuilder CallseqStart =
37766 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37767 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37768
37769 // Emit CALLSEQ_END right after the instruction.
37770 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37771 MachineInstrBuilder CallseqEnd =
37772 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37773 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37774
37775 return BB;
37776}
37777
37778MachineBasicBlock *
37779X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37780 MachineBasicBlock *BB) const {
37781 MachineFunction *MF = BB->getParent();
37782 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37783 const MIMetadata MIMD(MI);
37784
37785 auto TMMImmToTMMReg = [](unsigned Imm) {
37786 assert (Imm < 8 && "Illegal tmm index");
37787 return X86::TMM0 + Imm;
37788 };
37789 auto TMMImmToTMMPair = [](unsigned Imm) {
37790 assert(Imm < 8 && "Illegal tmm pair index.");
37791 return X86::TMM0_TMM1 + Imm / 2;
37792 };
37793 switch (MI.getOpcode()) {
37794 default:
37795 llvm_unreachable("Unexpected instr type to insert");
37796 case X86::INDIRECT_THUNK_CALL32:
37797 case X86::INDIRECT_THUNK_CALL64:
37798 case X86::INDIRECT_THUNK_TCRETURN32:
37799 case X86::INDIRECT_THUNK_TCRETURN64:
37800 return EmitLoweredIndirectThunk(MI, BB);
37801 case X86::CATCHRET:
37802 return EmitLoweredCatchRet(MI, BB);
37803 case X86::SEG_ALLOCA_32:
37804 case X86::SEG_ALLOCA_64:
37805 return EmitLoweredSegAlloca(MI, BB);
37806 case X86::PROBED_ALLOCA_32:
37807 case X86::PROBED_ALLOCA_64:
37808 return EmitLoweredProbedAlloca(MI, BB);
37809 case X86::TLSCall_32:
37810 case X86::TLSCall_64:
37811 return EmitLoweredTLSCall(MI, BB);
37812 case X86::CMOV_FR16:
37813 case X86::CMOV_FR16X:
37814 case X86::CMOV_FR32:
37815 case X86::CMOV_FR32X:
37816 case X86::CMOV_FR64:
37817 case X86::CMOV_FR64X:
37818 case X86::CMOV_GR8:
37819 case X86::CMOV_GR16:
37820 case X86::CMOV_GR32:
37821 case X86::CMOV_RFP32:
37822 case X86::CMOV_RFP64:
37823 case X86::CMOV_RFP80:
37824 case X86::CMOV_VR64:
37825 case X86::CMOV_VR128:
37826 case X86::CMOV_VR128X:
37827 case X86::CMOV_VR256:
37828 case X86::CMOV_VR256X:
37829 case X86::CMOV_VR512:
37830 case X86::CMOV_VK1:
37831 case X86::CMOV_VK2:
37832 case X86::CMOV_VK4:
37833 case X86::CMOV_VK8:
37834 case X86::CMOV_VK16:
37835 case X86::CMOV_VK32:
37836 case X86::CMOV_VK64:
37837 return EmitLoweredSelect(MI, BB);
37838
37839 case X86::FP80_ADDr:
37840 case X86::FP80_ADDm32: {
37841 // Change the floating point control register to use double extended
37842 // precision when performing the addition.
37843 int OrigCWFrameIdx =
37844 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37845 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37846 OrigCWFrameIdx);
37847
37848 // Load the old value of the control word...
37849 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37850 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37851 OrigCWFrameIdx);
37852
37853 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37854 // precision.
37855 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37856 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37857 .addReg(OldCW, RegState::Kill)
37858 .addImm(0x300);
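// 0x300 is 0b11 << 8: it sets the two-bit precision-control field (bits 8-9
// of the x87 control word) to 11b, selecting 64-bit (double extended)
// precision for the addition emitted below.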
37859
37860 // Extract to 16 bits.
37861 Register NewCW16 =
37862 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37863 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37864 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37865
37866 // Prepare memory for FLDCW.
37867 int NewCWFrameIdx =
37868 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37869 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37870 NewCWFrameIdx)
37871 .addReg(NewCW16, RegState::Kill);
37872
37873 // Reload the modified control word now...
37874 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37875 NewCWFrameIdx);
37876
37877 // Do the addition.
37878 if (MI.getOpcode() == X86::FP80_ADDr) {
37879 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37880 .add(MI.getOperand(0))
37881 .add(MI.getOperand(1))
37882 .add(MI.getOperand(2));
37883 } else {
37884 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37885 .add(MI.getOperand(0))
37886 .add(MI.getOperand(1))
37887 .add(MI.getOperand(2))
37888 .add(MI.getOperand(3))
37889 .add(MI.getOperand(4))
37890 .add(MI.getOperand(5))
37891 .add(MI.getOperand(6));
37892 }
37893
37894 // Reload the original control word now.
37895 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37896 OrigCWFrameIdx);
37897
37898 MI.eraseFromParent(); // The pseudo instruction is gone now.
37899 return BB;
37900 }
37901
37902 case X86::FP32_TO_INT16_IN_MEM:
37903 case X86::FP32_TO_INT32_IN_MEM:
37904 case X86::FP32_TO_INT64_IN_MEM:
37905 case X86::FP64_TO_INT16_IN_MEM:
37906 case X86::FP64_TO_INT32_IN_MEM:
37907 case X86::FP64_TO_INT64_IN_MEM:
37908 case X86::FP80_TO_INT16_IN_MEM:
37909 case X86::FP80_TO_INT32_IN_MEM:
37910 case X86::FP80_TO_INT64_IN_MEM: {
37911 // Change the floating point control register to use "round towards zero"
37912 // mode when truncating to an integer value.
37913 int OrigCWFrameIdx =
37914 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37915 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37916 OrigCWFrameIdx);
37917
37918 // Load the old value of the control word...
37919 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37920 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37921 OrigCWFrameIdx);
37922
37923 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37924 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37925 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37926 .addReg(OldCW, RegState::Kill).addImm(0xC00);
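// 0xC00 is 0b11 << 10: it sets the rounding-control field (bits 10-11 of the
// x87 control word) to 11b, i.e. round toward zero, which gives the
// truncating behaviour these FP*_TO_INT*_IN_MEM pseudos require.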
37927
37928 // Extract to 16 bits.
37929 Register NewCW16 =
37930 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37931 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37932 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37933
37934 // Prepare memory for FLDCW.
37935 int NewCWFrameIdx =
37936 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37937 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37938 NewCWFrameIdx)
37939 .addReg(NewCW16, RegState::Kill);
37940
37941 // Reload the modified control word now...
37942 addFrameReference(BuildMI(*BB, MI, MIMD,
37943 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37944
37945 // Get the X86 opcode to use.
37946 unsigned Opc;
37947 switch (MI.getOpcode()) {
37948 // clang-format off
37949 default: llvm_unreachable("illegal opcode!");
37950 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37951 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37952 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37953 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37954 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37955 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37956 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37957 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37958 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37959 // clang-format on
37960 }
37961
37962 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37963 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37964 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37965
37966 // Reload the original control word now.
37967 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37968 OrigCWFrameIdx);
37969
37970 MI.eraseFromParent(); // The pseudo instruction is gone now.
37971 return BB;
37972 }
37973
37974 // xbegin
37975 case X86::XBEGIN:
37976 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37977
37978 case X86::VAARG_64:
37979 case X86::VAARG_X32:
37980 return EmitVAARGWithCustomInserter(MI, BB);
37981
37982 case X86::EH_SjLj_SetJmp32:
37983 case X86::EH_SjLj_SetJmp64:
37984 return emitEHSjLjSetJmp(MI, BB);
37985
37986 case X86::EH_SjLj_LongJmp32:
37987 case X86::EH_SjLj_LongJmp64:
37988 return emitEHSjLjLongJmp(MI, BB);
37989
37990 case X86::Int_eh_sjlj_setup_dispatch:
37991 return EmitSjLjDispatchBlock(MI, BB);
37992
37993 case TargetOpcode::STATEPOINT:
37994 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37995 // this point in the process. We diverge later.
37996 return emitPatchPoint(MI, BB);
37997
37998 case TargetOpcode::STACKMAP:
37999 case TargetOpcode::PATCHPOINT:
38000 return emitPatchPoint(MI, BB);
38001
38002 case TargetOpcode::PATCHABLE_EVENT_CALL:
38003 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38004 return emitPatchableEventCall(MI, BB);
38005
38006 case X86::LCMPXCHG8B: {
38007 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38008 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
38009 // requires a memory operand. If the current target is i686 and the current
38010 // function needs a base pointer - which is ESI on i686 - the register
38011 // allocator would not be able to allocate registers for an address of the
38012 // form X(%reg, %reg, Y): there would never be enough unreserved registers
38013 // during regalloc (without the base pointer the only option would be
38014 // X(%edi, %esi, Y)).
38015 // We give the register allocator a hand by precomputing the address in a
38016 // new vreg using LEA.
38017
38018 // If it is not i686 or there is no base pointer - nothing to do here.
38019 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38020 return BB;
38021
38022 // Even though this code does not necessarily need the base pointer to
38023 // be ESI, we check for that. The reason: if this assert fails, something
38024 // has changed in the compiler's base pointer handling, and that change
38025 // most probably has to be addressed here as well.
38026 assert(TRI->getBaseRegister() == X86::ESI &&
38027 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38028 "base pointer in mind");
38029
38030 MachineRegisterInfo &MRI = MF->getRegInfo();
38031 MVT SPTy = getPointerTy(MF->getDataLayout());
38032 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38033 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38034
38035 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38036 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38037 // does not use index register.
38038 if (AM.IndexReg == X86::NoRegister)
38039 return BB;
38040
38041 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38042 // four operand definitions that are E[ABCD] registers. We skip them and
38043 // then insert the LEA.
38044 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38045 while (RMBBI != BB->rend() &&
38046 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38047 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38048 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38049 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38050 ++RMBBI;
38051 }
38052 MachineBasicBlock::iterator MBBI(RMBBI);
38053 addFullAddress(
38054 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38055
38056 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38057
38058 return BB;
38059 }
38060 case X86::LCMPXCHG16B_NO_RBX: {
38061 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38062 Register BasePtr = TRI->getBaseRegister();
38063 if (TRI->hasBasePointer(*MF) &&
38064 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38065 if (!BB->isLiveIn(BasePtr))
38066 BB->addLiveIn(BasePtr);
38067 // Save RBX into a virtual register.
38068 Register SaveRBX =
38069 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38070 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38071 .addReg(X86::RBX);
38072 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38073 MachineInstrBuilder MIB =
38074 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38075 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38076 MIB.add(MI.getOperand(Idx));
38077 MIB.add(MI.getOperand(X86::AddrNumOperands));
38078 MIB.addReg(SaveRBX);
38079 } else {
38080 // Simple case, just copy the virtual register to RBX.
38081 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38082 .add(MI.getOperand(X86::AddrNumOperands));
38083 MachineInstrBuilder MIB =
38084 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38085 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38086 MIB.add(MI.getOperand(Idx));
38087 }
38088 MI.eraseFromParent();
38089 return BB;
38090 }
38091 case X86::MWAITX: {
38092 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38093 Register BasePtr = TRI->getBaseRegister();
38094 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38095 // If there is no need to save the base pointer, we generate MWAITXrrr;
38096 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38097 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38098 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38099 .addReg(MI.getOperand(0).getReg());
38100 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38101 .addReg(MI.getOperand(1).getReg());
38102 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38103 .addReg(MI.getOperand(2).getReg());
38104 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38105 MI.eraseFromParent();
38106 } else {
38107 if (!BB->isLiveIn(BasePtr)) {
38108 BB->addLiveIn(BasePtr);
38109 }
38110 // Parameters can be copied into ECX and EAX but not EBX yet.
38111 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38112 .addReg(MI.getOperand(0).getReg());
38113 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38114 .addReg(MI.getOperand(1).getReg());
38115 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38116 // Save RBX into a virtual register.
38117 Register SaveRBX =
38118 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38119 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38120 .addReg(X86::RBX);
38121 // Generate mwaitx pseudo.
38122 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38123 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38124 .addDef(Dst) // Destination tied in with SaveRBX.
38125 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38126 .addUse(SaveRBX); // Save of base pointer.
38127 MI.eraseFromParent();
38128 }
38129 return BB;
38130 }
38131 case TargetOpcode::PREALLOCATED_SETUP: {
38132 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38133 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38134 MFI->setHasPreallocatedCall(true);
38135 int64_t PreallocatedId = MI.getOperand(0).getImm();
38136 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38137 assert(StackAdjustment != 0 && "0 stack adjustment");
38138 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38139 << StackAdjustment << "\n");
38140 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38141 .addReg(X86::ESP)
38142 .addImm(StackAdjustment);
38143 MI.eraseFromParent();
38144 return BB;
38145 }
38146 case TargetOpcode::PREALLOCATED_ARG: {
38147 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38148 int64_t PreallocatedId = MI.getOperand(1).getImm();
38149 int64_t ArgIdx = MI.getOperand(2).getImm();
38150 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38151 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38152 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38153 << ", arg offset " << ArgOffset << "\n");
38154 // stack pointer + offset
38155 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38156 MI.getOperand(0).getReg()),
38157 X86::ESP, false, ArgOffset);
38158 MI.eraseFromParent();
38159 return BB;
38160 }
38161 case X86::PTDPBSSD:
38162 case X86::PTDPBSUD:
38163 case X86::PTDPBUSD:
38164 case X86::PTDPBUUD:
38165 case X86::PTDPBF16PS:
38166 case X86::PTDPFP16PS:
38167 case X86::PTCMMIMFP16PS:
38168 case X86::PTCMMRLFP16PS:
38169 case X86::PTDPBF8PS:
38170 case X86::PTDPBHF8PS:
38171 case X86::PTDPHBF8PS:
38172 case X86::PTDPHF8PS:
38173 case X86::PTTDPBF16PS:
38174 case X86::PTTDPFP16PS:
38175 case X86::PTTCMMIMFP16PS:
38176 case X86::PTTCMMRLFP16PS:
38177 case X86::PTCONJTCMMIMFP16PS:
38178 case X86::PTMMULTF32PS:
38179 case X86::PTTMMULTF32PS: {
38180 unsigned Opc;
38181 switch (MI.getOpcode()) {
38182 default: llvm_unreachable("illegal opcode!");
38183 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38184 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38185 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38186 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38187 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38188 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38189 case X86::PTCMMIMFP16PS:
38190 Opc = X86::TCMMIMFP16PS;
38191 break;
38192 case X86::PTCMMRLFP16PS:
38193 Opc = X86::TCMMRLFP16PS;
38194 break;
38195 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38196 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38197 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38198 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38199 case X86::PTTDPBF16PS:
38200 Opc = X86::TTDPBF16PS;
38201 break;
38202 case X86::PTTDPFP16PS:
38203 Opc = X86::TTDPFP16PS;
38204 break;
38205 case X86::PTTCMMIMFP16PS:
38206 Opc = X86::TTCMMIMFP16PS;
38207 break;
38208 case X86::PTTCMMRLFP16PS:
38209 Opc = X86::TTCMMRLFP16PS;
38210 break;
38211 case X86::PTCONJTCMMIMFP16PS:
38212 Opc = X86::TCONJTCMMIMFP16PS;
38213 break;
38214 case X86::PTMMULTF32PS:
38215 Opc = X86::TMMULTF32PS;
38216 break;
38217 case X86::PTTMMULTF32PS:
38218 Opc = X86::TTMMULTF32PS;
38219 break;
38220 }
38221
38222 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38223 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38224 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38225 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38226 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38227
38228 MI.eraseFromParent(); // The pseudo is gone now.
38229 return BB;
38230 }
38231 case X86::PTILEZERO: {
38232 unsigned Imm = MI.getOperand(0).getImm();
38233 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38234 MI.eraseFromParent(); // The pseudo is gone now.
38235 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38236 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38237 return BB;
38238 }
38239 case X86::PTILEZEROV: {
38240 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38241 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38242 return BB;
38243 }
38244 case X86::PTILELOADDRS:
38245 case X86::PTILELOADDRST1:
38246 case X86::PTILELOADD:
38247 case X86::PTILELOADDT1:
38248 case X86::PTILESTORED: {
38249 unsigned Opc;
38250 switch (MI.getOpcode()) {
38251 default: llvm_unreachable("illegal opcode!");
38252#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38253 case X86::PTILELOADD:
38254 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38255 break;
38256 case X86::PTILELOADDT1:
38257 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38258 break;
38259 case X86::PTILESTORED:
38260 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38261 break;
38262 case X86::PTILELOADDRS:
38263 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38264 break;
38265 case X86::PTILELOADDRST1:
38266 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38267 break;
38268 }
38269#undef GET_EGPR_IF_ENABLED
38270
38271 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38272 unsigned CurOp = 0;
38273 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38274 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38275 RegState::Define);
38276
38277 MIB.add(MI.getOperand(CurOp++)); // base
38278 MIB.add(MI.getOperand(CurOp++)); // scale
38279 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38280 MIB.add(MI.getOperand(CurOp++)); // displacement
38281 MIB.add(MI.getOperand(CurOp++)); // segment
38282
38283 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38284 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38285 RegState::Undef);
38286
38287 MI.eraseFromParent(); // The pseudo is gone now.
38288 return BB;
38289 }
38290 case X86::PT2RPNTLVWZ0:
38291 case X86::PT2RPNTLVWZ0T1:
38292 case X86::PT2RPNTLVWZ1:
38293 case X86::PT2RPNTLVWZ1T1:
38294 case X86::PT2RPNTLVWZ0RS:
38295 case X86::PT2RPNTLVWZ0RST1:
38296 case X86::PT2RPNTLVWZ1RS:
38297 case X86::PT2RPNTLVWZ1RST1: {
38298 const DebugLoc &DL = MI.getDebugLoc();
38299 unsigned Opc;
38300#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38301 switch (MI.getOpcode()) {
38302 default:
38303 llvm_unreachable("Unexpected instruction!");
38304 case X86::PT2RPNTLVWZ0:
38305 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38306 break;
38307 case X86::PT2RPNTLVWZ0T1:
38308 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38309 break;
38310 case X86::PT2RPNTLVWZ1:
38311 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38312 break;
38313 case X86::PT2RPNTLVWZ1T1:
38314 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38315 break;
38316 case X86::PT2RPNTLVWZ0RS:
38317 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38318 break;
38319 case X86::PT2RPNTLVWZ0RST1:
38320 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38321 break;
38322 case X86::PT2RPNTLVWZ1RS:
38323 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38324 break;
38325 case X86::PT2RPNTLVWZ1RST1:
38326 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38327 break;
38328 }
38329#undef GET_EGPR_IF_ENABLED
38330 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38331 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38332
38333 MIB.add(MI.getOperand(1)); // base
38334 MIB.add(MI.getOperand(2)); // scale
38335 MIB.add(MI.getOperand(3)); // index
38336 MIB.add(MI.getOperand(4)); // displacement
38337 MIB.add(MI.getOperand(5)); // segment
38338 MI.eraseFromParent(); // The pseudo is gone now.
38339 return BB;
38340 }
38341 case X86::PTTRANSPOSED:
38342 case X86::PTCONJTFP16: {
38343 const DebugLoc &DL = MI.getDebugLoc();
38344 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38345 : X86::TCONJTFP16;
38346
38347 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38348 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38349 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38350
38351 MI.eraseFromParent(); // The pseudo is gone now.
38352 return BB;
38353 }
38354 case X86::PTCVTROWPS2BF16Hrri:
38355 case X86::PTCVTROWPS2BF16Lrri:
38356 case X86::PTCVTROWPS2PHHrri:
38357 case X86::PTCVTROWPS2PHLrri:
38358 case X86::PTCVTROWD2PSrri:
38359 case X86::PTILEMOVROWrri: {
38360 const DebugLoc &DL = MI.getDebugLoc();
38361 unsigned Opc;
38362 switch (MI.getOpcode()) {
38363 default:
38364 llvm_unreachable("Unexpected instruction!");
38365 case X86::PTCVTROWD2PSrri:
38366 Opc = X86::TCVTROWD2PSrri;
38367 break;
38368 case X86::PTCVTROWPS2BF16Hrri:
38369 Opc = X86::TCVTROWPS2BF16Hrri;
38370 break;
38371 case X86::PTCVTROWPS2PHHrri:
38372 Opc = X86::TCVTROWPS2PHHrri;
38373 break;
38374 case X86::PTCVTROWPS2BF16Lrri:
38375 Opc = X86::TCVTROWPS2BF16Lrri;
38376 break;
38377 case X86::PTCVTROWPS2PHLrri:
38378 Opc = X86::TCVTROWPS2PHLrri;
38379 break;
38380 case X86::PTILEMOVROWrri:
38381 Opc = X86::TILEMOVROWrri;
38382 break;
38383 }
38384 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38385 MIB.add(MI.getOperand(0));
38386 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38387 MIB.addImm(MI.getOperand(2).getImm());
38388
38389 MI.eraseFromParent(); // The pseudo is gone now.
38390 return BB;
38391 }
38392 case X86::PTCVTROWPS2BF16Hrre:
38393 case X86::PTCVTROWPS2BF16Lrre:
38394 case X86::PTCVTROWPS2PHHrre:
38395 case X86::PTCVTROWPS2PHLrre:
38396 case X86::PTCVTROWD2PSrre:
38397 case X86::PTILEMOVROWrre: {
38398 const DebugLoc &DL = MI.getDebugLoc();
38399 unsigned Opc;
38400 switch (MI.getOpcode()) {
38401 default:
38402 llvm_unreachable("Unexpected instruction!");
38403 case X86::PTCVTROWD2PSrre:
38404 Opc = X86::TCVTROWD2PSrre;
38405 break;
38406 case X86::PTCVTROWPS2BF16Hrre:
38407 Opc = X86::TCVTROWPS2BF16Hrre;
38408 break;
38409 case X86::PTCVTROWPS2BF16Lrre:
38410 Opc = X86::TCVTROWPS2BF16Lrre;
38411 break;
38412 case X86::PTCVTROWPS2PHHrre:
38413 Opc = X86::TCVTROWPS2PHHrre;
38414 break;
38415 case X86::PTCVTROWPS2PHLrre:
38416 Opc = X86::TCVTROWPS2PHLrre;
38417 break;
38418 case X86::PTILEMOVROWrre:
38419 Opc = X86::TILEMOVROWrre;
38420 break;
38421 }
38422 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38423 MIB.add(MI.getOperand(0));
38424 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38425 MIB.add(MI.getOperand(2));
38426
38427 MI.eraseFromParent(); // The pseudo is gone now.
38428 return BB;
38429 }
38430 }
38431}
38432
38433//===----------------------------------------------------------------------===//
38434// X86 Optimization Hooks
38435//===----------------------------------------------------------------------===//
38436
38437bool
38438X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38439 const APInt &DemandedBits,
38440 const APInt &DemandedElts,
38441 TargetLoweringOpt &TLO) const {
38442 EVT VT = Op.getValueType();
38443 unsigned Opcode = Op.getOpcode();
38444 unsigned EltSize = VT.getScalarSizeInBits();
38445
38446 if (VT.isVector()) {
38447 // If the constant is only all signbits in the active bits, then we should
38448 // extend it to the entire constant to allow it to act as a boolean constant
38449 // vector.
38450 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38451 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38452 return false;
38453 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38454 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38455 continue;
38456 const APInt &Val = V.getConstantOperandAPInt(i);
38457 if (Val.getBitWidth() > Val.getNumSignBits() &&
38458 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38459 return true;
38460 }
38461 return false;
38462 };
38463 // For vectors - if we have a constant, then try to sign extend.
38464 // TODO: Handle AND cases.
38465 unsigned ActiveBits = DemandedBits.getActiveBits();
38466 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38467 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38468 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38469 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38470 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38471 VT.getVectorNumElements());
38472 SDValue NewC =
38473 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38474 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38475 SDValue NewOp =
38476 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38477 return TLO.CombineTo(Op, NewOp);
38478 }
38479 return false;
38480 }
38481
38482 // Only optimize Ands to prevent shrinking a constant that could be
38483 // matched by movzx.
38484 if (Opcode != ISD::AND)
38485 return false;
38486
38487 // Make sure the RHS really is a constant.
38488 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38489 if (!C)
38490 return false;
38491
38492 const APInt &Mask = C->getAPIntValue();
38493
38494 // Clear all non-demanded bits initially.
38495 APInt ShrunkMask = Mask & DemandedBits;
38496
38497 // Find the width of the shrunk mask.
38498 unsigned Width = ShrunkMask.getActiveBits();
38499
38500 // If the mask is all 0s there's nothing to do here.
38501 if (Width == 0)
38502 return false;
38503
38504 // Find the next power of 2 width, rounding up to a byte.
38505 Width = llvm::bit_ceil(std::max(Width, 8U));
38506 // Truncate the width to size to handle illegal types.
38507 Width = std::min(Width, EltSize);
38508
38509 // Calculate a possible zero extend mask for this constant.
38510 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
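// Illustrative example: Mask = 0x3FF with DemandedBits = 0xFF gives
// ShrunkMask = 0xFF and Width = 8, so ZeroExtendMask = 0xFF. That differs
// from Mask and is a subset of Mask | ~DemandedBits, so the AND constant is
// rewritten to 0xFF, which isel can typically match with a movzx.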
38511
38512 // If we aren't changing the mask, just return true to keep it and prevent
38513 // the caller from optimizing.
38514 if (ZeroExtendMask == Mask)
38515 return true;
38516
38517 // Make sure the new mask can be represented by a combination of mask bits
38518 // and non-demanded bits.
38519 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38520 return false;
38521
38522 // Replace the constant with the zero extend mask.
38523 SDLoc DL(Op);
38524 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38525 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38526 return TLO.CombineTo(Op, NewOp);
38527}
38528
38529static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38530 KnownBits &Known,
38531 const APInt &DemandedElts,
38532 const SelectionDAG &DAG, unsigned Depth) {
38533 KnownBits Known2;
38534 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38535 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38536 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38537 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38538 Known = KnownBits::abdu(Known, Known2).zext(16);
38539 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
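// Each add(Known, Known) below conservatively models one level of that
// reduction tree: all operands at a level share the same known-bits
// approximation, so adding the approximation to itself bounds that level.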
38540 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38541 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38542 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38543 Known = Known.zext(64);
38544}
38545
38546static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38547 KnownBits &Known,
38548 const APInt &DemandedElts,
38549 const SelectionDAG &DAG,
38550 unsigned Depth) {
38551 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38552
38553 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38554 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38555 APInt DemandedLoElts =
38556 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38557 APInt DemandedHiElts =
38558 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
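// Each i32 result element consumes one adjacent pair of i16 source elements;
// the 0b01 and 0b10 splat masks select the even (lo) and odd (hi) element of
// every demanded pair, respectively.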
38559 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38560 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38561 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38562 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38563 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38564 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38565 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38566}
38567
38568static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38569 KnownBits &Known,
38570 const APInt &DemandedElts,
38571 const SelectionDAG &DAG,
38572 unsigned Depth) {
38573 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38574
38575 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38576 // pairs.
38577 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38578 APInt DemandedLoElts =
38579 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38580 APInt DemandedHiElts =
38581 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38582 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38583 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38584 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38585 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38586 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38587 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38588 Known = KnownBits::sadd_sat(Lo, Hi);
38589}
38590
38591static KnownBits computeKnownBitsForHorizontalOperation(
38592 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38593 const SelectionDAG &DAG,
38594 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38595 KnownBitsFunc) {
38596 APInt DemandedEltsLHS, DemandedEltsRHS;
38597 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38598 DemandedElts, DemandedEltsLHS,
38599 DemandedEltsRHS);
38600
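// Each horizontal result element combines two adjacent source elements, so
// known bits are computed for both lanes of the pair (the mask shifted left
// by one selecting the second lane) and merged via KnownBitsFunc.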
38601 const auto ComputeForSingleOpFunc =
38602 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38603 return KnownBitsFunc(
38604 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38605 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38606 };
38607
38608 if (DemandedEltsRHS.isZero())
38609 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38610 if (DemandedEltsLHS.isZero())
38611 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38612
38613 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38614 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38615}
38616
38617void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38618 KnownBits &Known,
38619 const APInt &DemandedElts,
38620 const SelectionDAG &DAG,
38621 unsigned Depth) const {
38622 unsigned BitWidth = Known.getBitWidth();
38623 unsigned NumElts = DemandedElts.getBitWidth();
38624 unsigned Opc = Op.getOpcode();
38625 EVT VT = Op.getValueType();
38626 assert((Opc >= ISD::BUILTIN_OP_END ||
38627 Opc == ISD::INTRINSIC_WO_CHAIN ||
38628 Opc == ISD::INTRINSIC_W_CHAIN ||
38629 Opc == ISD::INTRINSIC_VOID) &&
38630 "Should use MaskedValueIsZero if you don't know whether Op"
38631 " is a target node!");
38632
38633 Known.resetAll();
38634 switch (Opc) {
38635 default: break;
38636 case X86ISD::MUL_IMM: {
38637 KnownBits Known2;
38638 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38639 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38640 Known = KnownBits::mul(Known, Known2);
38641 break;
38642 }
38643 case X86ISD::BSF: {
38645
38646 KnownBits Known2;
38647 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38648 if (Known2.isNonZero()) {
38649 // If we have a known 1, its position is our upper bound.
38650 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38651 unsigned LowBits = llvm::bit_width(PossibleTZ);
38652 Known.Zero.setBitsFrom(LowBits);
38653 } else if (!Op.getOperand(0).isUndef()) {
38654 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38655 Known = Known.intersectWith(Known2);
38656 }
38657 break;
38658 }
38659 case X86ISD::BSR: {
38660 // TODO: Bound with input known bits?
38662
38663 if (!Op.getOperand(0).isUndef() &&
38664 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38665 KnownBits Known2;
38666 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38667 Known = Known.intersectWith(Known2);
38668 }
38669 break;
38670 }
38671 case X86ISD::SETCC:
38672 Known.Zero.setBitsFrom(1);
38673 break;
38674 case X86ISD::MOVMSK: {
38675 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38676 Known.Zero.setBitsFrom(NumLoBits);
38677 break;
38678 }
38679 case X86ISD::PEXTRB:
38680 case X86ISD::PEXTRW: {
38681 SDValue Src = Op.getOperand(0);
38682 EVT SrcVT = Src.getValueType();
38683 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38684 Op.getConstantOperandVal(1));
38685 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38686 Known = Known.anyextOrTrunc(BitWidth);
38687 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38688 break;
38689 }
38690 case X86ISD::VSRAI:
38691 case X86ISD::VSHLI:
38692 case X86ISD::VSRLI: {
38693 unsigned ShAmt = Op.getConstantOperandVal(1);
38694 if (ShAmt >= VT.getScalarSizeInBits()) {
38695 // Out of range logical bit shifts are guaranteed to be zero.
38696 // Out of range arithmetic bit shifts splat the sign bit.
38697 if (Opc != X86ISD::VSRAI) {
38698 Known.setAllZero();
38699 break;
38700 }
38701
38702 ShAmt = VT.getScalarSizeInBits() - 1;
38703 }
38704
38705 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38706 if (Opc == X86ISD::VSHLI) {
38707 Known <<= ShAmt;
38708 // Low bits are known zero.
38709 Known.Zero.setLowBits(ShAmt);
38710 } else if (Opc == X86ISD::VSRLI) {
38711 Known >>= ShAmt;
38712 // High bits are known zero.
38713 Known.Zero.setHighBits(ShAmt);
38714 } else {
38715 Known.Zero.ashrInPlace(ShAmt);
38716 Known.One.ashrInPlace(ShAmt);
38717 }
38718 break;
38719 }
38720 case X86ISD::PACKUS: {
38721 // PACKUS is just a truncation if the upper half is zero.
38722 APInt DemandedLHS, DemandedRHS;
38723 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38724
38725 Known.One = APInt::getAllOnes(BitWidth * 2);
38726 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38727
38728 KnownBits Known2;
38729 if (!!DemandedLHS) {
38730 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38731 Known = Known.intersectWith(Known2);
38732 }
38733 if (!!DemandedRHS) {
38734 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38735 Known = Known.intersectWith(Known2);
38736 }
38737
38738 if (Known.countMinLeadingZeros() < BitWidth)
38739 Known.resetAll();
38740 Known = Known.trunc(BitWidth);
38741 break;
38742 }
38743 case X86ISD::PSHUFB: {
38744 SDValue Src = Op.getOperand(0);
38745 SDValue Idx = Op.getOperand(1);
38746
38747 // If the index vector is never negative (MSB is zero), then all elements
38748 // come from the source vector. This is useful for cases where
38749 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38750 // below will handle the more common constant shuffle mask case.
38751 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38752 if (KnownIdx.isNonNegative())
38753 Known = DAG.computeKnownBits(Src, Depth + 1);
38754 break;
38755 }
38756 case X86ISD::VBROADCAST: {
38757 SDValue Src = Op.getOperand(0);
38758 if (!Src.getSimpleValueType().isVector()) {
38759 Known = DAG.computeKnownBits(Src, Depth + 1);
38760 return;
38761 }
38762 break;
38763 }
38764 case X86ISD::AND: {
38765 if (Op.getResNo() == 0) {
38766 KnownBits Known2;
38767 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38768 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38769 Known &= Known2;
38770 }
38771 break;
38772 }
38773 case X86ISD::ANDNP: {
38774 KnownBits Known2;
38775 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38776 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38777
38778 // ANDNP = (~X & Y);
38779 Known.One &= Known2.Zero;
38780 Known.Zero |= Known2.One;
38781 break;
38782 }
38783 case X86ISD::FOR: {
38784 KnownBits Known2;
38785 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38786 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38787
38788 Known |= Known2;
38789 break;
38790 }
38791 case X86ISD::PSADBW: {
38792 SDValue LHS = Op.getOperand(0);
38793 SDValue RHS = Op.getOperand(1);
38794 assert(VT.getScalarType() == MVT::i64 &&
38795 LHS.getValueType() == RHS.getValueType() &&
38796 LHS.getValueType().getScalarType() == MVT::i8 &&
38797 "Unexpected PSADBW types");
38798 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38799 break;
38800 }
38801 case X86ISD::PCMPGT:
38802 case X86ISD::PCMPEQ: {
38803 KnownBits KnownLhs =
38804 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38805 KnownBits KnownRhs =
38806 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38807 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38808 ? KnownBits::eq(KnownLhs, KnownRhs)
38809 : KnownBits::sgt(KnownLhs, KnownRhs);
38810 if (Res) {
38811 if (*Res)
38812 Known.setAllOnes();
38813 else
38814 Known.setAllZero();
38815 }
38816 break;
38817 }
38818 case X86ISD::VPMADDWD: {
38819 SDValue LHS = Op.getOperand(0);
38820 SDValue RHS = Op.getOperand(1);
38821 assert(VT.getVectorElementType() == MVT::i32 &&
38822 LHS.getValueType() == RHS.getValueType() &&
38823 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38824 "Unexpected PMADDWD types");
38825 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38826 break;
38827 }
38828 case X86ISD::VPMADDUBSW: {
38829 SDValue LHS = Op.getOperand(0);
38830 SDValue RHS = Op.getOperand(1);
38831 assert(VT.getVectorElementType() == MVT::i16 &&
38832 LHS.getValueType() == RHS.getValueType() &&
38833 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38834 "Unexpected PMADDUBSW types");
38835 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38836 break;
38837 }
38838 case X86ISD::PMULUDQ: {
38839 KnownBits Known2;
38840 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38841 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38842
38843 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38844 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38845 Known = KnownBits::mul(Known, Known2);
38846 break;
38847 }
38848 case X86ISD::CMOV: {
38849 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38850 // If we don't know any bits, early out.
38851 if (Known.isUnknown())
38852 break;
38853 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38854
38855 // Only known if known in both the LHS and RHS.
38856 Known = Known.intersectWith(Known2);
38857 break;
38858 }
38859 case X86ISD::BEXTR:
38860 case X86ISD::BEXTRI: {
38861 SDValue Op0 = Op.getOperand(0);
38862 SDValue Op1 = Op.getOperand(1);
38863
38864 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38865 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38866 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38867
38868 // If the length is 0, the result is 0.
38869 if (Length == 0) {
38870 Known.setAllZero();
38871 break;
38872 }
38873
38874 if ((Shift + Length) <= BitWidth) {
38875 Known = DAG.computeKnownBits(Op0, Depth + 1);
38876 Known = Known.extractBits(Length, Shift);
38877 Known = Known.zextOrTrunc(BitWidth);
38878 }
38879 }
38880 break;
38881 }
38882 case X86ISD::PDEP: {
38883 KnownBits Known2;
38884 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38885 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38886 // Zeros are retained from the mask operand. But not ones.
38887 Known.One.clearAllBits();
38888 // The result will have at least as many trailing zeros as the non-mask
38889 // operand since bits can only map to the same or higher bit position.
38890 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38891 break;
38892 }
38893 case X86ISD::PEXT: {
38894 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38895 // The result has as many leading zeros as the number of zeroes in the mask.
38896 unsigned Count = Known.Zero.popcount();
38897 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38898 Known.One.clearAllBits();
38899 break;
38900 }
38901 case X86ISD::VTRUNC:
38902 case X86ISD::VTRUNCS:
38903 case X86ISD::VTRUNCUS:
38904 case X86ISD::CVTSI2P:
38905 case X86ISD::CVTUI2P:
38906 case X86ISD::CVTP2SI:
38907 case X86ISD::CVTP2UI:
38908 case X86ISD::MCVTP2SI:
38909 case X86ISD::MCVTP2UI:
38910 case X86ISD::CVTTP2SI:
38911 case X86ISD::CVTTP2UI:
38912 case X86ISD::MCVTTP2SI:
38913 case X86ISD::MCVTTP2UI:
38914 case X86ISD::MCVTSI2P:
38915 case X86ISD::MCVTUI2P:
38916 case X86ISD::VFPROUND:
38917 case X86ISD::VMFPROUND:
38918 case X86ISD::CVTPS2PH:
38919 case X86ISD::MCVTPS2PH:
38920 case X86ISD::MCVTTP2SIS:
38921 case X86ISD::MCVTTP2UIS: {
38922 // Truncations/Conversions - upper elements are known zero.
38923 EVT SrcVT = Op.getOperand(0).getValueType();
38924 if (SrcVT.isVector()) {
38925 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38926 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38927 Known.setAllZero();
38928 }
38929 break;
38930 }
38937 // Strict Conversions - upper elements are known zero.
38938 EVT SrcVT = Op.getOperand(1).getValueType();
38939 if (SrcVT.isVector()) {
38940 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38941 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38942 Known.setAllZero();
38943 }
38944 break;
38945 }
38946 case X86ISD::MOVQ2DQ: {
38947 // Move from MMX to XMM. Upper half of XMM should be 0.
38948 if (DemandedElts.countr_zero() >= (NumElts / 2))
38949 Known.setAllZero();
38950 break;
38951 }
38953 APInt UndefElts;
38954 SmallVector<APInt, 16> EltBits;
38955 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38956 /*AllowWholeUndefs*/ false,
38957 /*AllowPartialUndefs*/ false)) {
38958 Known.Zero.setAllBits();
38959 Known.One.setAllBits();
38960 for (unsigned I = 0; I != NumElts; ++I) {
38961 if (!DemandedElts[I])
38962 continue;
38963 if (UndefElts[I]) {
38964 Known.resetAll();
38965 break;
38966 }
38967 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38968 Known = Known.intersectWith(Known2);
38969 }
38970 return;
38971 }
38972 break;
38973 }
38974 case X86ISD::HADD:
38975 case X86ISD::HSUB: {
38976 Known = computeKnownBitsForHorizontalOperation(
38977 Op, DemandedElts, Depth, DAG,
38978 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38979 return KnownBits::computeForAddSub(
38980 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38981 KnownLHS, KnownRHS);
38982 });
38983 break;
38984 }
38985 case ISD::INTRINSIC_WO_CHAIN: {
38986 switch (Op->getConstantOperandVal(0)) {
38987 case Intrinsic::x86_sse2_pmadd_wd:
38988 case Intrinsic::x86_avx2_pmadd_wd:
38989 case Intrinsic::x86_avx512_pmaddw_d_512: {
38990 SDValue LHS = Op.getOperand(1);
38991 SDValue RHS = Op.getOperand(2);
38992 assert(VT.getScalarType() == MVT::i32 &&
38993 LHS.getValueType() == RHS.getValueType() &&
38994 LHS.getValueType().getScalarType() == MVT::i16 &&
38995 "Unexpected PMADDWD types");
38996 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38997 break;
38998 }
38999 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
39000 case Intrinsic::x86_avx2_pmadd_ub_sw:
39001 case Intrinsic::x86_avx512_pmaddubs_w_512: {
39002 SDValue LHS = Op.getOperand(1);
39003 SDValue RHS = Op.getOperand(2);
39004 assert(VT.getScalarType() == MVT::i16 &&
39005 LHS.getValueType() == RHS.getValueType() &&
39006 LHS.getValueType().getScalarType() == MVT::i8 &&
39007 "Unexpected PMADDUBSW types");
39008 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39009 break;
39010 }
39011 case Intrinsic::x86_sse2_psad_bw:
39012 case Intrinsic::x86_avx2_psad_bw:
39013 case Intrinsic::x86_avx512_psad_bw_512: {
39014 SDValue LHS = Op.getOperand(1);
39015 SDValue RHS = Op.getOperand(2);
39016 assert(VT.getScalarType() == MVT::i64 &&
39017 LHS.getValueType() == RHS.getValueType() &&
39018 LHS.getValueType().getScalarType() == MVT::i8 &&
39019 "Unexpected PSADBW types");
39020 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39021 break;
39022 }
39023 }
39024 break;
39025 }
39026 case X86ISD::VPMADD52L:
39027 case X86ISD::VPMADD52H: {
39028 assert(Op.getValueType().isVector() &&
39029 Op.getValueType().getScalarType() == MVT::i64 &&
39030 "Unexpected VPMADD52 type");
39031 KnownBits K0 =
39032 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39033 KnownBits K1 =
39034 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39035 KnownBits KAcc =
39036 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39037 K0 = K0.trunc(52);
39038 K1 = K1.trunc(52);
39039 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39040 ? KnownBits::mul(K0, K1)
39041 : KnownBits::mulhu(K0, K1);
39042 KnownMul = KnownMul.zext(64);
39043 Known = KnownBits::add(KAcc, KnownMul);
39044 return;
39045 }
39046 }
39047
39048 // Handle target shuffles.
39049 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39050 if (isTargetShuffle(Opc)) {
39051 SmallVector<int, 64> Mask;
39052 SmallVector<SDValue, 2> Ops;
39053 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39054 unsigned NumOps = Ops.size();
39055 unsigned NumElts = VT.getVectorNumElements();
39056 if (Mask.size() == NumElts) {
39057 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39058 Known.Zero.setAllBits(); Known.One.setAllBits();
39059 for (unsigned i = 0; i != NumElts; ++i) {
39060 if (!DemandedElts[i])
39061 continue;
39062 int M = Mask[i];
39063 if (M == SM_SentinelUndef) {
39064 // For UNDEF elements, we don't know anything about the common state
39065 // of the shuffle result.
39066 Known.resetAll();
39067 break;
39068 }
39069 if (M == SM_SentinelZero) {
39070 Known.One.clearAllBits();
39071 continue;
39072 }
39073 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39074 "Shuffle index out of range");
39075
39076 unsigned OpIdx = (unsigned)M / NumElts;
39077 unsigned EltIdx = (unsigned)M % NumElts;
39078 if (Ops[OpIdx].getValueType() != VT) {
39079 // TODO - handle target shuffle ops with different value types.
39080 Known.resetAll();
39081 break;
39082 }
39083 DemandedOps[OpIdx].setBit(EltIdx);
39084 }
39085 // Known bits are the values that are shared by every demanded element.
39086 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39087 if (!DemandedOps[i])
39088 continue;
39089 KnownBits Known2 =
39090 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39091 Known = Known.intersectWith(Known2);
39092 }
39093 }
39094 }
39095 }
39096}
39097
39098 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39099 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39100 unsigned Depth) const {
39101 EVT VT = Op.getValueType();
39102 unsigned VTBits = VT.getScalarSizeInBits();
39103 unsigned Opcode = Op.getOpcode();
39104 switch (Opcode) {
39105 case X86ISD::SETCC_CARRY:
39106 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39107 return VTBits;
39108
39109 case X86ISD::VTRUNC: {
39110 SDValue Src = Op.getOperand(0);
39111 MVT SrcVT = Src.getSimpleValueType();
39112 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39113 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39114 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39115 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39116 if (Tmp > (NumSrcBits - VTBits))
39117 return Tmp - (NumSrcBits - VTBits);
39118 return 1;
39119 }
39120
39121 case X86ISD::PACKSS: {
39122 // PACKSS is just a truncation if the sign bits extend to the packed size.
39123 APInt DemandedLHS, DemandedRHS;
39124 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39125 DemandedRHS);
39126
39127 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39128 // patterns often used to compact vXi64 allsignbit patterns.
39129 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39130 SDValue BC = peekThroughBitcasts(V);
39131 if (BC.getOpcode() == X86ISD::PACKSS &&
39132 BC.getScalarValueSizeInBits() == 16 &&
39133 V.getScalarValueSizeInBits() == 32) {
39134 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39135 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39136 if (BC0.getScalarValueSizeInBits() == 64 &&
39137 BC1.getScalarValueSizeInBits() == 64 &&
39138 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39139 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39140 return 32;
39141 }
39142 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39143 };
39144
39145 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39146 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39147 if (!!DemandedLHS)
39148 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39149 if (!!DemandedRHS)
39150 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39151 unsigned Tmp = std::min(Tmp0, Tmp1);
39152 if (Tmp > (SrcBits - VTBits))
39153 return Tmp - (SrcBits - VTBits);
39154 return 1;
39155 }
39156
39157 case X86ISD::VBROADCAST: {
39158 SDValue Src = Op.getOperand(0);
39159 if (!Src.getSimpleValueType().isVector())
39160 return DAG.ComputeNumSignBits(Src, Depth + 1);
39161 break;
39162 }
39163
39164 case X86ISD::VSHLI: {
39165 SDValue Src = Op.getOperand(0);
39166 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39167 if (ShiftVal.uge(VTBits))
39168 return VTBits; // Shifted all bits out --> zero.
39169 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39170 if (ShiftVal.uge(Tmp))
39171 return 1; // Shifted all sign bits out --> unknown.
39172 return Tmp - ShiftVal.getZExtValue();
39173 }
39174
39175 case X86ISD::VSRAI: {
39176 SDValue Src = Op.getOperand(0);
39177 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39178 if (ShiftVal.uge(VTBits - 1))
39179 return VTBits; // Sign splat.
39180 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39181 ShiftVal += Tmp;
39182 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39183 }
39184
39185 case X86ISD::FSETCC:
39186 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39187 if (VT == MVT::f32 || VT == MVT::f64 ||
39188 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39189 return VTBits;
39190 break;
39191
39192 case X86ISD::PCMPGT:
39193 case X86ISD::PCMPEQ:
39194 case X86ISD::CMPP:
39195 case X86ISD::VPCOM:
39196 case X86ISD::VPCOMU:
39197 // Vector compares return zero/all-bits result values.
39198 return VTBits;
39199
39200 case X86ISD::ANDNP: {
39201 unsigned Tmp0 =
39202 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39203 if (Tmp0 == 1) return 1; // Early out.
39204 unsigned Tmp1 =
39205 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39206 return std::min(Tmp0, Tmp1);
39207 }
39208
39209 case X86ISD::CMOV: {
39210 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39211 if (Tmp0 == 1) return 1; // Early out.
39212 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39213 return std::min(Tmp0, Tmp1);
39214 }
39215 }
39216
39217 // Handle target shuffles.
39218 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39219 if (isTargetShuffle(Opcode)) {
39220 SmallVector<int, 64> Mask;
39221 SmallVector<SDValue, 2> Ops;
39222 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39223 unsigned NumOps = Ops.size();
39224 unsigned NumElts = VT.getVectorNumElements();
39225 if (Mask.size() == NumElts) {
39226 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39227 for (unsigned i = 0; i != NumElts; ++i) {
39228 if (!DemandedElts[i])
39229 continue;
39230 int M = Mask[i];
39231 if (M == SM_SentinelUndef) {
39232 // For UNDEF elements, we don't know anything about the common state
39233 // of the shuffle result.
39234 return 1;
39235 } else if (M == SM_SentinelZero) {
39236 // Zero = all sign bits.
39237 continue;
39238 }
39239 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39240 "Shuffle index out of range");
39241
39242 unsigned OpIdx = (unsigned)M / NumElts;
39243 unsigned EltIdx = (unsigned)M % NumElts;
39244 if (Ops[OpIdx].getValueType() != VT) {
39245 // TODO - handle target shuffle ops with different value types.
39246 return 1;
39247 }
39248 DemandedOps[OpIdx].setBit(EltIdx);
39249 }
39250 unsigned Tmp0 = VTBits;
39251 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39252 if (!DemandedOps[i])
39253 continue;
39254 unsigned Tmp1 =
39255 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39256 Tmp0 = std::min(Tmp0, Tmp1);
39257 }
39258 return Tmp0;
39259 }
39260 }
39261 }
39262
39263 // Fallback case.
39264 return 1;
39265}
39266
39267 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39268 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39269 return N->getOperand(0);
39270 return N;
39271}
39272
39273// Helper to look for a normal load that can be narrowed into a vzload with the
39274 // specified VT and memory VT. Returns SDValue() on failure.
39275 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39276 SelectionDAG &DAG) {
39277 // Can't if the load is volatile or atomic.
39278 if (!LN->isSimple())
39279 return SDValue();
39280
39281 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39282 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39283 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39284 LN->getPointerInfo(), LN->getBaseAlign(),
39285 LN->getMemOperand()->getFlags());
39286}
39287
39288// Attempt to match a combined shuffle mask against supported unary shuffle
39289// instructions.
39290// TODO: Investigate sharing more of this with shuffle lowering.
39291static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39292 bool AllowFloatDomain, bool AllowIntDomain,
39293 SDValue V1, const SelectionDAG &DAG,
39294 const X86Subtarget &Subtarget, unsigned &Shuffle,
39295 MVT &SrcVT, MVT &DstVT) {
39296 unsigned NumMaskElts = Mask.size();
39297 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39298
39299 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39300 if (Mask[0] == 0 &&
39301 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39302 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39303 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39304 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39305 Shuffle = X86ISD::VZEXT_MOVL;
39306 if (MaskEltSize == 16)
39307 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39308 else
39309 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39310 return true;
39311 }
39312 }
39313
39314 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39315 if (AllowIntDomain &&
39316 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39317 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39318 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39319 unsigned MaxScale = 64 / MaskEltSize;
39320 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39321 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39322 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39323 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39324 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39325 continue;
39326 bool MatchAny = true;
39327 bool MatchZero = true;
39328 bool MatchSign = UseSign;
39329 unsigned NumDstElts = NumMaskElts / Scale;
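// Each destination element must take source element i; the remaining
// Scale-1 mask slots must then be undef (any-extend), undef/zero
// (zero-extend) or undef/repeats of element i (sign-extend, which is only
// valid when every source bit is a known sign bit).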
39330 for (unsigned i = 0;
39331 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39332 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39333 MatchAny = MatchSign = MatchZero = false;
39334 break;
39335 }
39336 unsigned Pos = (i * Scale) + 1;
39337 unsigned Len = Scale - 1;
39338 MatchAny &= isUndefInRange(Mask, Pos, Len);
39339 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39340 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39341 }
39342 if (MatchAny || MatchSign || MatchZero) {
39343 assert((MatchSign || MatchZero) &&
39344 "Failed to match sext/zext but matched aext?");
39345 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39346 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39347 : MVT::getIntegerVT(MaskEltSize);
39348 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39349
39350 Shuffle = unsigned(
39351 MatchAny ? ISD::ANY_EXTEND
39352 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39353 if (SrcVT.getVectorNumElements() != NumDstElts)
39354 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39355
39356 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39357 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39358 return true;
39359 }
39360 }
39361 }
39362
39363 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39364 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39365 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39366 isUndefOrEqual(Mask[0], 0) &&
39367 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39368 Shuffle = X86ISD::VZEXT_MOVL;
39369 if (MaskEltSize == 16)
39370 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39371 else
39372 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39373 return true;
39374 }
39375
39376 // Check if we have SSE3 which will let us use MOVDDUP etc. These
39377 // instructions are no slower than UNPCKLPD but have the option to
39378 // fold the input operand into even an unaligned memory load.
39379 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39380 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39381 Shuffle = X86ISD::MOVDDUP;
39382 SrcVT = DstVT = MVT::v2f64;
39383 return true;
39384 }
39385 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39386 Shuffle = X86ISD::MOVSLDUP;
39387 SrcVT = DstVT = MVT::v4f32;
39388 return true;
39389 }
39390 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39391 Shuffle = X86ISD::MOVSHDUP;
39392 SrcVT = DstVT = MVT::v4f32;
39393 return true;
39394 }
39395 }
39396
39397 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39398 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39399 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39400 Shuffle = X86ISD::MOVDDUP;
39401 SrcVT = DstVT = MVT::v4f64;
39402 return true;
39403 }
39404 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39405 V1)) {
39406 Shuffle = X86ISD::MOVSLDUP;
39407 SrcVT = DstVT = MVT::v8f32;
39408 return true;
39409 }
39410 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39411 V1)) {
39412 Shuffle = X86ISD::MOVSHDUP;
39413 SrcVT = DstVT = MVT::v8f32;
39414 return true;
39415 }
39416 }
39417
39418 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39419 assert(Subtarget.hasAVX512() &&
39420 "AVX512 required for 512-bit vector shuffles");
39421 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39422 V1)) {
39423 Shuffle = X86ISD::MOVDDUP;
39424 SrcVT = DstVT = MVT::v8f64;
39425 return true;
39426 }
39427 if (isTargetShuffleEquivalent(
39428 MaskVT, Mask,
39429 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39430 Shuffle = X86ISD::MOVSLDUP;
39431 SrcVT = DstVT = MVT::v16f32;
39432 return true;
39433 }
39434 if (isTargetShuffleEquivalent(
39435 MaskVT, Mask,
39436 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39437 Shuffle = X86ISD::MOVSHDUP;
39438 SrcVT = DstVT = MVT::v16f32;
39439 return true;
39440 }
39441 }
39442
39443 return false;
39444}
39445
39446// Attempt to match a combined shuffle mask against supported unary immediate
39447// permute instructions.
39448 // TODO: Investigate sharing more of this with shuffle lowering.
39449 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39450 const APInt &Zeroable,
39451 bool AllowFloatDomain, bool AllowIntDomain,
39452 const SelectionDAG &DAG,
39453 const X86Subtarget &Subtarget,
39454 unsigned &Shuffle, MVT &ShuffleVT,
39455 unsigned &PermuteImm) {
39456 unsigned NumMaskElts = Mask.size();
39457 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39458 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39459 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39460 bool ContainsZeros = isAnyZero(Mask);
39461
39462 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39463 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39464 // Check for lane crossing permutes.
39465 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39466 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39467 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39468 Shuffle = X86ISD::VPERMI;
39469 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39470 PermuteImm = getV4X86ShuffleImm(Mask);
39471 return true;
39472 }
39473 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39474 SmallVector<int, 4> RepeatedMask;
39475 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39476 Shuffle = X86ISD::VPERMI;
39477 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39478 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39479 return true;
39480 }
39481 }
39482 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39483 // VPERMILPD can permute with a non-repeating shuffle.
39484 Shuffle = X86ISD::VPERMILPI;
39485 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39486 PermuteImm = 0;
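// VPERMILPD uses one immediate bit per element: bit i selects the low or
// high f64 within element i's own 128-bit lane.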
39487 for (int i = 0, e = Mask.size(); i != e; ++i) {
39488 int M = Mask[i];
39489 if (M == SM_SentinelUndef)
39490 continue;
39491 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39492 PermuteImm |= (M & 1) << i;
39493 }
39494 return true;
39495 }
39496 }
39497
39498 // We are checking for both a shuffle match and a shift match. Loop twice so
39499 // we can choose which to try to match first, depending on target preference.
39500 for (unsigned Order = 0; Order < 2; ++Order) {
39501 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39502 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39503 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39504 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39505 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39506 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39507 SmallVector<int, 4> RepeatedMask;
39508 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39509 // Narrow the repeated mask to create 32-bit element permutes.
39510 SmallVector<int, 4> WordMask = RepeatedMask;
39511 if (MaskScalarSizeInBits == 64)
39512 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39513
39514 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39515 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39516 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39517 PermuteImm = getV4X86ShuffleImm(WordMask);
39518 return true;
39519 }
39520 }
39521
39522 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39523 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39524 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39525 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39526 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39527 SmallVector<int, 4> RepeatedMask;
39528 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39529 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39530 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39531
39532 // PSHUFLW: permute lower 4 elements only.
39533 if (isUndefOrInRange(LoMask, 0, 4) &&
39534 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39535 Shuffle = X86ISD::PSHUFLW;
39536 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39537 PermuteImm = getV4X86ShuffleImm(LoMask);
39538 return true;
39539 }
39540
39541 // PSHUFHW: permute upper 4 elements only.
39542 if (isUndefOrInRange(HiMask, 4, 8) &&
39543 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39544 // Offset the HiMask so that we can create the shuffle immediate.
39545 int OffsetHiMask[4];
39546 for (int i = 0; i != 4; ++i)
39547 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39548
39549 Shuffle = X86ISD::PSHUFHW;
39550 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39551 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39552 return true;
39553 }
39554 }
39555 }
39556 } else {
39557 // Attempt to match against bit rotates.
39558 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39559 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39560 Subtarget.hasAVX512())) {
39561 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39562 Subtarget, Mask);
39563 if (0 < RotateAmt) {
39564 Shuffle = X86ISD::VROTLI;
39565 PermuteImm = (unsigned)RotateAmt;
39566 return true;
39567 }
39568 }
39569 }
39570 // Attempt to match against byte/bit shifts.
39571 if (AllowIntDomain &&
39572 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39573 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39574 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39575 int ShiftAmt =
39576 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39577 Zeroable, Subtarget);
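// On 512-bit vectors, byte shifts and 16-bit element shifts require
// AVX512BW; 32/64-bit element shifts only need AVX512F.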
39578 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39579 32 <= ShuffleVT.getScalarSizeInBits())) {
39580 // Byte shifts can be slower so only match them on second attempt.
39581 if (Order == 0 &&
39582 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39583 continue;
39584
39585 PermuteImm = (unsigned)ShiftAmt;
39586 return true;
39587 }
39588
39589 }
39590 }
39591
39592 return false;
39593}
39594
39595// Attempt to match a combined unary shuffle mask against supported binary
39596// shuffle instructions.
39597// TODO: Investigate sharing more of this with shuffle lowering.
39598static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39599 bool AllowFloatDomain, bool AllowIntDomain,
39600 SDValue &V1, SDValue &V2, const SDLoc &DL,
39601 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39602 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39603 bool IsUnary) {
39604 unsigned NumMaskElts = Mask.size();
39605 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39606 unsigned SizeInBits = MaskVT.getSizeInBits();
39607
39608 if (MaskVT.is128BitVector()) {
39609 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39610 AllowFloatDomain) {
39611 V2 = V1;
39612 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39613 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39614 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39615 return true;
39616 }
39617 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39618 AllowFloatDomain) {
39619 V2 = V1;
39620 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39621 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39622 return true;
39623 }
39624 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39625 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39626 std::swap(V1, V2);
39627 Shuffle = X86ISD::MOVSD;
39628 SrcVT = DstVT = MVT::v2f64;
39629 return true;
39630 }
39631 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39632 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39633 Shuffle = X86ISD::MOVSS;
39634 SrcVT = DstVT = MVT::v4f32;
39635 return true;
39636 }
39637 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39638 DAG) &&
39639 Subtarget.hasFP16()) {
39640 Shuffle = X86ISD::MOVSH;
39641 SrcVT = DstVT = MVT::v8f16;
39642 return true;
39643 }
39644 }
39645
39646 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39647 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39648 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39649 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39650 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39651 Subtarget)) {
39652 DstVT = MaskVT;
39653 return true;
39654 }
39655 }
39656 // TODO: Can we handle this inside matchShuffleWithPACK?
39657 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39658 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39659 V1.getScalarValueSizeInBits() == 64 &&
39660 V2.getScalarValueSizeInBits() == 64) {
39661 // Use (SSE41) PACKUSDW if the leading zero bits cover all but the lowest 16-bits.
39662 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39663 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39664 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39665 SrcVT = MVT::v4i32;
39666 DstVT = MVT::v8i16;
39667 Shuffle = X86ISD::PACKUS;
39668 return true;
39669 }
39670 // Use PACKUSWB if the leading zero bits cover all but the lowest 8-bits.
39671 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39672 SrcVT = MVT::v8i16;
39673 DstVT = MVT::v16i8;
39674 Shuffle = X86ISD::PACKUS;
39675 return true;
39676 }
39677 // Use PACKSSDW if the sign bits extend down to the lowest 16-bits.
39678 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39679 SrcVT = MVT::v4i32;
39680 DstVT = MVT::v8i16;
39681 Shuffle = X86ISD::PACKSS;
39682 return true;
39683 }
39684 }
39685
39686 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39687 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39688 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39689 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39690 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39691 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39692 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39693 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39694 Subtarget)) {
39695 SrcVT = DstVT = MaskVT;
39696 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39697 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39698 return true;
39699 }
39700 }
39701
39702 // Attempt to match against an OR if we're performing a blend shuffle and the
39703 // non-blended source element is zero in each case.
39704 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39705 if (SizeInBits == V1.getValueSizeInBits() &&
39706 SizeInBits == V2.getValueSizeInBits() &&
39707 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39708 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39709 bool IsBlend = true;
39710 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39711 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39712 unsigned Scale1 = NumV1Elts / NumMaskElts;
39713 unsigned Scale2 = NumV2Elts / NumMaskElts;
39714 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39715 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39716 for (unsigned i = 0; i != NumMaskElts; ++i) {
39717 int M = Mask[i];
39718 if (M == SM_SentinelUndef)
39719 continue;
39720 if (M == SM_SentinelZero) {
39721 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39722 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39723 continue;
39724 }
39725 if (M == (int)i) {
39726 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39727 continue;
39728 }
39729 if (M == (int)(i + NumMaskElts)) {
39730 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39731 continue;
39732 }
39733 IsBlend = false;
39734 break;
39735 }
39736 if (IsBlend) {
39737 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39738 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39739 Shuffle = ISD::OR;
39740 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39741 return true;
39742 }
39743 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39744 // FIXME: handle mismatched sizes?
39745 // TODO: investigate if `ISD::OR` handling in
39746 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39747 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39748 unsigned NumElts = V.getValueType().getVectorNumElements();
39749 KnownBits Known(NumElts);
39750 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39751 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39752 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39753 if (PeepholeKnown.isZero())
39754 Known.Zero.setBit(EltIdx);
39755 if (PeepholeKnown.isAllOnes())
39756 Known.One.setBit(EltIdx);
39757 }
39758 return Known;
39759 };
39760
39761 KnownBits V1Known = computeKnownBitsElementWise(V1);
39762 KnownBits V2Known = computeKnownBitsElementWise(V2);
39763
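// Re-check the blend element-wise: an OR still works if, per element, the
// unselected source is known zero or the selected source is known all-ones.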
39764 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39765 int M = Mask[i];
39766 if (M == SM_SentinelUndef)
39767 continue;
39768 if (M == SM_SentinelZero) {
39769 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39770 continue;
39771 }
39772 if (M == (int)i) {
39773 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39774 continue;
39775 }
39776 if (M == (int)(i + NumMaskElts)) {
39777 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39778 continue;
39779 }
39780 llvm_unreachable("will not get here.");
39781 }
39782 if (IsBlend) {
39783 Shuffle = ISD::OR;
39784 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39785 return true;
39786 }
39787 }
39788 }
39789 }
39790
39791 return false;
39792}
39793
39794 static bool matchBinaryPermuteShuffle(
39795 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39796 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39797 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39798 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39799 unsigned NumMaskElts = Mask.size();
39800 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39801
39802 // Attempt to match against VALIGND/VALIGNQ rotate.
39803 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39804 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39805 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39806 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39807 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39808 MaskVT.getSizeInBits() / EltSizeInBits);
39809 if (!isAnyZero(Mask)) {
39810 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39811 if (0 < Rotation) {
39812 Shuffle = X86ISD::VALIGN;
39813 ShuffleVT = AlignVT;
39814 PermuteImm = Rotation;
39815 return true;
39816 }
39817 }
39818 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39819 unsigned ZeroLo = Zeroable.countr_one();
39820 unsigned ZeroHi = Zeroable.countl_one();
39821 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39822 if (ZeroLo) {
39823 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39824 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39825 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39826 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39827 Shuffle = X86ISD::VALIGN;
39828 ShuffleVT = AlignVT;
39829 PermuteImm = NumMaskElts - ZeroLo;
39830 return true;
39831 }
39832 }
39833 if (ZeroHi) {
39834 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39835 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39836 ZeroHi);
39837 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39838 V2 = V1;
39839 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39840 Shuffle = X86ISD::VALIGN;
39841 ShuffleVT = AlignVT;
39842 PermuteImm = ZeroHi;
39843 return true;
39844 }
39845 }
39846 }
39847
39848 // Attempt to match against PALIGNR byte rotate.
39849 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39850 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39851 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39852 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39853 if (0 < ByteRotation) {
39854 Shuffle = X86ISD::PALIGNR;
39855 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39856 PermuteImm = ByteRotation;
39857 return true;
39858 }
39859 }
39860
39861 // Attempt to combine to X86ISD::BLENDI.
39862 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39863 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39864 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39865 uint64_t BlendMask = 0;
39866 bool ForceV1Zero = false, ForceV2Zero = false;
39867 SmallVector<int, 8> TargetMask(Mask);
39868 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39869 ForceV2Zero, BlendMask)) {
39870 if (MaskVT == MVT::v16i16) {
39871 // We can only use v16i16 PBLENDW if the lanes are repeated.
39872 SmallVector<int, 8> RepeatedMask;
39873 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39874 RepeatedMask)) {
39875 assert(RepeatedMask.size() == 8 &&
39876 "Repeated mask size doesn't match!");
39877 PermuteImm = 0;
39878 for (int i = 0; i < 8; ++i)
39879 if (RepeatedMask[i] >= 8)
39880 PermuteImm |= 1 << i;
39881 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39882 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39883 Shuffle = X86ISD::BLENDI;
39884 ShuffleVT = MaskVT;
39885 return true;
39886 }
39887 } else {
39888 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39889 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39890 PermuteImm = (unsigned)BlendMask;
39891 Shuffle = X86ISD::BLENDI;
39892 ShuffleVT = MaskVT;
39893 return true;
39894 }
39895 }
39896 }
39897
39898 // Attempt to combine to INSERTPS, but only if it has elements that need to
39899 // be set to zero.
39900 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39901 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39902 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39903 Shuffle = X86ISD::INSERTPS;
39904 ShuffleVT = MVT::v4f32;
39905 return true;
39906 }
39907
39908 // Attempt to combine to SHUFPD.
39909 if (AllowFloatDomain && EltSizeInBits == 64 &&
39910 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39911 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39912 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39913 bool ForceV1Zero = false, ForceV2Zero = false;
39914 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39915 PermuteImm, Mask, Zeroable)) {
39916 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39917 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39918 Shuffle = X86ISD::SHUFP;
39919 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39920 return true;
39921 }
39922 }
39923
39924 // Attempt to combine to SHUFPS.
39925 if (AllowFloatDomain && EltSizeInBits == 32 &&
39926 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39927 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39928 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39929 SmallVector<int, 4> RepeatedMask;
39930 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39931 // Match each half of the repeated mask, to determine whether it just
39932 // references one of the vectors, is zeroable, or is entirely undef.
39933 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39934 int M0 = RepeatedMask[Offset];
39935 int M1 = RepeatedMask[Offset + 1];
39936
39937 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39938 return DAG.getUNDEF(MaskVT);
39939 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39940 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39941 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39942 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39943 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39944 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39945 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39946 return V1;
39947 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39948 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39949 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39950 return V2;
39951 }
39952
39953 return SDValue();
39954 };
39955
39956 int ShufMask[4] = {-1, -1, -1, -1};
39957 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39958 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39959
39960 if (Lo && Hi) {
39961 V1 = Lo;
39962 V2 = Hi;
39963 Shuffle = X86ISD::SHUFP;
39964 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39965 PermuteImm = getV4X86ShuffleImm(ShufMask);
39966 return true;
39967 }
39968 }
39969 }
39970
39971 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39972 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39973 MaskVT.is128BitVector() &&
39974 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39975 Shuffle = X86ISD::INSERTPS;
39976 ShuffleVT = MVT::v4f32;
39977 return true;
39978 }
39979
39980 return false;
39981}
39982
39983 static SDValue combineX86ShuffleChainWithExtract(
39984 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39985 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39986 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39987 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39988 const X86Subtarget &Subtarget);
39989
39990/// Combine an arbitrary chain of shuffles into a single instruction if
39991/// possible.
39992///
39993/// This is the leaf of the recursive combine below. When we have found some
39994/// chain of single-use x86 shuffle instructions and accumulated the combined
39995/// shuffle mask represented by them, this will try to pattern match that mask
39996/// into either a single instruction if there is a special purpose instruction
39997/// for this operation, or into a PSHUFB instruction which is a fully general
39998 /// instruction but should only be used to replace chains over a certain depth.
39999 static SDValue combineX86ShuffleChain(
40000 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
40001 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40002 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40003 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40004 const X86Subtarget &Subtarget) {
40005 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
40006 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
40007 "Unexpected number of shuffle inputs!");
40008 unsigned RootSizeInBits = RootVT.getSizeInBits();
40009 unsigned NumRootElts = RootVT.getVectorNumElements();
40010
40011 // Canonicalize shuffle input op to the requested type.
40012 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
40013 if (VT.getSizeInBits() > Op.getValueSizeInBits())
40014 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
40015 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40016 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40017 return DAG.getBitcast(VT, Op);
40018 };
40019
40020 // Find the inputs that enter the chain. Note that multiple uses are OK
40021 // here, we're not going to remove the operands we find.
40022 bool UnaryShuffle = (Inputs.size() == 1);
40023 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40024 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40025 : peekThroughBitcasts(Inputs[1]));
40026
40027 MVT VT1 = V1.getSimpleValueType();
40028 MVT VT2 = V2.getSimpleValueType();
40029 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40030 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40031
40032 SDValue Res;
40033
40034 unsigned NumBaseMaskElts = BaseMask.size();
40035 if (NumBaseMaskElts == 1) {
40036 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40037 return CanonicalizeShuffleInput(RootVT, V1);
40038 }
40039
40040 bool OptForSize = DAG.shouldOptForSize();
40041 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40042 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40043 (RootVT.isFloatingPoint() && Depth >= 1) ||
40044 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
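// Note: 256-bit shuffles are forced into the float domain on AVX1-only
// targets, which lack 256-bit integer shuffle instructions.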
40045
40046 // If we are shuffling a splat (and not introducing zeros) then we can just
40047 // use it directly. This works for smaller elements as well, since they
40048 // already repeat across each mask element.
40049 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40050 V1.getValueSizeInBits() >= RootSizeInBits &&
40051 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40052 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40053 return CanonicalizeShuffleInput(RootVT, V1);
40054 }
40055
40056 SmallVector<int, 64> Mask(BaseMask);
40057
40058 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40059 // etc. can be simplified.
40060 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40061 SmallVector<int> ScaledMask, IdentityMask;
40062 unsigned NumElts = VT1.getVectorNumElements();
40063 if (Mask.size() <= NumElts &&
40064 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40065 for (unsigned i = 0; i != NumElts; ++i)
40066 IdentityMask.push_back(i);
40067 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40068 V2))
40069 return CanonicalizeShuffleInput(RootVT, V1);
40070 }
40071 }
40072
40073 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40074 if (RootVT.is512BitVector() &&
40075 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40076 // If the upper subvectors are zeroable, then an extract+insert is more
40077 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40078 // to zero the upper subvectors.
40079 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40080 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40081 return SDValue(); // Nothing to do!
40082 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40083 "Unexpected lane shuffle");
40084 Res = CanonicalizeShuffleInput(RootVT, V1);
40085 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40086 bool UseZero = isAnyZero(Mask);
40087 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40088 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40089 }
40090
40091 // Narrow shuffle mask to v4x128.
40092 SmallVector<int, 4> ScaledMask;
40093 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40094 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40095
40096 // Try to lower to vshuf64x2/vshuf32x4.
40097 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40098 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40099 SelectionDAG &DAG) {
40100 int PermMask[4] = {-1, -1, -1, -1};
40101 // Ensure elements came from the same Op.
40102 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40103 for (int i = 0; i < 4; ++i) {
40104 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40105 if (ScaledMask[i] < 0)
40106 continue;
40107
40108 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40109 unsigned OpIndex = i / 2;
40110 if (Ops[OpIndex].isUndef())
40111 Ops[OpIndex] = Op;
40112 else if (Ops[OpIndex] != Op)
40113 return SDValue();
40114
40115 PermMask[i] = ScaledMask[i] % 4;
40116 }
40117
40118 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40119 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40120 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40121 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40122 };
40123
40124 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40125 // doesn't work because our mask is for 128 bits and we don't have an MVT
40126 // to match that.
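// Prefer leaving this to a later VPERMQ/VPERMPD-style match over SHUF128 when
// each 128-bit lane comes from its own 256-bit half of a single source and
// the lane pattern repeats across both halves.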
40127 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40128 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40129 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40130 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40131 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40132 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40133 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40134 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40135 ScaledMask[1] == (ScaledMask[3] % 2));
40136
40137 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40138 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40139 return SDValue(); // Nothing to do!
40140 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40141 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40142 return DAG.getBitcast(RootVT, V);
40143 }
40144 }
40145
40146 // Handle 128-bit lane shuffles of 256-bit vectors.
40147 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40148 // If the upper half is zeroable, then an extract+insert is more optimal
40149 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40150 // zero the upper half.
40151 if (isUndefOrZero(Mask[1])) {
40152 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40153 return SDValue(); // Nothing to do!
40154 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40155 Res = CanonicalizeShuffleInput(RootVT, V1);
40156 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40157 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40158 256);
40159 }
40160
40161 // If we're inserting the low subvector, an insert-subvector 'concat'
40162 // pattern is quicker than VPERM2X128.
40163 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40164 !Subtarget.hasAVX2()) {
40165 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40166 return SDValue(); // Nothing to do!
40167 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40168 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40169 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40170 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40171 }
40172
40173 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40174 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40175 // feature.
40176 // Prefer blends for sequential shuffles unless we are optimizing for size.
40177 if (UnaryShuffle &&
40178 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40179 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40180 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40181 return SDValue(); // Nothing to do!
40182 unsigned PermMask = 0;
40183 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40184 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40185 return DAG.getNode(
40186 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40187 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40188 }
40189
40190 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40191 return SDValue(); // Nothing to do!
40192
40193 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40194 if (!UnaryShuffle && !IsMaskedShuffle) {
40195 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40196 "Unexpected shuffle sentinel value");
40197 // Prefer blends to X86ISD::VPERM2X128.
40198 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40199 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40200 return SDValue(); // Nothing to do!
40201 unsigned PermMask = 0;
40202 PermMask |= ((Mask[0] & 3) << 0);
40203 PermMask |= ((Mask[1] & 3) << 4);
40204 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40205 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40206 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40207 CanonicalizeShuffleInput(RootVT, LHS),
40208 CanonicalizeShuffleInput(RootVT, RHS),
40209 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40210 }
40211 }
40212 }
40213
40214 // For masks that have been widened to 128-bit elements or more,
40215 // narrow back down to 64-bit elements.
40216 if (BaseMaskEltSizeInBits > 64) {
40217 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40218 int MaskScale = BaseMaskEltSizeInBits / 64;
40219 SmallVector<int, 64> ScaledMask;
40220 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40221 Mask = std::move(ScaledMask);
40222 }
40223
40224 // For masked shuffles, we're trying to match the root width for better
40225 // writemask folding, so attempt to scale the mask.
40226 // TODO - variable shuffles might need this to be widened again.
40227 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40228 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40229 int MaskScale = NumRootElts / Mask.size();
40230 SmallVector<int, 64> ScaledMask;
40231 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40232 Mask = std::move(ScaledMask);
40233 }
40234
40235 unsigned NumMaskElts = Mask.size();
40236 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40237 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40238
40239 // Determine the effective mask value type.
40240 FloatDomain &= (32 <= MaskEltSizeInBits);
40241 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40242 : MVT::getIntegerVT(MaskEltSizeInBits);
40243 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40244
40245 // Only allow legal mask types.
40246 if (!TLI.isTypeLegal(MaskVT))
40247 return SDValue();
40248
40249 // Attempt to match the mask against known shuffle patterns.
40250 MVT ShuffleSrcVT, ShuffleVT;
40251 unsigned Shuffle, PermuteImm;
40252
40253 // Which shuffle domains are permitted?
40254 // Permit domain crossing at higher combine depths.
40255 // TODO: Should we indicate which domain is preferred if both are allowed?
40256 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40257 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40258 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40259
40260 // Determine zeroable mask elements.
40261 APInt KnownUndef, KnownZero;
40262 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40263 APInt Zeroable = KnownUndef | KnownZero;
40264
40265 if (UnaryShuffle) {
40266 // Attempt to match against broadcast-from-vector.
40267 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40268 if ((Subtarget.hasAVX2() ||
40269 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40270 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40271 if (isUndefOrEqual(Mask, 0)) {
40272 if (V1.getValueType() == MaskVT &&
40273 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40274 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40275 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40276 return SDValue(); // Nothing to do!
40277 Res = V1.getOperand(0);
40278 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40279 return DAG.getBitcast(RootVT, Res);
40280 }
40281 if (Subtarget.hasAVX2()) {
40282 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40283 return SDValue(); // Nothing to do!
40284 Res = CanonicalizeShuffleInput(MaskVT, V1);
40285 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40286 return DAG.getBitcast(RootVT, Res);
40287 }
40288 }
40289 }
40290
40291 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40292 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40293 (!IsMaskedShuffle ||
40294 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40295 if (Depth == 0 && RootOpc == Shuffle)
40296 return SDValue(); // Nothing to do!
40297 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40298 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40299 return DAG.getBitcast(RootVT, Res);
40300 }
40301
40302 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40303 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40304 PermuteImm) &&
40305 (!IsMaskedShuffle ||
40306 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40307 if (Depth == 0 && RootOpc == Shuffle)
40308 return SDValue(); // Nothing to do!
40309 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40310 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40311 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40312 return DAG.getBitcast(RootVT, Res);
40313 }
40314 }
40315
40316 // Attempt to combine to INSERTPS, but only if the inserted element has come
40317 // from a scalar.
40318 // TODO: Handle other insertions here as well?
40319 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40320 Subtarget.hasSSE41() &&
40321 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40322 if (MaskEltSizeInBits == 32) {
40323 SDValue SrcV1 = V1, SrcV2 = V2;
40324 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40325 DAG) &&
40326 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40327 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40328 return SDValue(); // Nothing to do!
40329 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40330 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40331 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40332 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40333 return DAG.getBitcast(RootVT, Res);
40334 }
40335 }
40336 if (MaskEltSizeInBits == 64 &&
40337 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40338 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40339 V2.getScalarValueSizeInBits() <= 32) {
40340 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40341 return SDValue(); // Nothing to do!
40342 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40343 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40344 CanonicalizeShuffleInput(MVT::v4f32, V1),
40345 CanonicalizeShuffleInput(MVT::v4f32, V2),
40346 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40347 return DAG.getBitcast(RootVT, Res);
40348 }
40349 }
40350
40351 SDValue NewV1 = V1; // Save operands in case early exit happens.
40352 SDValue NewV2 = V2;
40353 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40354 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40355 ShuffleVT, UnaryShuffle) &&
40356 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40357 if (Depth == 0 && RootOpc == Shuffle)
40358 return SDValue(); // Nothing to do!
40359 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40360 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40361 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40362 return DAG.getBitcast(RootVT, Res);
40363 }
40364
40365 NewV1 = V1; // Save operands in case early exit happens.
40366 NewV2 = V2;
40367 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40368 AllowIntDomain, NewV1, NewV2, DL, DAG,
40369 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40370 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40371 if (Depth == 0 && RootOpc == Shuffle)
40372 return SDValue(); // Nothing to do!
40373 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40374 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40375 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40376 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40377 return DAG.getBitcast(RootVT, Res);
40378 }
40379
40380 // Typically from here on, we need an integer version of MaskVT.
40381 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40382 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40383
40384 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40385 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40386 uint64_t BitLen, BitIdx;
40387 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40388 Zeroable)) {
40389 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40390 return SDValue(); // Nothing to do!
40391 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40392 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40393 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40394 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40395 return DAG.getBitcast(RootVT, Res);
40396 }
40397
40398 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40399 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40400 return SDValue(); // Nothing to do!
40401 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40402 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40403 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40404 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40405 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40406 return DAG.getBitcast(RootVT, Res);
40407 }
40408 }
40409
40410 // Match shuffle against TRUNCATE patterns.
40411 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40412 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40413 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40414 Subtarget)) {
40415 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40416 ShuffleSrcVT.getVectorNumElements();
40417 unsigned Opc =
40418 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40419 if (Depth == 0 && RootOpc == Opc)
40420 return SDValue(); // Nothing to do!
40421 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40422 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40423 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40424 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40425 return DAG.getBitcast(RootVT, Res);
40426 }
40427
40428 // Do we need a more general binary truncation pattern?
40429 if (RootSizeInBits < 512 &&
40430 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40431 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40432 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40433 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40434 // Bail if this was already a truncation or PACK node.
40435 // We sometimes fail to match PACK if we demand known undef elements.
40436 if (Depth == 0 &&
40437 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40438 RootOpc == X86ISD::PACKUS))
40439 return SDValue(); // Nothing to do!
40440 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40441 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40442 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40443 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40444 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40445 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40446 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40447 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40448 return DAG.getBitcast(RootVT, Res);
40449 }
40450 }
40451
40452 // Don't try to re-form single instruction chains under any circumstances now
40453 // that we've done encoding canonicalization for them.
40454 if (Depth < 1)
40455 return SDValue();
40456
40457 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40458 return isTargetShuffleVariableMask(N->getOpcode());
40459 });
40460 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40461 return (N->getOpcode() == X86ISD::VPERMV3 ||
40462 N->getOpcode() == X86ISD::VPERMV);
40463 });
40464
40465 // Depth threshold above which we can efficiently use variable mask shuffles.
40466 int VariableCrossLaneShuffleDepth =
40467 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40468 int VariablePerLaneShuffleDepth =
40469 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40470 AllowVariableCrossLaneMask &=
40471 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40472 AllowVariablePerLaneMask &=
40473 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40474 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40475 // higher depth before combining them.
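// For example, with no variable-mask nodes already in the chain and no
// fast-variable-shuffle features, the cross-lane threshold above is 2 and
// BWIVPERMV3ShuffleDepth below works out to 2 + 2 - 0 = 4.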
40476 int BWIVPERMV3ShuffleDepth =
40477 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40478 bool AllowBWIVPERMV3 =
40479 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40480
40481 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40482 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40483 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40484
40485 bool MaskContainsZeros = isAnyZero(Mask);
40486
40487 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40488 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40489 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40490 if (Subtarget.hasAVX2() &&
40491 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40492 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40493 Res = CanonicalizeShuffleInput(MaskVT, V1);
40494 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40495 return DAG.getBitcast(RootVT, Res);
40496 }
40497 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40498 if ((Subtarget.hasAVX512() &&
40499 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40500 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40501 (Subtarget.hasBWI() &&
40502 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40503 (Subtarget.hasVBMI() &&
40504 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40505 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40506 V2 = DAG.getUNDEF(MaskVT);
40507 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40508 return DAG.getBitcast(RootVT, Res);
40509 }
40510 }
40511
40512 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40513 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40514 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40515 ((Subtarget.hasAVX512() &&
40516 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40517 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40518 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40519 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40520 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40521 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40522 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40523 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40524 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40525 for (unsigned i = 0; i != NumMaskElts; ++i)
40526 if (Mask[i] == SM_SentinelZero)
40527 Mask[i] = NumMaskElts + i;
40528 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40529 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40530 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40531 return DAG.getBitcast(RootVT, Res);
40532 }
40533
40534 // If that failed and either input is extracted then try to combine as a
40535 // shuffle with the larger type.
40536 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40537 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40538 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40539 IsMaskedShuffle, DAG, DL, Subtarget))
40540 return WideShuffle;
40541
40542 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40543 // (non-VLX will pad to 512-bit shuffles).
40544 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40545 ((Subtarget.hasAVX512() &&
40546 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40547 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40548 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40549 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40550 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40551 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40552 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40553 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40554 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40555 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40556 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40557 return DAG.getBitcast(RootVT, Res);
40558 }
40559 return SDValue();
40560 }
40561
40562 // See if we can combine a single input shuffle with zeros to a bit-mask,
40563 // which is much simpler than any shuffle.
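// For example, a v4i32 mask of {0, SM_SentinelZero, 2, SM_SentinelZero} keeps
// elements 0 and 2 in place and zeroes the rest, so it becomes an AND with the
// constant vector {-1, 0, -1, 0}.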
40564 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40565 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40566 TLI.isTypeLegal(MaskVT)) {
40567 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40568 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40569 APInt UndefElts(NumMaskElts, 0);
40570 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40571 for (unsigned i = 0; i != NumMaskElts; ++i) {
40572 int M = Mask[i];
40573 if (M == SM_SentinelUndef) {
40574 UndefElts.setBit(i);
40575 continue;
40576 }
40577 if (M == SM_SentinelZero)
40578 continue;
40579 EltBits[i] = AllOnes;
40580 }
40581 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40582 Res = CanonicalizeShuffleInput(MaskVT, V1);
40583 unsigned AndOpcode =
40584 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40585 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40586 return DAG.getBitcast(RootVT, Res);
40587 }
40588
40589 // If we have a single input shuffle with different shuffle patterns in the
40590 // 128-bit lanes, use the variable mask to VPERMILPS.
40591 // TODO: Combine other mask types at higher depths.
40592 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40593 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40594 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40595 SmallVector<SDValue, 16> VPermIdx;
40596 for (int M : Mask) {
40597 SDValue Idx =
40598 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40599 VPermIdx.push_back(Idx);
40600 }
40601 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40602 Res = CanonicalizeShuffleInput(MaskVT, V1);
40603 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40604 return DAG.getBitcast(RootVT, Res);
40605 }
40606
40607 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40608 // to VPERMIL2PD/VPERMIL2PS.
40609 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40610 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40611 MaskVT == MVT::v8f32)) {
40612 // VPERMIL2 Operation.
40613 // Bits[3] - Match Bit.
40614 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40615 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
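// For example, with a v8f32 mask the shuffle index M = 9 (element 1 of the
// second source) maps to the per-lane selector (9 % 4) + ((9 / 8) * 4) = 5;
// for 64-bit elements the selector is doubled so it lands in Bits[2:1].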
40616 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40617 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40618 SmallVector<int, 8> VPerm2Idx;
40619 unsigned M2ZImm = 0;
40620 for (int M : Mask) {
40621 if (M == SM_SentinelUndef) {
40622 VPerm2Idx.push_back(-1);
40623 continue;
40624 }
40625 if (M == SM_SentinelZero) {
40626 M2ZImm = 2;
40627 VPerm2Idx.push_back(8);
40628 continue;
40629 }
40630 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40631 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40632 VPerm2Idx.push_back(Index);
40633 }
40634 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40635 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40636 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40637 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40638 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40639 return DAG.getBitcast(RootVT, Res);
40640 }
40641
40642 // If we have 3 or more shuffle instructions or a chain involving a variable
40643 // mask, we can replace them with a single PSHUFB instruction profitably.
40644 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40645 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40646 // more aggressive.
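// For example, a v4i32 mask of {1, 0, 3, 2} expands to the byte-level PSHUFB
// mask {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}, and zeroable elements use
// 0x80 so that PSHUFB clears the destination byte.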
40647 if (UnaryShuffle && AllowVariablePerLaneMask &&
40648 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40649 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40650 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40651 SmallVector<SDValue, 16> PSHUFBMask;
40652 int NumBytes = RootVT.getSizeInBits() / 8;
40653 int Ratio = NumBytes / NumMaskElts;
40654 for (int i = 0; i < NumBytes; ++i) {
40655 int M = Mask[i / Ratio];
40656 if (M == SM_SentinelUndef) {
40657 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40658 continue;
40659 }
40660 if (M == SM_SentinelZero) {
40661 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40662 continue;
40663 }
40664 M = Ratio * M + i % Ratio;
40665 assert((M / 16) == (i / 16) && "Lane crossing detected");
40666 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40667 }
40668 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40669 Res = CanonicalizeShuffleInput(ByteVT, V1);
40670 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40671 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40672 return DAG.getBitcast(RootVT, Res);
40673 }
40674
40675 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40676 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40677 // slower than PSHUFB on targets that support both.
40678 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40679 Subtarget.hasXOP()) {
40680 // VPPERM Mask Operation
40681 // Bits[4:0] - Byte Index (0 - 31)
40682 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
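// For example, pushing 0x80 below selects operation 4 (ZERO) so the
// destination byte is cleared, while byte indices 16-31 read from the second
// source operand.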
40683 SmallVector<SDValue, 16> VPPERMMask;
40684 int NumBytes = 16;
40685 int Ratio = NumBytes / NumMaskElts;
40686 for (int i = 0; i < NumBytes; ++i) {
40687 int M = Mask[i / Ratio];
40688 if (M == SM_SentinelUndef) {
40689 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40690 continue;
40691 }
40692 if (M == SM_SentinelZero) {
40693 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40694 continue;
40695 }
40696 M = Ratio * M + i % Ratio;
40697 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40698 }
40699 MVT ByteVT = MVT::v16i8;
40700 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40701 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40702 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40703 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40704 return DAG.getBitcast(RootVT, Res);
40705 }
40706
40707 // If that failed and either input is extracted then try to combine as a
40708 // shuffle with the larger type.
40709 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40710 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40711 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40712 DAG, DL, Subtarget))
40713 return WideShuffle;
40714
40715 // If we have a dual input shuffle then lower to VPERMV3,
40716 // (non-VLX will pad to 512-bit shuffles)
40717 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40718 ((Subtarget.hasAVX512() &&
40719 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40720 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40721 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40722 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40723 MaskVT == MVT::v16i32)) ||
40724 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40725 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40726 MaskVT == MVT::v32i16)) ||
40727 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40728 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40729 MaskVT == MVT::v64i8)))) {
40730 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40731 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40732 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40733 return DAG.getBitcast(RootVT, Res);
40734 }
40735
40736 // Failed to find any combines.
40737 return SDValue();
40738}
40739
40740// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40741// instruction if possible.
40742//
40743// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40744// type size to attempt to combine:
40745// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40746// -->
40747// extract_subvector(shuffle(x,y,m2),0)
40748 static SDValue combineX86ShuffleChainWithExtract(
40749 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40750 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40751 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40752 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40753 const X86Subtarget &Subtarget) {
40754 unsigned NumMaskElts = BaseMask.size();
40755 unsigned NumInputs = Inputs.size();
40756 if (NumInputs == 0)
40757 return SDValue();
40758
40759 unsigned RootSizeInBits = RootVT.getSizeInBits();
40760 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40761 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40762
40763 // Peek through subvectors to find widest legal vector.
40764 // TODO: Handle ISD::TRUNCATE
40765 unsigned WideSizeInBits = RootSizeInBits;
40766 for (SDValue Input : Inputs) {
40767 Input = peekThroughBitcasts(Input);
40768 while (1) {
40769 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40770 Input = peekThroughBitcasts(Input.getOperand(0));
40771 continue;
40772 }
40773 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40774 Input.getOperand(0).isUndef() &&
40775 isNullConstant(Input.getOperand(2))) {
40776 Input = peekThroughBitcasts(Input.getOperand(1));
40777 continue;
40778 }
40779 break;
40780 }
40781 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40782 WideSizeInBits < Input.getValueSizeInBits())
40783 WideSizeInBits = Input.getValueSizeInBits();
40784 }
40785
40786 // Bail if we fail to find a source larger than the existing root.
40787 if (WideSizeInBits <= RootSizeInBits ||
40788 (WideSizeInBits % RootSizeInBits) != 0)
40789 return SDValue();
40790
40791 // Create new mask for larger type.
40792 SmallVector<int, 64> WideMask;
40793 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40794
40795 // Attempt to peek through inputs and adjust mask when we extract from an
40796 // upper subvector.
40797 int AdjustedMasks = 0;
40798 SmallVector<SDValue, 4> WideInputs(Inputs);
40799 for (unsigned I = 0; I != NumInputs; ++I) {
40800 SDValue &Input = WideInputs[I];
40801 Input = peekThroughBitcasts(Input);
40802 while (1) {
40803 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40804 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40805 uint64_t Idx = Input.getConstantOperandVal(1);
40806 if (Idx != 0) {
40807 ++AdjustedMasks;
40808 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40809 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40810
40811 int lo = I * WideMask.size();
40812 int hi = (I + 1) * WideMask.size();
40813 for (int &M : WideMask)
40814 if (lo <= M && M < hi)
40815 M += Idx;
40816 }
40817 Input = peekThroughBitcasts(Input.getOperand(0));
40818 continue;
40819 }
40820 // TODO: Handle insertions into upper subvectors.
40821 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40822 Input.getOperand(0).isUndef() &&
40823 isNullConstant(Input.getOperand(2))) {
40824 Input = peekThroughBitcasts(Input.getOperand(1));
40825 continue;
40826 }
40827 break;
40828 }
40829 }
40830
40831 // Remove unused/repeated shuffle source ops.
40832 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40833 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40834
40835 // Bail if we're always extracting from the lowest subvectors
40836 // (combineX86ShuffleChain should match this for the current width), or if
40837 // the shuffle still references too many inputs.
40838 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40839 return SDValue();
40840
40841 // Minor canonicalization of the accumulated shuffle mask to make it easier
40842 // to match below. All this does is detect masks with sequential pairs of
40843 // elements, and shrink them to the half-width mask. It does this in a loop
40844 // so it will reduce the size of the mask to the minimal width mask which
40845 // performs an equivalent shuffle.
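// For example, the mask {0,1, 4,5, 2,3, 6,7} pairs up cleanly and shrinks to
// {0, 2, 1, 3}; widening is repeated until no more sequential pairs remain.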
40846 while (WideMask.size() > 1) {
40847 SmallVector<int, 64> WidenedMask;
40848 if (!canWidenShuffleElements(WideMask, WidenedMask))
40849 break;
40850 WideMask = std::move(WidenedMask);
40851 }
40852
40853 // Canonicalization of binary shuffle masks to improve pattern matching by
40854 // commuting the inputs.
40855 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40856 ShuffleVectorSDNode::commuteMask(WideMask);
40857 std::swap(WideInputs[0], WideInputs[1]);
40858 }
40859
40860 // Increase depth for every upper subvector we've peeked through.
40861 Depth += AdjustedMasks;
40862
40863 // Attempt to combine wider chain.
40864 // TODO: Can we use a better Root?
40865 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40866 WideInputs.back().getValueSizeInBits()
40867 ? WideInputs.front()
40868 : WideInputs.back();
40869 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40870 "WideRootSize mismatch");
40871
40872 if (SDValue WideShuffle = combineX86ShuffleChain(
40873 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40874 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40875 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40876 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40877 return DAG.getBitcast(RootVT, WideShuffle);
40878 }
40879
40880 return SDValue();
40881}
40882
40883// Canonicalize the combined shuffle mask chain with horizontal ops.
40884// NOTE: This may update the Ops and Mask.
40885 static SDValue canonicalizeShuffleMaskWithHorizOp(
40886 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40887 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40888 const X86Subtarget &Subtarget) {
40889 if (Mask.empty() || Ops.empty())
40890 return SDValue();
40891
40892 SmallVector<SDValue> BC;
40893 for (SDValue Op : Ops)
40894 BC.push_back(peekThroughBitcasts(Op));
40895
40896 // All ops must be the same horizop + type.
40897 SDValue BC0 = BC[0];
40898 EVT VT0 = BC0.getValueType();
40899 unsigned Opcode0 = BC0.getOpcode();
40900 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40901 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40902 }))
40903 return SDValue();
40904
40905 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40906 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40907 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40908 if (!isHoriz && !isPack)
40909 return SDValue();
40910
40911 // Do all ops have a single use?
40912 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40913 return Op.hasOneUse() &&
40914 peekThroughBitcasts(Op).hasOneUse();
40915 });
40916
40917 int NumElts = VT0.getVectorNumElements();
40918 int NumLanes = VT0.getSizeInBits() / 128;
40919 int NumEltsPerLane = NumElts / NumLanes;
40920 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40921 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40922 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40923
40924 if (NumEltsPerLane >= 4 &&
40925 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40926 SmallVector<int> LaneMask, ScaledMask;
40927 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40928 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40929 // See if we can remove the shuffle by re-sorting the HOP chain so that
40930 // the HOP args are pre-shuffled.
40931 // TODO: Generalize to any sized/depth chain.
40932 // TODO: Add support for PACKSS/PACKUS.
40933 if (isHoriz) {
40934 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40935 auto GetHOpSrc = [&](int M) {
40936 if (M == SM_SentinelUndef)
40937 return DAG.getUNDEF(VT0);
40938 if (M == SM_SentinelZero)
40939 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40940 SDValue Src0 = BC[M / 4];
40941 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40942 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40943 return Src1.getOperand(M % 2);
40944 return SDValue();
40945 };
40946 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40947 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40948 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40949 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40950 if (M0 && M1 && M2 && M3) {
40951 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40952 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40953 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40954 }
40955 }
40956 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40957 if (Ops.size() >= 2) {
40958 SDValue LHS, RHS;
40959 auto GetHOpSrc = [&](int M, int &OutM) {
40960 // TODO: Support SM_SentinelZero
40961 if (M < 0)
40962 return M == SM_SentinelUndef;
40963 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40964 if (!LHS || LHS == Src) {
40965 LHS = Src;
40966 OutM = (M % 2);
40967 return true;
40968 }
40969 if (!RHS || RHS == Src) {
40970 RHS = Src;
40971 OutM = (M % 2) + 2;
40972 return true;
40973 }
40974 return false;
40975 };
40976 int PostMask[4] = {-1, -1, -1, -1};
40977 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40978 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40979 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40980 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40981 LHS = DAG.getBitcast(SrcVT, LHS);
40982 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40983 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40984 // Use SHUFPS for the permute so this will work on SSE2 targets,
40985 // shuffle combining and domain handling will simplify this later on.
40986 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40987 Res = DAG.getBitcast(ShuffleVT, Res);
40988 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40989 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40990 }
40991 }
40992 }
40993 }
40994
40995 if (2 < Ops.size())
40996 return SDValue();
40997
40998 SDValue BC1 = BC[BC.size() - 1];
40999 if (Mask.size() == VT0.getVectorNumElements()) {
41000 // Canonicalize binary shuffles of horizontal ops that use the
41001 // same sources to an unary shuffle.
41002 // TODO: Try to perform this fold even if the shuffle remains.
41003 if (Ops.size() == 2) {
41004 auto ContainsOps = [](SDValue HOp, SDValue Op) {
41005 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
41006 };
41007 // Commute if all BC0's ops are contained in BC1.
41008 if (ContainsOps(BC1, BC0.getOperand(0)) &&
41009 ContainsOps(BC1, BC0.getOperand(1))) {
41010 ShuffleVectorSDNode::commuteMask(Mask);
41011 std::swap(Ops[0], Ops[1]);
41012 std::swap(BC0, BC1);
41013 }
41014
41015 // If BC1 can be represented by BC0, then convert to unary shuffle.
41016 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41017 ContainsOps(BC0, BC1.getOperand(1))) {
41018 for (int &M : Mask) {
41019 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41020 continue;
41021 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41022 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41023 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41024 M += NumHalfEltsPerLane;
41025 }
41026 }
41027 }
41028
41029 // Canonicalize unary horizontal ops to only refer to lower halves.
41030 for (int i = 0; i != NumElts; ++i) {
41031 int &M = Mask[i];
41032 if (isUndefOrZero(M))
41033 continue;
41034 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41035 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41036 M -= NumHalfEltsPerLane;
41037 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41038 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41039 M -= NumHalfEltsPerLane;
41040 }
41041 }
41042
41043 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41044 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41045 // represents the LHS/RHS inputs for the lower/upper halves.
41046 SmallVector<int, 16> TargetMask128, WideMask128;
41047 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41048 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41049 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41050 bool SingleOp = (Ops.size() == 1);
41051 if (isPack || OneUseOps ||
41052 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41053 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41054 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41055 Lo = Lo.getOperand(WideMask128[0] & 1);
41056 Hi = Hi.getOperand(WideMask128[1] & 1);
41057 if (SingleOp) {
41058 SDValue Undef = DAG.getUNDEF(SrcVT);
41059 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41060 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41061 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41062 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41063 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41064 }
41065 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41066 }
41067 }
41068
41069 // If we are post-shuffling a 256-bit hop and not requiring the upper
41070 // elements, then try to narrow to a 128-bit hop directly.
41071 SmallVector<int, 16> WideMask64;
41072 if (Ops.size() == 1 && NumLanes == 2 &&
41073 scaleShuffleElements(Mask, 4, WideMask64) &&
41074 isUndefInRange(WideMask64, 2, 2)) {
41075 int M0 = WideMask64[0];
41076 int M1 = WideMask64[1];
41077 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41078 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41079 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41080 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41081 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41082 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41083 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41084 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41085 }
41086 }
41087
41088 return SDValue();
41089}
41090
41091// Attempt to constant fold all of the constant source ops.
41092// Returns true if the entire shuffle is folded to a constant.
41093// TODO: Extend this to merge multiple constant Ops and update the mask.
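// For example, if every source op is a constant build vector, the shuffled
// result is materialized directly as a new constant vector (or as a zero
// vector when all lanes end up zero/undef).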
41094 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41095 ArrayRef<int> Mask,
41096 ArrayRef<const SDNode *> SrcNodes,
41097 SelectionDAG &DAG, const SDLoc &DL,
41098 const X86Subtarget &Subtarget) {
41099 unsigned SizeInBits = VT.getSizeInBits();
41100 unsigned NumMaskElts = Mask.size();
41101 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41102 unsigned NumOps = Ops.size();
41103
41104 // Extract constant bits from each source op.
41105 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41106 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41107 for (unsigned I = 0; I != NumOps; ++I)
41108 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41109 RawBitsOps[I],
41110 /*AllowWholeUndefs*/ true,
41111 /*AllowPartialUndefs*/ true))
41112 return SDValue();
41113
41114 // If we're optimizing for size, only fold if at least one of the constants is
41115 // only used once or the combined shuffle has included a variable mask
41116 // shuffle; this is to avoid constant pool bloat.
41117 bool IsOptimizingSize = DAG.shouldOptForSize();
41118 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41119 return isTargetShuffleVariableMask(N->getOpcode());
41120 });
41121 if (IsOptimizingSize && !HasVariableMask &&
41122 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41123 return SDValue();
41124
41125 // Shuffle the constant bits according to the mask.
41126 APInt UndefElts(NumMaskElts, 0);
41127 APInt ZeroElts(NumMaskElts, 0);
41128 APInt ConstantElts(NumMaskElts, 0);
41129 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41130 APInt::getZero(MaskSizeInBits));
41131 for (unsigned i = 0; i != NumMaskElts; ++i) {
41132 int M = Mask[i];
41133 if (M == SM_SentinelUndef) {
41134 UndefElts.setBit(i);
41135 continue;
41136 } else if (M == SM_SentinelZero) {
41137 ZeroElts.setBit(i);
41138 continue;
41139 }
41140 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41141
41142 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41143 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41144
41145 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41146 if (SrcUndefElts[SrcMaskIdx]) {
41147 UndefElts.setBit(i);
41148 continue;
41149 }
41150
41151 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41152 APInt &Bits = SrcEltBits[SrcMaskIdx];
41153 if (!Bits) {
41154 ZeroElts.setBit(i);
41155 continue;
41156 }
41157
41158 ConstantElts.setBit(i);
41159 ConstantBitData[i] = Bits;
41160 }
41161 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41162
41163 // Attempt to create a zero vector.
41164 if ((UndefElts | ZeroElts).isAllOnes())
41165 return getZeroVector(VT, Subtarget, DAG, DL);
41166
41167 // Create the constant data.
41168 MVT MaskSVT;
41169 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41170 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41171 else
41172 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41173
41174 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41175 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41176 return SDValue();
41177
41178 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41179 return DAG.getBitcast(VT, CstOp);
41180}
41181
41182namespace llvm {
41183 namespace X86 {
41184 enum {
41185 MaxShuffleCombineDepth = 8
41186 };
41187 } // namespace X86
41188} // namespace llvm
41189
41190/// Fully generic combining of x86 shuffle instructions.
41191///
41192/// This should be the last combine run over the x86 shuffle instructions. Once
41193/// they have been fully optimized, this will recursively consider all chains
41194/// of single-use shuffle instructions, build a generic model of the cumulative
41195/// shuffle operation, and check for simpler instructions which implement this
41196/// operation. We use this primarily for two purposes:
41197///
41198/// 1) Collapse generic shuffles to specialized single instructions when
41199/// equivalent. In most cases, this is just an encoding size win, but
41200/// sometimes we will collapse multiple generic shuffles into a single
41201/// special-purpose shuffle.
41202/// 2) Look for sequences of shuffle instructions with 3 or more total
41203/// instructions, and replace them with the slightly more expensive SSSE3
41204/// PSHUFB instruction if available. We do this as the last combining step
41205/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41206/// a suitable short sequence of other instructions. The PSHUFB will either
41207/// use a register or have to read from memory and so is slightly (but only
41208/// slightly) more expensive than the other shuffle instructions.
41209///
41210/// Because this is inherently a quadratic operation (for each shuffle in
41211/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41212/// This should never be an issue in practice as the shuffle lowering doesn't
41213/// produce sequences of more than 8 instructions.
41214///
41215/// FIXME: We will currently miss some cases where the redundant shuffling
41216/// would simplify under the threshold for PSHUFB formation because of
41217/// combine-ordering. To fix this, we should do the redundant instruction
41218/// combining in this recursive walk.
41219 static SDValue combineX86ShufflesRecursively(
41220 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41221 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41222 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41223 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41224 const SDLoc &DL, const X86Subtarget &Subtarget) {
41225 assert(!RootMask.empty() &&
41226 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41227 "Illegal shuffle root mask");
41228 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41229 unsigned RootSizeInBits = RootVT.getSizeInBits();
41230 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41231
41232 // Bound the depth of our recursive combine because this is ultimately
41233 // quadratic in nature.
41234 if (Depth >= MaxDepth)
41235 return SDValue();
41236
41237 // Directly rip through bitcasts to find the underlying operand.
41238 SDValue Op = SrcOps[SrcOpIndex];
41239 Op = peekThroughBitcasts(Op);
41240
41241 EVT VT = Op.getValueType();
41242 if (!VT.isVector() || !VT.isSimple())
41243 return SDValue(); // Bail if we hit a non-simple non-vector.
41244
41245 // FIXME: Just bail on f16 for now.
41246 if (VT.getVectorElementType() == MVT::f16)
41247 return SDValue();
41248
41249 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41250 "Can only combine shuffles upto size of the root op.");
41251
41252 // Create a demanded elts mask from the referenced elements of Op.
41253 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41254 for (int M : RootMask) {
41255 int BaseIdx = RootMask.size() * SrcOpIndex;
41256 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41257 OpDemandedElts.setBit(M - BaseIdx);
41258 }
41259 if (RootSizeInBits != VT.getSizeInBits()) {
41260 // Op is smaller than Root - extract the demanded elts for the subvector.
41261 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41262 unsigned NumOpMaskElts = RootMask.size() / Scale;
41263 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41264 assert(OpDemandedElts
41265 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41266 .isZero() &&
41267 "Out of range elements referenced in root mask");
41268 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41269 }
41270 OpDemandedElts =
41271 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41272
41273 // Extract target shuffle mask and resolve sentinels and inputs.
41274 SmallVector<int, 64> OpMask;
41275 SmallVector<SDValue, 2> OpInputs;
41276 APInt OpUndef, OpZero;
41277 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41278 OpZero, DAG, Depth, false)) {
41279 // Shuffle inputs must not be larger than the shuffle result.
41280 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41281 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41282 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41283 }))
41284 return SDValue();
41285 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41286 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41287 !isNullConstant(Op.getOperand(1))) {
41288 SDValue SrcVec = Op.getOperand(0);
41289 int ExtractIdx = Op.getConstantOperandVal(1);
41290 unsigned NumElts = VT.getVectorNumElements();
41291 OpInputs.assign({SrcVec});
41292 OpMask.assign(NumElts, SM_SentinelUndef);
41293 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41294 OpZero = OpUndef = APInt::getZero(NumElts);
41295 } else {
41296 return SDValue();
41297 }
41298
41299 // If the shuffle result was smaller than the root, we need to adjust the
41300 // mask indices and pad the mask with undefs.
41301 if (RootSizeInBits > VT.getSizeInBits()) {
41302 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41303 unsigned OpMaskSize = OpMask.size();
41304 if (OpInputs.size() > 1) {
41305 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41306 for (int &M : OpMask) {
41307 if (M < 0)
41308 continue;
41309 int EltIdx = M % OpMaskSize;
41310 int OpIdx = M / OpMaskSize;
41311 M = (PaddedMaskSize * OpIdx) + EltIdx;
41312 }
41313 }
41314 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41315 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41316 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41317 }
41318
41319 SmallVector<int, 64> Mask;
41320 SmallVector<SDValue, 16> Ops;
41321
41322 // We don't need to merge masks if the root is empty.
41323 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41324 if (EmptyRoot) {
41325 // Only resolve zeros if it will remove an input, otherwise we might end
41326 // up in an infinite loop.
41327 bool ResolveKnownZeros = true;
41328 if (!OpZero.isZero()) {
41329 APInt UsedInputs = APInt::getZero(OpInputs.size());
41330 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41331 int M = OpMask[i];
41332 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41333 continue;
41334 UsedInputs.setBit(M / OpMask.size());
41335 if (UsedInputs.isAllOnes()) {
41336 ResolveKnownZeros = false;
41337 break;
41338 }
41339 }
41340 }
41341 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41342 ResolveKnownZeros);
41343
41344 Mask = OpMask;
41345 Ops.append(OpInputs.begin(), OpInputs.end());
41346 } else {
41347 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41348
41349 // Add the inputs to the Ops list, avoiding duplicates.
41350 Ops.append(SrcOps.begin(), SrcOps.end());
41351
41352 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41353 // Attempt to find an existing match.
41354 SDValue InputBC = peekThroughBitcasts(Input);
41355 for (int i = 0, e = Ops.size(); i < e; ++i)
41356 if (InputBC == peekThroughBitcasts(Ops[i]))
41357 return i;
41358 // Match failed - should we replace an existing Op?
41359 if (InsertionPoint >= 0) {
41360 Ops[InsertionPoint] = Input;
41361 return InsertionPoint;
41362 }
41363 // Add to the end of the Ops list.
41364 Ops.push_back(Input);
41365 return Ops.size() - 1;
41366 };
41367
41368 SmallVector<int, 2> OpInputIdx;
41369 for (SDValue OpInput : OpInputs)
41370 OpInputIdx.push_back(
41371 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41372
41373 assert(((RootMask.size() > OpMask.size() &&
41374 RootMask.size() % OpMask.size() == 0) ||
41375 (OpMask.size() > RootMask.size() &&
41376 OpMask.size() % RootMask.size() == 0) ||
41377 OpMask.size() == RootMask.size()) &&
41378 "The smaller number of elements must divide the larger.");
41379
41380 // This function can be performance-critical, so we rely on the power-of-2
41381 // knowledge that we have about the mask sizes to replace div/rem ops with
41382 // bit-masks and shifts.
41384 "Non-power-of-2 shuffle mask sizes");
41386 "Non-power-of-2 shuffle mask sizes");
41387 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41388 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41389
41390 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41391 unsigned RootRatio =
41392 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41393 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41394 assert((RootRatio == 1 || OpRatio == 1) &&
41395 "Must not have a ratio for both incoming and op masks!");
41396
41397 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41398 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41399 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41400 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41401 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41402
41403 Mask.resize(MaskWidth, SM_SentinelUndef);
41404
41405 // Merge this shuffle operation's mask into our accumulated mask. Note that
41406 // this shuffle's mask will be the first applied to the input, followed by
41407 // the root mask to get us all the way to the root value arrangement. The
41408 // reason for this order is that we are recursing up the operation chain.
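// For example, if RootMask has 4 elements and OpMask has 8 (RootRatio == 2,
// OpRatio == 1), root element RootMask[1] == 1 covers result elements 2 and 3
// and expands to the scaled indices 2 and 3, which are then looked up in
// OpMask to form the merged mask.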
41409 for (unsigned i = 0; i < MaskWidth; ++i) {
41410 unsigned RootIdx = i >> RootRatioLog2;
41411 if (RootMask[RootIdx] < 0) {
41412 // This is a zero or undef lane, we're done.
41413 Mask[i] = RootMask[RootIdx];
41414 continue;
41415 }
41416
41417 unsigned RootMaskedIdx =
41418 RootRatio == 1
41419 ? RootMask[RootIdx]
41420 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41421
41422 // Just insert the scaled root mask value if it references an input other
41423 // than the SrcOp we're currently inserting.
41424 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41425 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41426 Mask[i] = RootMaskedIdx;
41427 continue;
41428 }
41429
41430 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41431 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41432 if (OpMask[OpIdx] < 0) {
41433 // The incoming lanes are zero or undef, it doesn't matter which ones we
41434 // are using.
41435 Mask[i] = OpMask[OpIdx];
41436 continue;
41437 }
41438
41439 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41440 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41441 : (OpMask[OpIdx] << OpRatioLog2) +
41442 (RootMaskedIdx & (OpRatio - 1));
41443
41444 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41445 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41446 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41447 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41448
41449 Mask[i] = OpMaskedIdx;
41450 }
41451 }
41452
41453 // Peek through any free bitcasts to insert_subvector vector widenings or
41454 // extract_subvector nodes back to root size.
41455 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41456 for (auto [I, Op] : enumerate(Ops)) {
41457 SDValue BC = Op;
41458 while (1) {
41459 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41460 BC = BC.getOperand(0);
41461 continue;
41462 }
41463 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41464 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41465 // Set out of bounds mask indices to undef.
41466 Op = BC = BC.getOperand(1);
41467 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41468 int Lo = I * Mask.size();
41469 int Hi = (I + 1) * Mask.size();
41470 int NewHi = Lo + (Mask.size() / Scale);
41471 for (int &M : Mask) {
41472 if (Lo <= M && NewHi <= M && M < Hi)
41473 M = SM_SentinelUndef;
41474 }
41475 continue;
41476 }
41477 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41478 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41479 isNullConstant(BC.getOperand(1))) {
41480 Op = BC = BC.getOperand(0);
41481 continue;
41482 }
41483 break;
41484 }
41485 }
41486
41487 // Remove unused/repeated shuffle source ops.
41488 resolveTargetShuffleInputsAndMask(Ops, Mask);
41489
41490 // Handle the all undef/zero/ones cases early.
41491 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41492 return DAG.getUNDEF(RootVT);
41493 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41494 return getZeroVector(RootVT, Subtarget, DAG, DL);
41495 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41496 !llvm::is_contained(Mask, SM_SentinelZero))
41497 return getOnesVector(RootVT, DAG, DL);
41498
41499 assert(!Ops.empty() && "Shuffle with no inputs detected");
41500
41501 // Update the list of shuffle nodes that have been combined so far.
41502 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41503 CombinedNodes.push_back(Op.getNode());
41504
41505 // See if we can recurse into each shuffle source op (if it's a target
41506 // shuffle). The source op should only be generally combined if it either has
41507 // a single use (i.e. current Op) or all its users have already been combined;
41508 // if not, then we can still combine but should prevent generation of variable
41509 // shuffles to avoid constant pool bloat.
41510 // Don't recurse if we already have more source ops than we can combine in
41511 // the remaining recursion depth.
41512 if (Ops.size() < (MaxDepth - Depth)) {
41513 for (int i = 0, e = Ops.size(); i < e; ++i) {
41514 // For empty roots, we need to resolve zeroable elements before combining
41515 // them with other shuffles.
41516 SmallVector<int, 64> ResolvedMask = Mask;
41517 if (EmptyRoot)
41518 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41519 bool AllowCrossLaneVar = false;
41520 bool AllowPerLaneVar = false;
41521 if (Ops[i].getNode()->hasOneUse() ||
41522 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41523 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41524 AllowPerLaneVar = AllowVariablePerLaneMask;
41525 }
41526 if (SDValue Res = combineX86ShufflesRecursively(
41527 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41528 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41529 DAG, DL, Subtarget))
41530 return Res;
41531 }
41532 }
41533
41534 // Attempt to constant fold all of the constant source ops.
41535 if (SDValue Cst = combineX86ShufflesConstants(
41536 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41537 return Cst;
41538
41539 // If constant fold failed and we only have constants - then we have
41540 // multiple uses by a single non-variable shuffle - just bail.
41541 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41542 APInt UndefElts;
41543 SmallVector<APInt> RawBits;
41544 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41545 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41546 RawBits,
41547 /*AllowWholeUndefs*/ true,
41548 /*AllowPartialUndefs*/ true);
41549 })) {
41550 return SDValue();
41551 }
41552
41553 // Canonicalize the combined shuffle mask chain with horizontal ops.
41554 // NOTE: This will update the Ops and Mask.
41555 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41556 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41557 return DAG.getBitcast(RootVT, HOp);
41558
41559 // Try to refine our inputs given our knowledge of target shuffle mask.
41560 for (auto I : enumerate(Ops)) {
41561 int OpIdx = I.index();
41562 SDValue &Op = I.value();
41563
41564 // What range of shuffle mask element values results in picking from Op?
41565 int Lo = OpIdx * Mask.size();
41566 int Hi = Lo + Mask.size();
41567
41568 // Which elements of Op do we demand, given the mask's granularity?
41569 APInt OpDemandedElts(Mask.size(), 0);
41570 for (int MaskElt : Mask) {
41571 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41572 int OpEltIdx = MaskElt - Lo;
41573 OpDemandedElts.setBit(OpEltIdx);
41574 }
41575 }
41576
41577 // Is the shuffle result smaller than the root?
41578 if (Op.getValueSizeInBits() < RootSizeInBits) {
41579 // We padded the mask with undefs. But we now need to undo that.
41580 unsigned NumExpectedVectorElts = Mask.size();
41581 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41582 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41583 assert(!OpDemandedElts.extractBits(
41584 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41585 "Demanding the virtual undef widening padding?");
41586 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41587 }
41588
41589 // The Op itself may be of different VT, so we need to scale the mask.
41590 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41591 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41592
41593 // Can this operand be simplified any further, given its demanded elements?
41594 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41595 Op, OpScaledDemandedElts, DAG))
41596 Op = NewOp;
41597 }
41598 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41599
41600 // Widen any subvector shuffle inputs we've collected.
41601 // TODO: Remove this to avoid generating temporary nodes, we should only
41602 // widen once combineX86ShuffleChain has found a match.
41603 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41604 return Op.getValueSizeInBits() < RootSizeInBits;
41605 })) {
41606 for (SDValue &Op : Ops)
41607 if (Op.getValueSizeInBits() < RootSizeInBits)
41608 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41609 RootSizeInBits);
41610 // Reresolve - we might have repeated subvector sources.
41611 resolveTargetShuffleInputsAndMask(Ops, Mask);
41612 }
41613
41614 // Handle the all undef/zero/ones cases.
41615 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41616 return DAG.getUNDEF(RootVT);
41617 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41618 return getZeroVector(RootVT, Subtarget, DAG, DL);
41619 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41620 !llvm::is_contained(Mask, SM_SentinelZero))
41621 return getOnesVector(RootVT, DAG, DL);
41622
41623 assert(!Ops.empty() && "Shuffle with no inputs detected");
41624
41625 // We can only combine unary and binary shuffle mask cases.
41626 if (Ops.size() <= 2) {
41627 // Minor canonicalization of the accumulated shuffle mask to make it easier
41628 // to match below. All this does is detect masks with sequential pairs of
41629 // elements, and shrink them to the half-width mask. It does this in a loop
41630 // so it will reduce the size of the mask to the minimal width mask which
41631 // performs an equivalent shuffle.
41632 while (Mask.size() > 1) {
41633 SmallVector<int, 64> WidenedMask;
41634 if (!canWidenShuffleElements(Mask, WidenedMask))
41635 break;
41636 Mask = std::move(WidenedMask);
41637 }
41638
41639 // Canonicalization of binary shuffle masks to improve pattern matching by
41640 // commuting the inputs.
41641 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41642 ShuffleVectorSDNode::commuteMask(Mask);
41643 std::swap(Ops[0], Ops[1]);
41644 }
41645
41646 // Try to combine into a single shuffle instruction.
41647 if (SDValue Shuffle = combineX86ShuffleChain(
41648 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41649 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41650 IsMaskedShuffle, DAG, DL, Subtarget))
41651 return Shuffle;
41652
41653 // If all the operands come from the same larger vector, fallthrough and try
41654 // to use combineX86ShuffleChainWithExtract.
41655 SDValue LHS = peekThroughBitcasts(Ops.front());
41656 SDValue RHS = peekThroughBitcasts(Ops.back());
41657 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41658 (RootSizeInBits / Mask.size()) != 64 ||
41659 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41660 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41661 LHS.getOperand(0) != RHS.getOperand(0))
41662 return SDValue();
41663 }
41664
41665 // If that failed and any input is extracted then try to combine as a
41666 // shuffle with the larger type.
41667 return combineX86ShuffleChainWithExtract(
41668 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41669 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41670 DAG, DL, Subtarget);
41671}
41672
41673/// Helper entry wrapper to combineX86ShufflesRecursively.
41674 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41675 const X86Subtarget &Subtarget) {
41676 return combineX86ShufflesRecursively(
41677 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41678 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41679 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41680 SDLoc(Op), Subtarget);
41681}
41682
41683/// Get the PSHUF-style mask from PSHUF node.
41684///
41685 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41686/// PSHUF-style masks that can be reused with such instructions.
41687 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41688 MVT VT = N.getSimpleValueType();
41689 SmallVector<int, 4> Mask;
41690 SmallVector<SDValue, 2> Ops;
41691 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41692 (void)HaveMask;
41693 assert(HaveMask);
41694
41695 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41696 // matter. Check that the upper masks are repeats and remove them.
41697 if (VT.getSizeInBits() > 128) {
41698 int LaneElts = 128 / VT.getScalarSizeInBits();
41699#ifndef NDEBUG
41700 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41701 for (int j = 0; j < LaneElts; ++j)
41702 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41703 "Mask doesn't repeat in high 128-bit lanes!");
41704#endif
41705 Mask.resize(LaneElts);
41706 }
41707
41708 switch (N.getOpcode()) {
41709 case X86ISD::PSHUFD:
41710 return Mask;
41711 case X86ISD::PSHUFLW:
41712 Mask.resize(4);
41713 return Mask;
41714 case X86ISD::PSHUFHW:
41715 Mask.erase(Mask.begin(), Mask.begin() + 4);
41716 for (int &M : Mask)
41717 M -= 4;
41718 return Mask;
41719 default:
41720 llvm_unreachable("No valid shuffle instruction found!");
41721 }
41722}
41723
41724/// Get the expanded blend mask from a BLENDI node.
41725/// For v16i16 nodes, this will splat the repeated i8 mask.
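/// For example, an i8 immediate of 0x0F becomes the 16-bit mask 0x0F0F,
/// repeating the same selection for both halves of the vector.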
41726 static APInt getBLENDIBlendMask(SDValue V) {
41727 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41728 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41729 APInt Mask = V.getConstantOperandAPInt(2);
41730 if (Mask.getBitWidth() > NumElts)
41731 Mask = Mask.trunc(NumElts);
41732 if (NumElts == 16) {
41733 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41734 Mask = APInt::getSplat(16, Mask);
41735 }
41736 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41737 return Mask;
41738}
41739
41740/// Search for a combinable shuffle across a chain ending in pshufd.
41741///
41742/// We walk up the chain and look for a combinable shuffle, skipping over
41743/// shuffles that we could hoist this shuffle's transformation past without
41744/// altering anything.
41746 static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41747 const SDLoc &DL,
41748 SelectionDAG &DAG) {
41749 assert(N.getOpcode() == X86ISD::PSHUFD &&
41750 "Called with something other than an x86 128-bit half shuffle!");
41751
41752 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41753 // of the shuffles in the chain so that we can form a fresh chain to replace
41754 // this one.
41755 SmallVector<SDValue, 8> Chain;
41756 SDValue V = N.getOperand(0);
41757 for (; V.hasOneUse(); V = V.getOperand(0)) {
41758 switch (V.getOpcode()) {
41759 default:
41760 return SDValue(); // Nothing combined!
41761
41762 case ISD::BITCAST:
41763 // Skip bitcasts as we always know the type for the target specific
41764 // instructions.
41765 continue;
41766
41767 case X86ISD::PSHUFD:
41768 // Found another dword shuffle.
41769 break;
41770
41771 case X86ISD::PSHUFLW:
41772 // Check that the low words (being shuffled) are the identity in the
41773 // dword shuffle, and the high words are self-contained.
41774 if (Mask[0] != 0 || Mask[1] != 1 ||
41775 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41776 return SDValue();
41777
41778 Chain.push_back(V);
41779 continue;
41780
41781 case X86ISD::PSHUFHW:
41782 // Check that the high words (being shuffled) are the identity in the
41783 // dword shuffle, and the low words are self-contained.
41784 if (Mask[2] != 2 || Mask[3] != 3 ||
41785 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41786 return SDValue();
41787
41788 Chain.push_back(V);
41789 continue;
41790
41791 case X86ISD::UNPCKL:
41792 case X86ISD::UNPCKH:
41793 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41794 // shuffle into a preceding word shuffle.
41795 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41796 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41797 return SDValue();
41798
41799 // Search for a half-shuffle which we can combine with.
41800 unsigned CombineOp =
41801 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41802 if (V.getOperand(0) != V.getOperand(1) ||
41803 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41804 return SDValue();
41805 Chain.push_back(V);
41806 V = V.getOperand(0);
41807 do {
41808 switch (V.getOpcode()) {
41809 default:
41810 return SDValue(); // Nothing to combine.
41811
41812 case X86ISD::PSHUFLW:
41813 case X86ISD::PSHUFHW:
41814 if (V.getOpcode() == CombineOp)
41815 break;
41816
41817 Chain.push_back(V);
41818
41819 [[fallthrough]];
41820 case ISD::BITCAST:
41821 V = V.getOperand(0);
41822 continue;
41823 }
41824 break;
41825 } while (V.hasOneUse());
41826 break;
41827 }
41828 // Break out of the loop if we break out of the switch.
41829 break;
41830 }
41831
41832 if (!V.hasOneUse())
41833 // We fell out of the loop without finding a viable combining instruction.
41834 return SDValue();
41835
41836 // Merge this node's mask and our incoming mask.
41837 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41838 for (int &M : Mask)
41839 M = VMask[M];
41840 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41841 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41842
41843 // Rebuild the chain around this new shuffle.
41844 while (!Chain.empty()) {
41845 SDValue W = Chain.pop_back_val();
41846
41847 if (V.getValueType() != W.getOperand(0).getValueType())
41848 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41849
41850 switch (W.getOpcode()) {
41851 default:
41852 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41853
41854 case X86ISD::UNPCKL:
41855 case X86ISD::UNPCKH:
41856 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41857 break;
41858
41859 case X86ISD::PSHUFD:
41860 case X86ISD::PSHUFLW:
41861 case X86ISD::PSHUFHW:
41862 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41863 break;
41864 }
41865 }
41866 if (V.getValueType() != N.getValueType())
41867 V = DAG.getBitcast(N.getValueType(), V);
41868
41869 // Return the new chain to replace N.
41870 return V;
41871}
41872
41873// Attempt to commute shufps LHS loads:
41874// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41875static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41876 SelectionDAG &DAG) {
41877 // TODO: Add vXf64 support.
41878 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41879 return SDValue();
41880
41881 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41882 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41883 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41884 return SDValue();
41885 SDValue N0 = V.getOperand(0);
41886 SDValue N1 = V.getOperand(1);
41887 unsigned Imm = V.getConstantOperandVal(2);
41888 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41889 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41890 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41891 return SDValue();
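// SHUFPS builds its two low result elements from the first operand (imm bits
// [3:0]) and its two high result elements from the second (imm bits [7:4]),
// so commuting the operands amounts to swapping the two nibbles of the
// immediate.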
41892 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41893 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41894 DAG.getTargetConstant(Imm, DL, MVT::i8));
41895 };
41896
41897 switch (N.getOpcode()) {
41898 case X86ISD::VPERMILPI:
41899 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41900 unsigned Imm = N.getConstantOperandVal(1);
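// Commuting the inner SHUFP returns its old result with the element pairs
// swapped (element i moves to i ^ 2), so flip the high bit of each 2-bit
// selector in the outer VPERMILPI immediate, i.e. XOR with 0b10101010 (0xAA).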
41901 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41902 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41903 }
41904 break;
41905 case X86ISD::SHUFP: {
41906 SDValue N0 = N.getOperand(0);
41907 SDValue N1 = N.getOperand(1);
41908 unsigned Imm = N.getConstantOperandVal(2);
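// As with VPERMILPI above, a commuted inner SHUFP has its element pairs
// swapped (i -> i ^ 2), so the outer selectors reading from it get their high
// bit flipped: fields 0,1 (from LHS) -> XOR 0x0A, fields 2,3 (from RHS) ->
// XOR 0xA0, or all four fields -> XOR 0xAA when both operands are that node.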
41909 if (N0 == N1) {
41910 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41911 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41912 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41913 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41914 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41915 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41916 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41917 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41918 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41919 }
41920 break;
41921 }
41922 }
41923
41924 return SDValue();
41925}
41926
41927// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41928// iff we don't demand the same element index for both X and Y.
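// e.g. with a v4 blend mask <0,1,6,7>, LHS = permute(x,<2,3,u,u>) and
// RHS = permute(y,<u,u,0,1>) demand the disjoint index sets {2,3} and {0,1},
// so the whole pattern can become permute(blend(x,y,<4,5,2,3>), <2,3,0,1>).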
41929static SDValue
41930combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41931 const APInt &DemandedElts, SelectionDAG &DAG,
41932 const X86Subtarget &Subtarget, const SDLoc &DL) {
41933 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41934 if (!N0.hasOneUse() || !N1.hasOneUse())
41935 return SDValue();
41936
41937 unsigned NumElts = VT.getVectorNumElements();
41938 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41939 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41940
41941 // See if both operands are shuffles, and whether we can scale the shuffle
41942 // masks to the same width as the blend mask.
41943 // TODO: Support SM_SentinelZero?
41944 SmallVector<SDValue, 2> Ops0, Ops1;
41945 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41946 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41947 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41948 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41949 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41950 return SDValue();
41951
41952 // Determine the demanded elts from both permutes.
41953 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41954 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41955 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41956 Demanded1,
41957 /*AllowUndefElts=*/true) ||
41958 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41959 DemandedRHS0, /*AllowUndefElts=*/true) ||
41960 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41961 DemandedRHS1, /*AllowUndefElts=*/true))
41962 return SDValue();
41963
41964 // Confirm that we only use a single operand from both permutes and that we
41965 // don't demand the same index from both.
41966 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41967 DemandedLHS0.intersects(DemandedLHS1))
41968 return SDValue();
41969
41970 // Use the permute demanded elts masks as the new blend mask.
41971 // Create the new permute mask as a blend of the 2 original permute masks.
41972 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41973 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41974 for (unsigned I = 0; I != NumElts; ++I) {
41975 if (Demanded0[I]) {
41976 int M = ScaledMask0[I];
41977 if (0 <= M) {
41978 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41979 "BlendMask demands LHS AND RHS");
41980 NewBlendMask[M] = M;
41981 NewPermuteMask[I] = M;
41982 }
41983 } else if (Demanded1[I]) {
41984 int M = ScaledMask1[I];
41985 if (0 <= M) {
41986 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41987 "BlendMask demands LHS AND RHS");
41988 NewBlendMask[M] = M + NumElts;
41989 NewPermuteMask[I] = M;
41990 }
41991 }
41992 }
41993 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41994 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41995
41996 // v16i16 shuffles can explode in complexity very easily, so only accept them
41997 // if the blend mask is the same in the 128-bit subvectors (or can widen to
41998 // v8i32) and the permute can be widened as well.
41999 if (VT == MVT::v16i16) {
42000 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
42001 !canWidenShuffleElements(NewBlendMask))
42002 return SDValue();
42003 if (!canWidenShuffleElements(NewPermuteMask))
42004 return SDValue();
42005 }
42006
42007 // Don't introduce lane-crossing permutes without AVX2, unless it can be
42008 // widened to a lane permute (vperm2f128).
42009 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
42011 NewPermuteMask) &&
42012 !canScaleShuffleElements(NewPermuteMask, 2))
42013 return SDValue();
42014
42015 SDValue NewBlend =
42016 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42017 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42018 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42019 NewPermuteMask);
42020}
42021
42022// TODO - move this to TLI like isBinOp?
42023static bool isUnaryOp(unsigned Opcode) {
42024 switch (Opcode) {
42025 case ISD::CTLZ:
42026 case ISD::CTTZ:
42027 case ISD::CTPOP:
42028 return true;
42029 }
42030 return false;
42031}
42032
42033// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42034// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42035static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42036 const SDLoc &DL) {
42037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42038 EVT ShuffleVT = N.getValueType();
42039 unsigned Opc = N.getOpcode();
42040
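// Pushing the shuffle towards the operands only pays off if at least one
// operand can absorb it cheaply (e.g. an all-zeros/all-ones constant, a
// splat, or another shuffle the combiner will fold); the
// IsMergeableWithShuffle lambda below encodes that heuristic.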
42041 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42042 // AllZeros/AllOnes constants are freely shuffled and will peek through
42043 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42044 // merge with target shuffles if they have one use so shuffle combining is
42045 // likely to kick in. Shuffles of splats are expected to be removed.
42046 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42047 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42051 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42052 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42053 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42054 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42055 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42056 };
42057 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42058 // Ensure we only shuffle whole vector src elements, unless it's a logical
42059 // binop where we can more aggressively move shuffles from dst to src.
42060 return isLogicOp(BinOp) ||
42061 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42062 };
42063
42064 switch (Opc) {
42065 // Unary and Unary+Permute Shuffles.
42066 case X86ISD::PSHUFB: {
42067 // Don't merge PSHUFB if it contains zero'd elements.
42068 SmallVector<int> Mask;
42069 SmallVector<SDValue> Ops;
42070 if (!getTargetShuffleMask(N, false, Ops, Mask))
42071 break;
42072 [[fallthrough]];
42073 }
42074 case X86ISD::VBROADCAST:
42075 case X86ISD::MOVDDUP:
42076 case X86ISD::PSHUFD:
42077 case X86ISD::PSHUFHW:
42078 case X86ISD::PSHUFLW:
42079 case X86ISD::VPERMV:
42080 case X86ISD::VPERMI:
42081 case X86ISD::VPERMILPI: {
42082 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42083 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42084 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42085 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42086 unsigned SrcOpcode = N0.getOpcode();
42087 EVT OpVT = N0.getValueType();
42088 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42089 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42090 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42091 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42092 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42093 IsMergeableWithShuffle(Op01, FoldShuf)) {
42094 SDValue LHS, RHS;
42095 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42096 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42097 if (Opc == X86ISD::VPERMV) {
42098 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42099 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42100 } else if (N.getNumOperands() == 2) {
42101 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42102 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42103 } else {
42104 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42105 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42106 }
42107 return DAG.getBitcast(ShuffleVT,
42108 DAG.getNode(SrcOpcode, DL, OpVT,
42109 DAG.getBitcast(OpVT, LHS),
42110 DAG.getBitcast(OpVT, RHS)));
42111 }
42112 }
42113 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42114 OpVT.getScalarSizeInBits() ==
42115 N0.getOperand(0).getScalarValueSizeInBits()) {
42116 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42117 if (Opc == X86ISD::VPERMV)
42118 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42119 else if (N.getNumOperands() == 2)
42120 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42121 else
42122 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42123 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42124 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42125 }
42126 }
42127 break;
42128 }
42129 // Binary and Binary+Permute Shuffles.
42130 case X86ISD::INSERTPS: {
42131 // Don't merge INSERTPS if it contains zero'd elements.
42132 unsigned InsertPSMask = N.getConstantOperandVal(2);
42133 unsigned ZeroMask = InsertPSMask & 0xF;
42134 if (ZeroMask != 0)
42135 break;
42136 [[fallthrough]];
42137 }
42138 case X86ISD::MOVSD:
42139 case X86ISD::MOVSS:
42140 case X86ISD::BLENDI:
42141 case X86ISD::SHUFP:
42142 case X86ISD::UNPCKH:
42143 case X86ISD::UNPCKL: {
42144 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42145 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42146 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42147 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42148 unsigned SrcOpcode = N0.getOpcode();
42149 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42150 N0.getValueType() == N1.getValueType() &&
42151 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42152 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42153 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42154 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42155 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42156 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42157 // Ensure the total number of shuffles doesn't increase by folding this
42158 // shuffle through to the source ops.
42159 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42160 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42161 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42162 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42163 SDValue LHS, RHS;
42164 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42165 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42166 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42167 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42168 if (N.getNumOperands() == 3) {
42169 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42170 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42171 } else {
42172 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42173 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42174 }
42175 EVT OpVT = N0.getValueType();
42176 return DAG.getBitcast(ShuffleVT,
42177 DAG.getNode(SrcOpcode, DL, OpVT,
42178 DAG.getBitcast(OpVT, LHS),
42179 DAG.getBitcast(OpVT, RHS)));
42180 }
42181 }
42182 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42183 N0.getValueType() == N1.getValueType() &&
42184 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42185 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42186 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42187 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42188 SDValue Res;
42189 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42190 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42191 if (N.getNumOperands() == 3) {
42192 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42193 } else {
42194 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42195 }
42196 EVT OpVT = N0.getValueType();
42197 return DAG.getBitcast(
42198 ShuffleVT,
42199 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42200 }
42201 // TODO: We can generalize this for other shuffles/conversions.
42202 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42203 N1.getOpcode() == SrcOpcode &&
42204 N0.getValueType() == N1.getValueType() &&
42205 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42206 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42207 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42208 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42209 EVT OpSrcVT = N0.getOperand(0).getValueType();
42210 EVT OpDstVT = N0.getValueType();
42211 SDValue Res =
42212 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42213 return DAG.getBitcast(ShuffleVT,
42214 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42215 }
42216 }
42217 break;
42218 }
42219 }
42220 return SDValue();
42221}
42222
42223/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42224static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42225 SelectionDAG &DAG,
42226 const SDLoc &DL) {
42227 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42228
42229 MVT VT = V.getSimpleValueType();
42230 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42231 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42232 unsigned SrcOpc0 = Src0.getOpcode();
42233 unsigned SrcOpc1 = Src1.getOpcode();
42234 EVT SrcVT0 = Src0.getValueType();
42235 EVT SrcVT1 = Src1.getValueType();
42236
42237 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42238 return SDValue();
42239
42240 switch (SrcOpc0) {
42241 case X86ISD::MOVDDUP: {
42242 SDValue LHS = Src0.getOperand(0);
42243 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42244 SDValue Res =
42245 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42246 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42247 return DAG.getBitcast(VT, Res);
42248 }
42249 case X86ISD::VPERMILPI:
42250 // TODO: Handle v4f64 permutes with different low/high lane masks.
42251 if (SrcVT0 == MVT::v4f64) {
42252 uint64_t Mask = Src0.getConstantOperandVal(1);
42253 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42254 break;
42255 }
42256 [[fallthrough]];
42257 case X86ISD::VSHLI:
42258 case X86ISD::VSRLI:
42259 case X86ISD::VSRAI:
42260 case X86ISD::PSHUFD:
42261 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42262 SDValue LHS = Src0.getOperand(0);
42263 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42264 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42265 V.getOperand(2));
42266 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42267 return DAG.getBitcast(VT, Res);
42268 }
42269 break;
42270 }
42271
42272 return SDValue();
42273}
42274
42275/// Try to combine x86 target specific shuffles.
42276static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42277 SelectionDAG &DAG,
42278 TargetLowering::DAGCombinerInfo &DCI,
42279 const X86Subtarget &Subtarget) {
42280 using namespace SDPatternMatch;
42281
42282 MVT VT = N.getSimpleValueType();
42283 unsigned NumElts = VT.getVectorNumElements();
42284 SmallVector<int, 4> Mask;
42285 unsigned Opcode = N.getOpcode();
42286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42287
42288 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42289 return R;
42290
42291 // Handle specific target shuffles.
42292 switch (Opcode) {
42293 case X86ISD::MOVDDUP: {
42294 SDValue Src = N.getOperand(0);
42295 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42296 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42297 ISD::isNormalLoad(Src.getNode())) {
42298 LoadSDNode *LN = cast<LoadSDNode>(Src);
42299 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42300 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42301 DCI.CombineTo(N.getNode(), Movddup);
42302 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42303 DCI.recursivelyDeleteUnusedNodes(LN);
42304 return N; // Return N so it doesn't get rechecked!
42305 }
42306 }
42307
42308 return SDValue();
42309 }
42310 case X86ISD::VBROADCAST: {
42311 SDValue Src = N.getOperand(0);
42312 SDValue BC = peekThroughBitcasts(Src);
42313 EVT SrcVT = Src.getValueType();
42314 EVT BCVT = BC.getValueType();
42315
42316 // If broadcasting from another shuffle, attempt to simplify it.
42317 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42318 if (isTargetShuffle(BC.getOpcode()) &&
42319 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42320 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42321 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42322 SM_SentinelUndef);
42323 for (unsigned i = 0; i != Scale; ++i)
42324 DemandedMask[i] = i;
42325 if (SDValue Res = combineX86ShufflesRecursively(
42326 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42327 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42328 /*AllowVariableCrossLaneMask=*/true,
42329 /*AllowVariablePerLaneMask=*/true,
42330 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42331 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42332 DAG.getBitcast(SrcVT, Res));
42333 }
42334
42335 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42336 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42337 if (Src.getOpcode() == ISD::BITCAST &&
42338 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42339 TLI.isTypeLegal(BCVT) &&
42341 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42342 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42344 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42345 }
42346
42347 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42348 // If we're re-broadcasting a smaller type then broadcast with that type and
42349 // bitcast.
42350 // TODO: Do this for any splat?
42351 if (Src.getOpcode() == ISD::BITCAST &&
42352 (BC.getOpcode() == X86ISD::VBROADCAST ||
42353 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42354 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42355 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42356 MVT NewVT =
42358 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42359 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42360 }
42361
42362 // Reduce broadcast source vector to lowest 128-bits.
42363 if (SrcVT.getSizeInBits() > 128)
42364 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42365 extract128BitVector(Src, 0, DAG, DL));
42366
42367 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42368 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42369 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42370 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42371
42372 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42373 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42374 isNullConstant(Src.getOperand(1)) &&
42375 Src.getValueType() ==
42376 Src.getOperand(0).getValueType().getScalarType() &&
42377 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42378 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42379
42380 // Share broadcast with the longest vector and extract low subvector (free).
42381 // Ensure the same SDValue from the SDNode use is being used.
42382 for (SDNode *User : Src->users())
42383 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42384 Src == User->getOperand(0) &&
42385 User->getValueSizeInBits(0).getFixedValue() >
42386 VT.getFixedSizeInBits()) {
42387 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42388 VT.getSizeInBits());
42389 }
42390
42391 // vbroadcast(scalarload X) -> vbroadcast_load X
42392 // For float loads, extract other uses of the scalar from the broadcast.
42393 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42394 ISD::isNormalLoad(Src.getNode())) {
42395 LoadSDNode *LN = cast<LoadSDNode>(Src);
42396 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42397 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42398 SDValue BcastLd =
42399 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42400 LN->getMemoryVT(), LN->getMemOperand());
42401 // If the load value is used only by N, replace it via CombineTo N.
42402 bool NoReplaceExtract = Src.hasOneUse();
42403 DCI.CombineTo(N.getNode(), BcastLd);
42404 if (NoReplaceExtract) {
42405 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42406 DCI.recursivelyDeleteUnusedNodes(LN);
42407 } else {
42408 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42409 DAG.getVectorIdxConstant(0, DL));
42410 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42411 }
42412 return N; // Return N so it doesn't get rechecked!
42413 }
42414
42415 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42416 // i16. So shrink it ourselves if we can make a broadcast_load.
42417 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42418 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42419 assert(Subtarget.hasAVX2() && "Expected AVX2");
42420 SDValue TruncIn = Src.getOperand(0);
42421
42422 // If this is a truncate of a non-extending load, we can just narrow it to
42423 // use a broadcast_load.
42424 if (ISD::isNormalLoad(TruncIn.getNode())) {
42425 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42426 // Unless it's volatile or atomic.
42427 if (LN->isSimple()) {
42428 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42429 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42430 SDValue BcastLd = DAG.getMemIntrinsicNode(
42431 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42432 LN->getPointerInfo(), LN->getBaseAlign(),
42433 LN->getMemOperand()->getFlags());
42434 DCI.CombineTo(N.getNode(), BcastLd);
42435 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42436 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42437 return N; // Return N so it doesn't get rechecked!
42438 }
42439 }
42440
42441 // If this is a truncate of an i16 extload, we can directly replace it.
42442 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42443 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42444 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42445 if (LN->getMemoryVT().getSizeInBits() == 16) {
42446 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42447 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42448 SDValue BcastLd =
42449 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42450 LN->getMemoryVT(), LN->getMemOperand());
42451 DCI.CombineTo(N.getNode(), BcastLd);
42452 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42453 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42454 return N; // Return N so it doesn't get rechecked!
42455 }
42456 }
42457
42458 // If this is a truncate of a load that has been shifted right, we can
42459 // offset the pointer and use a narrower load.
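// e.g. (i16 (trunc (srl (i64 load p), 32))) only needs the 2 bytes at p+4 on
// little-endian x86, so it can become a broadcast_load from the offset
// pointer (Offset = ShiftAmt / 8 below).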
42460 if (TruncIn.getOpcode() == ISD::SRL &&
42461 TruncIn.getOperand(0).hasOneUse() &&
42462 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42463 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42464 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42465 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42466 // Make sure the shift amount and the load size are divisible by 16.
42467 // Don't do this if the load is volatile or atomic.
42468 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42469 LN->isSimple()) {
42470 unsigned Offset = ShiftAmt / 8;
42471 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42474 SDValue Ops[] = { LN->getChain(), Ptr };
42475 SDValue BcastLd = DAG.getMemIntrinsicNode(
42476 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42478 LN->getMemOperand()->getFlags());
42479 DCI.CombineTo(N.getNode(), BcastLd);
42480 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42481 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42482 return N; // Return N so it doesn't get rechecked!
42483 }
42484 }
42485 }
42486
42487 // vbroadcast(vzload X) -> vbroadcast_load X
42488 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42490 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42491 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42492 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42493 SDValue BcastLd =
42494 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42495 LN->getMemoryVT(), LN->getMemOperand());
42496 DCI.CombineTo(N.getNode(), BcastLd);
42497 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42498 DCI.recursivelyDeleteUnusedNodes(LN);
42499 return N; // Return N so it doesn't get rechecked!
42500 }
42501 }
42502
42503 // vbroadcast(vector load X) -> vbroadcast_load
42504 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42505 LoadSDNode *LN = cast<LoadSDNode>(Src);
42506 // Unless the load is volatile or atomic.
42507 if (LN->isSimple()) {
42508 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42509 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42510 SDValue BcastLd = DAG.getMemIntrinsicNode(
42512 LN->getPointerInfo(), LN->getBaseAlign(),
42513 LN->getMemOperand()->getFlags());
42514 DCI.CombineTo(N.getNode(), BcastLd);
42515 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42516 DCI.recursivelyDeleteUnusedNodes(LN);
42517 return N; // Return N so it doesn't get rechecked!
42518 }
42519 }
42520
42521 return SDValue();
42522 }
42523 case X86ISD::VZEXT_MOVL: {
42524 SDValue N0 = N.getOperand(0);
42525
42526 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42527 // Zeroing out the upper elements means we're just shifting a zero value.
42528 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42529 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42530 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42531 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42532 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42533 if (N0.hasOneUse())
42534 return DAG.getNode(
42535 N0.getOpcode(), DL, VT,
42536 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42537 N0.getOperand(1));
42538 }
42539
42540 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42541 // the load is volatile.
42542 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42543 auto *LN = cast<LoadSDNode>(N0);
42544 if (SDValue VZLoad =
42545 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42546 DCI.CombineTo(N.getNode(), VZLoad);
42547 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42548 DCI.recursivelyDeleteUnusedNodes(LN);
42549 return N;
42550 }
42551 }
42552
42553 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42554 // and can just use a VZEXT_LOAD.
42555 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42556 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42557 auto *LN = cast<MemSDNode>(N0);
42558 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42559 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42560 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42561 SDValue VZLoad =
42562 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42563 LN->getMemoryVT(), LN->getMemOperand());
42564 DCI.CombineTo(N.getNode(), VZLoad);
42565 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42566 DCI.recursivelyDeleteUnusedNodes(LN);
42567 return N;
42568 }
42569 }
42570
42571 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42572 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42573 // if the upper bits of the i64 are zero.
42574 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42575 N0.getOperand(0).hasOneUse() &&
42576 N0.getOperand(0).getValueType() == MVT::i64) {
42577 SDValue In = N0.getOperand(0);
42578 APInt Mask = APInt::getHighBitsSet(64, 32);
42579 if (DAG.MaskedValueIsZero(In, Mask)) {
42580 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42581 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42582 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42583 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42584 return DAG.getBitcast(VT, Movl);
42585 }
42586 }
42587
42588 // Load a scalar integer constant directly into an XMM register instead of
42589 // transferring an immediate value from a GPR.
42590 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42591 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42592 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42593 // Create a vector constant - scalar constant followed by zeros.
42594 EVT ScalarVT = N0.getOperand(0).getValueType();
42595 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42596 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42597 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42598 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42599
42600 // Load the vector constant from constant pool.
42601 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42602 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42603 MachinePointerInfo MPI =
42604 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42605 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42606 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42607 MachineMemOperand::MOLoad);
42608 }
42609 }
42610
42611 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42612 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42613 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42614 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42615 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42617
42618 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42619 isNullConstant(V.getOperand(2))) {
42620 SDValue In = V.getOperand(1);
42621 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42622 In.getValueSizeInBits() /
42623 VT.getScalarSizeInBits());
42624 In = DAG.getBitcast(SubVT, In);
42625 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42626 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42627 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42628 V.getOperand(2));
42629 }
42630 }
42631
42632 return SDValue();
42633 }
42634 case X86ISD::BLENDI: {
42635 SDValue N0 = N.getOperand(0);
42636 SDValue N1 = N.getOperand(1);
42637 unsigned EltBits = VT.getScalarSizeInBits();
42638
42639 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42640 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42641 // TODO: Handle MVT::v16i16 repeated blend mask.
42642 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42643 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42644 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42645 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42646 unsigned NewSize = SrcVT.getVectorNumElements();
42647 APInt BlendMask = getBLENDIBlendMask(N);
42648 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42649 return DAG.getBitcast(
42650 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42651 N1.getOperand(0),
42652 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42653 DL, MVT::i8)));
42654 }
42655 }
42656 // Share PSHUFB masks:
42657 // blend(pshufb(x,m1),pshufb(y,m2))
42658 // --> m3 = blend(m1,m2)
42659 // blend(pshufb(x,m3),pshufb(y,m3))
42660 if (N0.hasOneUse() && N1.hasOneUse()) {
42661 SmallVector<int> Mask, ByteMask;
42662 SmallVector<SDValue> Ops;
42663 SDValue LHS = peekThroughOneUseBitcasts(N0);
42664 SDValue RHS = peekThroughOneUseBitcasts(N1);
42665 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42666 RHS.getOpcode() == X86ISD::PSHUFB &&
42667 LHS.getOperand(1) != RHS.getOperand(1) &&
42668 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42669 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42670 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42671 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42672 "BLENDI decode mismatch");
42673 MVT ShufVT = LHS.getSimpleValueType();
42674 SDValue MaskLHS = LHS.getOperand(1);
42675 SDValue MaskRHS = RHS.getOperand(1);
42676 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42678 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42679 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42680 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42681 LHS.getOperand(0), NewMask);
42682 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42683 RHS.getOperand(0), NewMask);
42684 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42685 DAG.getBitcast(VT, NewLHS),
42686 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42687 }
42688 }
42689 }
42690 }
42691 return SDValue();
42692 }
42693 case X86ISD::SHUFP: {
42694 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42695 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42696 // TODO: Support types other than v4f32.
42697 if (VT == MVT::v4f32) {
42698 bool Updated = false;
42699 SmallVector<int> Mask;
42701 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42702 for (int i = 0; i != 2; ++i) {
42703 SmallVector<SDValue> SubOps;
42704 SmallVector<int> SubMask, SubScaledMask;
42706 // TODO: Scaling might be easier if we specify the demanded elts.
42707 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42708 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42709 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42710 int Ofs = i * 2;
42711 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42712 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42713 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42714 Updated = true;
42715 }
42716 }
42717 }
42718 if (Updated) {
42719 for (int &M : Mask)
42720 M %= 4;
42721 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42722 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42723 }
42724 }
42725 return SDValue();
42726 }
42727 case X86ISD::VPERMI: {
42728 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42729 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42730 SDValue N0 = N.getOperand(0);
42731 SDValue N1 = N.getOperand(1);
42732 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42733 if (N0.getOpcode() == ISD::BITCAST &&
42734 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42735 SDValue Src = N0.getOperand(0);
42736 EVT SrcVT = Src.getValueType();
42737 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42738 return DAG.getBitcast(VT, Res);
42739 }
42740 return SDValue();
42741 }
42742 case X86ISD::SHUF128: {
42743 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42744 // see if we can peek through and access the subvector directly.
42745 if (VT.is512BitVector()) {
42746 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42747 // the upper subvector is used.
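// Each 2-bit index selects one of the four 128-bit lanes of its source, and
// the high bit of an index picks the upper 256 bits, so (Mask & 0x0A) == 0x0A
// (resp. (Mask & 0xA0) == 0xA0) means the LHS (resp. RHS) indices only ever
// reference the upper subvector.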
42748 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42749 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42750 uint64_t Mask = N->getConstantOperandVal(2);
42751 SmallVector<SDValue> LHSOps, RHSOps;
42752 SDValue NewLHS, NewRHS;
42753 if ((Mask & 0x0A) == 0x0A &&
42754 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42755 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42756 Mask &= ~0x0A;
42757 }
42758 if ((Mask & 0xA0) == 0xA0 &&
42759 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42760 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42761 Mask &= ~0xA0;
42762 }
42763 if (NewLHS || NewRHS)
42764 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42765 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42766 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42767 DAG.getTargetConstant(Mask, DL, MVT::i8));
42768 }
42769 return SDValue();
42770 }
42771 case X86ISD::VPERM2X128: {
42772 SDValue LHS = N->getOperand(0);
42773 SDValue RHS = N->getOperand(1);
42774 unsigned Imm = N.getConstantOperandVal(2) & 255;
42775
42776 // Canonicalize unary/repeated operands to LHS.
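// Each nibble of the immediate selects the source 128-bit lane for one half
// of the result, with bit 1 of the nibble choosing between the two operands;
// swapping the operands therefore flips bit 1 in both nibbles (XOR 0x22), and
// clearing those bits (& ~0x22) refers everything to the first operand.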
42777 if (LHS.isUndef() && !RHS.isUndef())
42778 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42779 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42780 if (LHS == RHS)
42781 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42782 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42783
42784 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42785 if (LHS.getOpcode() == ISD::BITCAST &&
42786 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42787 EVT SrcVT = LHS.getOperand(0).getValueType();
42788 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42789 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42790 DAG.getBitcast(SrcVT, LHS),
42791 DAG.getBitcast(SrcVT, RHS),
42792 N->getOperand(2)));
42793 }
42794 }
42795
42796 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42797 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42798 return Res;
42799
42800 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42801 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42802 auto FindSubVector128 = [&](unsigned Idx) {
42803 if (Idx > 3)
42804 return SDValue();
42805 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42806 SmallVector<SDValue> SubOps;
42807 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42808 return SubOps[Idx & 1];
42809 unsigned NumElts = Src.getValueType().getVectorNumElements();
42810 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42811 Src.getOperand(1).getValueSizeInBits() == 128 &&
42812 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42813 return Src.getOperand(1);
42814 }
42815 return SDValue();
42816 };
42817 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42818 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42819 MVT SubVT = VT.getHalfNumVectorElementsVT();
42820 SubLo = DAG.getBitcast(SubVT, SubLo);
42821 SubHi = DAG.getBitcast(SubVT, SubHi);
42822 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42823 }
42824 }
42825
42826 // Attempt to match VBROADCAST*128 subvector broadcast load.
42827 if (RHS.isUndef()) {
42828 SmallVector<int, 4> Mask;
42829 DecodeVPERM2X128Mask(4, Imm, Mask);
42830 if (isUndefOrInRange(Mask, 0, 4)) {
42831 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42832 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42833 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42834 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42835 MVT MemVT = VT.getHalfNumVectorElementsVT();
42836 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42838 cast<LoadSDNode>(LHS), Ofs, DAG);
42839 }
42840 }
42841 }
42842
42843 return SDValue();
42844 }
42845 case X86ISD::PSHUFD:
42846 case X86ISD::PSHUFLW:
42847 case X86ISD::PSHUFHW: {
42848 SDValue N0 = N.getOperand(0);
42849 SDValue N1 = N.getOperand(1);
42850 if (N0->hasOneUse()) {
42852 switch (V.getOpcode()) {
42853 case X86ISD::VSHL:
42854 case X86ISD::VSRL:
42855 case X86ISD::VSRA:
42856 case X86ISD::VSHLI:
42857 case X86ISD::VSRLI:
42858 case X86ISD::VSRAI:
42859 case X86ISD::VROTLI:
42860 case X86ISD::VROTRI: {
42861 MVT InnerVT = V.getSimpleValueType();
42862 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42863 SDValue Res = DAG.getNode(Opcode, DL, VT,
42864 DAG.getBitcast(VT, V.getOperand(0)), N1);
42865 Res = DAG.getBitcast(InnerVT, Res);
42866 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42867 return DAG.getBitcast(VT, Res);
42868 }
42869 break;
42870 }
42871 }
42872 }
42873
42874 Mask = getPSHUFShuffleMask(N);
42875 assert(Mask.size() == 4);
42876 break;
42877 }
42878 case X86ISD::MOVSD:
42879 case X86ISD::MOVSH:
42880 case X86ISD::MOVSS: {
42881 SDValue N0 = N.getOperand(0);
42882 SDValue N1 = N.getOperand(1);
42883
42884 // Canonicalize scalar FPOps:
42885 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42886 // If commutable, allow OP(N1[0], N0[0]).
42887 unsigned Opcode1 = N1.getOpcode();
42888 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42889 Opcode1 == ISD::FDIV) {
42890 SDValue N10 = N1.getOperand(0);
42891 SDValue N11 = N1.getOperand(1);
42892 if (N10 == N0 ||
42893 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42894 if (N10 != N0)
42895 std::swap(N10, N11);
42896 MVT SVT = VT.getVectorElementType();
42897 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42898 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42899 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42900 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42901 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42902 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42903 }
42904 }
42905
42906 return SDValue();
42907 }
42908 case X86ISD::INSERTPS: {
42909 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42910 SDValue Op0 = N.getOperand(0);
42911 SDValue Op1 = N.getOperand(1);
42912 unsigned InsertPSMask = N.getConstantOperandVal(2);
42913 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42914 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42915 unsigned ZeroMask = InsertPSMask & 0xF;
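// Reminder of the INSERTPS immediate layout: bits [7:6] pick the source
// element (CountS), bits [5:4] pick the destination element (CountD), and
// bits [3:0] form the zero mask (ZMask), matching the fields decoded above.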
42916
42917 // If we zero out all elements from Op0 then we don't need to reference it.
42918 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42919 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42920 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42921
42922 // If we zero out the element from Op1 then we don't need to reference it.
42923 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42924 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42925 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42926
42927 // Attempt to merge insertps Op1 with an inner target shuffle node.
42928 SmallVector<int, 8> TargetMask1;
42930 APInt KnownUndef1, KnownZero1;
42931 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42932 KnownZero1)) {
42933 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42934 // Zero/UNDEF insertion - zero out element and remove dependency.
42935 InsertPSMask |= (1u << DstIdx);
42936 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42937 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42938 }
42939 // Update insertps mask srcidx and reference the source input directly.
42940 int M = TargetMask1[SrcIdx];
42941 assert(0 <= M && M < 8 && "Shuffle index out of range");
42942 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42943 Op1 = Ops1[M < 4 ? 0 : 1];
42944 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42945 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42946 }
42947
42948 // Attempt to merge insertps Op0 with an inner target shuffle node.
42949 SmallVector<int, 8> TargetMask0;
42951 APInt KnownUndef0, KnownZero0;
42952 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42953 KnownZero0)) {
42954 bool Updated = false;
42955 bool UseInput00 = false;
42956 bool UseInput01 = false;
42957 for (int i = 0; i != 4; ++i) {
42958 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42959 // No change if element is already zero or the inserted element.
42960 continue;
42961 }
42962
42963 if (KnownUndef0[i] || KnownZero0[i]) {
42964 // If the target mask is undef/zero then we must zero the element.
42965 InsertPSMask |= (1u << i);
42966 Updated = true;
42967 continue;
42968 }
42969
42970 // The input vector element must be inline.
42971 int M = TargetMask0[i];
42972 if (M != i && M != (i + 4))
42973 return SDValue();
42974
42975 // Determine which inputs of the target shuffle we're using.
42976 UseInput00 |= (0 <= M && M < 4);
42977 UseInput01 |= (4 <= M);
42978 }
42979
42980 // If we're not using both inputs of the target shuffle then use the
42981 // referenced input directly.
42982 if (UseInput00 && !UseInput01) {
42983 Updated = true;
42984 Op0 = Ops0[0];
42985 } else if (!UseInput00 && UseInput01) {
42986 Updated = true;
42987 Op0 = Ops0[1];
42988 }
42989
42990 if (Updated)
42991 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42992 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42993 }
42994
42995 // If we're inserting an element from a vbroadcast load, fold the
42996 // load into the X86insertps instruction. We need to convert the scalar
42997 // load to a vector and clear the source lane of the INSERTPS control.
42998 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42999 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
43000 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
43001 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
43002 MemIntr->getBasePtr(),
43003 MemIntr->getMemOperand());
43004 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
43005 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
43006 Load),
43007 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
43008 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
43009 return Insert;
43010 }
43011 }
43012
43013 return SDValue();
43014 }
43015 case X86ISD::VPERMV: {
43016 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43018 SmallVector<SDValue, 2> SrcOps, SubOps;
43019 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43020 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43021 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43022 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43023 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43024 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43025 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43026 "Unexpected split ops");
43027 // Bail if we were permuting a widened vector.
43028 if (SubOps[1].isUndef() &&
43029 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43030 return SDValue();
43031 // Bail if any subops would have folded into the concat.
43032 if (any_of(SubOps, isShuffleFoldableLoad))
43033 return SDValue();
43034 // Concat 4x128 back to 2x256.
43035 if (SubOps.size() == 4) {
43036 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43037 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43038 }
43039 // Convert mask to 2 operand shuffle.
43040 int HalfElts = NumElts / 2;
43041 for (int &M : Mask)
43042 M += M >= HalfElts ? HalfElts : 0;
43043 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43044 VT.getSizeInBits());
43045 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43046 VT.getSizeInBits());
43047 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43048 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43049 }
43050 return SDValue();
43051 }
43052 case X86ISD::VPERMV3: {
43053 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43054 bool CanConcat = VT.is128BitVector() ||
43055 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43058 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43059 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43060 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43061 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43062 // Canonicalize to VPERMV if both sources are the same.
43063 if (V1 == V2) {
43064 for (int &M : Mask)
43065 M = (M < 0 ? M : (M & (NumElts - 1)));
43066 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43067 DAG.getUNDEF(VT), Subtarget, DAG);
43068 }
43069 // If sources are half width, then concat and use VPERMV with adjusted
43070 // mask.
43071 SDValue Ops[2];
43072 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43073 if (sd_match(V1,
43075 sd_match(V2,
43077 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43078 if (SDValue ConcatSrc =
43079 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43080 for (int &M : Mask)
43081 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43082 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43083 DAG.getUNDEF(VT), Subtarget, DAG);
43084 }
43085 }
43086 // Commute foldable source to the RHS.
43087 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43088 !isShuffleFoldableLoad(N.getOperand(2))) {
43089 ShuffleVectorSDNode::commuteMask(Mask);
43090 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43091 N.getOperand(0), Subtarget, DAG);
43092 }
43093 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43094 // freely concatenated, with a commuted shuffle mask.
43095 if (CanConcat) {
43096 if (SDValue ConcatSrc = combineConcatVectorOps(
43097 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43098 Subtarget)) {
43099 ShuffleVectorSDNode::commuteMask(Mask);
43100 Mask.append(NumElts, SM_SentinelUndef);
43101 SDValue Perm =
43102 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43103 DAG.getUNDEF(WideVT), Subtarget, DAG);
43104 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43105 DAG.getVectorIdxConstant(0, DL));
43106 }
43107 }
43108 }
43109 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43110 // freely concatenated.
43111 if (CanConcat) {
43112 if (SDValue ConcatSrc = combineConcatVectorOps(
43113 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43114 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43115 DL, WideVT.getSizeInBits());
43116 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43117 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43118 DAG.getVectorIdxConstant(0, DL));
43119 }
43120 }
43121 return SDValue();
43122 }
43123 default:
43124 return SDValue();
43125 }
43126
43127 // Nuke no-op shuffles that show up after combining.
43128 if (isNoopShuffleMask(Mask))
43129 return N.getOperand(0);
43130
43131 // Look for simplifications involving one or two shuffle instructions.
43132 SDValue V = N.getOperand(0);
43133 switch (N.getOpcode()) {
43134 default:
43135 break;
43136 case X86ISD::PSHUFLW:
43137 case X86ISD::PSHUFHW:
43138 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43139
43140 // See if this reduces to a PSHUFD which is no more expensive and can
43141 // combine with more operations. Note that it has to at least flip the
43142 // dwords as otherwise it would have been removed as a no-op.
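// e.g. pshuflw<2,3,0,1>(x) just swaps the two low dwords of x, which
// pshufd<1,0,2,3>(x) expresses equally cheaply while combining more readily
// (see the DMask construction below).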
43143 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43144 int DMask[] = {0, 1, 2, 3};
43145 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43146 DMask[DOffset + 0] = DOffset + 1;
43147 DMask[DOffset + 1] = DOffset + 0;
43148 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43149 V = DAG.getBitcast(DVT, V);
43150 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43151 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43152 return DAG.getBitcast(VT, V);
43153 }
43154
43155 // Look for shuffle patterns which can be implemented as a single unpack.
43156 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43157 // only works when we have a PSHUFD followed by two half-shuffles.
43158 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43159 (V.getOpcode() == X86ISD::PSHUFLW ||
43160 V.getOpcode() == X86ISD::PSHUFHW) &&
43161 V.getOpcode() != N.getOpcode() &&
43162 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43163 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43164 if (D.getOpcode() == X86ISD::PSHUFD) {
43167 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43168 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43169 int WordMask[8];
43170 for (int i = 0; i < 4; ++i) {
43171 WordMask[i + NOffset] = Mask[i] + NOffset;
43172 WordMask[i + VOffset] = VMask[i] + VOffset;
43173 }
43174 // Map the word mask through the DWord mask.
43175 int MappedMask[8];
43176 for (int i = 0; i < 8; ++i)
43177 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43178 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43179 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43180 // We can replace all three shuffles with an unpack.
43181 V = DAG.getBitcast(VT, D.getOperand(0));
43182 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43184 DL, VT, V, V);
43185 }
43186 }
43187 }
43188
43189 break;
43190
43191 case X86ISD::PSHUFD:
43192 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43193 return NewN;
43194
43195 break;
43196 }
43197
43198 return SDValue();
43199}
43200
43201/// Checks if the shuffle mask takes subsequent elements
43202/// alternately from two vectors.
43203/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43204static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43205
43206 int ParitySrc[2] = {-1, -1};
43207 unsigned Size = Mask.size();
43208 for (unsigned i = 0; i != Size; ++i) {
43209 int M = Mask[i];
43210 if (M < 0)
43211 continue;
43212
43213 // Make sure we are using the matching element from the input.
43214 if ((M % Size) != i)
43215 return false;
43216
43217 // Make sure we use the same input for all elements of the same parity.
43218 int Src = M / Size;
43219 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43220 return false;
43221 ParitySrc[i % 2] = Src;
43222 }
43223
43224 // Make sure each input is used.
43225 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43226 return false;
43227
43228 Op0Even = ParitySrc[0] == 0;
43229 return true;
43230}
43231
43232/// Returns true iff the shuffle node \p N can be replaced with an
43233/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
43234/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
43235///
43236/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
43237/// shuffle nodes so it is easier to match generically. We also insert dummy
43238/// vector shuffle nodes for the operands which explicitly discard the lanes
43239/// that this operation does not use, so that the rest of the combiner can see
43240/// that they're unused.
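/// As a rough illustration: shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>) takes the
/// subtraction in the even lanes and the addition in the odd lanes, which is
/// exactly what ADDSUBPS computes.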
43241static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43242 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43243 bool &IsSubAdd, bool &HasAllowContract) {
43244
43245 EVT VT = N->getValueType(0);
43246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43247 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43249 return false;
43250
43251 // We only handle target-independent shuffles.
43252 // FIXME: It would be easy and harmless to use the target shuffle mask
43253 // extraction tool to support more.
43254 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43255 return false;
43256
43257 SDValue V1 = N->getOperand(0);
43258 SDValue V2 = N->getOperand(1);
43259
43260 // Make sure we have an FADD and an FSUB.
43261 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43262 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43263 V1.getOpcode() == V2.getOpcode())
43264 return false;
43265
43266 // If there are other uses of these operations we can't fold them.
43267 if (!V1->hasOneUse() || !V2->hasOneUse())
43268 return false;
43269
43270 // Ensure that both operations have the same operands. Note that we can
43271 // commute the FADD operands.
43272 SDValue LHS, RHS;
43273 if (V1.getOpcode() == ISD::FSUB) {
43274 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43275 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43276 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43277 return false;
43278 } else {
43279 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43280 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43281 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43282 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43283 return false;
43284 }
43285
43286 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43287 bool Op0Even;
43288 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43289 return false;
43290
43291 // It's a subadd if the vector in the even parity is an FADD.
43292 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43293 : V2->getOpcode() == ISD::FADD;
43294 HasAllowContract =
43295 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43296
43297 Opnd0 = LHS;
43298 Opnd1 = RHS;
43299 return true;
43300}
43301
43302/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
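/// Illustrative sketch (not from the original source): for v4f32, the shuffle
///   shuffle (fma a, b, c), (X86ISD::FMSUB a, b, c), <0, 5, 2, 7>
/// takes even lanes from the FMA and odd lanes from the FMSUB, which is
/// exactly X86ISD::FMSUBADD(a, b, c); if the even lanes instead came from the
/// FMSUB node, the result would be X86ISD::FMADDSUB(a, b, c).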
43303 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43304 const X86Subtarget &Subtarget,
43305 SelectionDAG &DAG) {
43306 // We only handle target-independent shuffles.
43307 // FIXME: It would be easy and harmless to use the target shuffle mask
43308 // extraction tool to support more.
43309 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43310 return SDValue();
43311
43312 MVT VT = N->getSimpleValueType(0);
43313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43314 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43315 return SDValue();
43316
43317 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43318 SDValue Op0 = N->getOperand(0);
43319 SDValue Op1 = N->getOperand(1);
43320 SDValue FMAdd = Op0, FMSub = Op1;
43321 if (FMSub.getOpcode() != X86ISD::FMSUB)
43322 std::swap(FMAdd, FMSub);
43323
43324 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43325 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43326 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43327 FMAdd.getOperand(2) != FMSub.getOperand(2))
43328 return SDValue();
43329
43330 // Check for correct shuffle mask.
43331 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43332 bool Op0Even;
43333 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43334 return SDValue();
43335
43336 // FMAddSub takes zeroth operand from FMSub node.
43337 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43338 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43339 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43340 FMAdd.getOperand(2));
43341}
43342
43343/// Try to combine a shuffle into a target-specific add-sub or
43344/// mul-add-sub node.
43345 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43346 const X86Subtarget &Subtarget,
43347 SelectionDAG &DAG) {
43348 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43349 return V;
43350
43351 SDValue Opnd0, Opnd1;
43352 bool IsSubAdd;
43353 bool HasAllowContract;
43354 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43355 HasAllowContract))
43356 return SDValue();
43357
43358 MVT VT = N->getSimpleValueType(0);
43359
43360 // Try to generate X86ISD::FMADDSUB node here.
43361 SDValue Opnd2;
43362 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43363 HasAllowContract)) {
43364 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43365 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43366 }
43367
43368 if (IsSubAdd)
43369 return SDValue();
43370
43371 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43372 // the ADDSUB idiom has been successfully recognized. There are no known
43373 // X86 targets with 512-bit ADDSUB instructions!
43374 if (VT.is512BitVector())
43375 return SDValue();
43376
43377 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43378 // the ADDSUB idiom has been successfully recognized. There are no known
43379 // X86 targets with FP16 ADDSUB instructions!
43380 if (VT.getVectorElementType() == MVT::f16)
43381 return SDValue();
43382
43383 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43384}
43385
43386/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43387/// low half of each source vector and does not set any high half elements in
43388/// the destination vector, narrow the shuffle to half its original size.
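/// Illustrative sketch (not from the original source), for v8f32:
///   shuffle %a, %b, <0, 8, 1, 9, undef, undef, undef, undef>
/// only reads the low halves of %a and %b and leaves the high half of the
/// result undef, so it can be rebuilt as a v4f32 shuffle of the two low
/// subvectors with half-mask <0, 4, 1, 5>, concatenated with undef.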
43389 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43390 EVT VT = Shuf->getValueType(0);
43391 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43392 return SDValue();
43393 if (!VT.is256BitVector() && !VT.is512BitVector())
43394 return SDValue();
43395
43396 // See if we can ignore all of the high elements of the shuffle.
43397 ArrayRef<int> Mask = Shuf->getMask();
43398 if (!isUndefUpperHalf(Mask))
43399 return SDValue();
43400
43401 // Check if the shuffle mask accesses only the low half of each input vector
43402 // (half-index output is 0 or 2).
43403 int HalfIdx1, HalfIdx2;
43404 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43405 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43406 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43407 return SDValue();
43408
43409 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43410 // The trick is knowing that all of the insert/extract are actually free
43411 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43412 // of narrow inputs into a narrow output, and that is always cheaper than
43413 // the wide shuffle that we started with.
43414 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43415 Shuf->getOperand(1), HalfMask, HalfIdx1,
43416 HalfIdx2, false, DAG, /*UseConcat*/ true);
43417}
43418
43419 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43420 TargetLowering::DAGCombinerInfo &DCI,
43421 const X86Subtarget &Subtarget) {
43422 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43423 if (SDValue V = narrowShuffle(Shuf, DAG))
43424 return V;
43425
43426 // If we have legalized the vector types, look for blends of FADD and FSUB
43427 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43428 SDLoc dl(N);
43429 EVT VT = N->getValueType(0);
43430 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43431 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43432 if (SDValue AddSub =
43433 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43434 return AddSub;
43435
43436 // Attempt to combine into a vector load/broadcast.
43437 if (SDValue LD = combineToConsecutiveLoads(
43438 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43439 return LD;
43440
43441 if (isTargetShuffle(N->getOpcode())) {
43442 SDValue Op(N, 0);
43443 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43444 return Shuffle;
43445
43446 // Try recursively combining arbitrary sequences of x86 shuffle
43447 // instructions into higher-order shuffles. We do this after combining
43448 // specific PSHUF instruction sequences into their minimal form so that we
43449 // can evaluate how many specialized shuffle instructions are involved in
43450 // a particular chain.
43451 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43452 return Res;
43453
43454 // Simplify source operands based on shuffle mask.
43455 // TODO - merge this into combineX86ShufflesRecursively.
43456 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43457 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43458 return SDValue(N, 0);
43459
43460 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43461 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43462 // Perform this after other shuffle combines to allow inner shuffles to be
43463 // combined away first.
43464 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43465 return BinOp;
43466 }
43467
43468 return SDValue();
43469}
43470
43471// Simplify variable target shuffle masks based on the demanded elements.
43472// TODO: Handle DemandedBits in mask indices as well?
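// Illustrative sketch (not from the original source): for a PSHUFB whose mask
// operand is a constant-pool load, any mask element whose destination lane is
// not demanded can be rewritten to undef, which may let the rebuilt constant
// pool entry (and the shuffle itself) simplify further.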
43473 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43474 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43475 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43476 // If we're demanding all elements don't bother trying to simplify the mask.
43477 unsigned NumElts = DemandedElts.getBitWidth();
43478 if (DemandedElts.isAllOnes())
43479 return false;
43480
43481 SDValue Mask = Op.getOperand(MaskIndex);
43482 if (!Mask.hasOneUse())
43483 return false;
43484
43485 // Attempt to generically simplify the variable shuffle mask.
43486 APInt MaskUndef, MaskZero;
43487 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43488 Depth + 1))
43489 return true;
43490
43491 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43492 // TODO: Support other types from getTargetShuffleMaskIndices?
43493 SDValue BC = peekThroughOneUseBitcasts(Mask);
43494 EVT BCVT = BC.getValueType();
43495 auto *Load = dyn_cast<LoadSDNode>(BC);
43496 if (!Load || !Load->getBasePtr().hasOneUse())
43497 return false;
43498
43499 const Constant *C = getTargetConstantFromNode(Load);
43500 if (!C)
43501 return false;
43502
43503 Type *CTy = C->getType();
43504 if (!CTy->isVectorTy() ||
43505 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43506 return false;
43507
43508 // Handle scaling for i64 elements on 32-bit targets.
43509 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43510 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43511 return false;
43512 unsigned Scale = NumCstElts / NumElts;
43513
43514 // Simplify mask if we have an undemanded element that is not undef.
43515 bool Simplified = false;
43516 SmallVector<Constant *, 32> ConstVecOps;
43517 for (unsigned i = 0; i != NumCstElts; ++i) {
43518 Constant *Elt = C->getAggregateElement(i);
43519 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43520 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43521 Simplified = true;
43522 continue;
43523 }
43524 ConstVecOps.push_back(Elt);
43525 }
43526 if (!Simplified)
43527 return false;
43528
43529 // Generate new constant pool entry + legalize immediately for the load.
43530 SDLoc DL(Op);
43531 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43532 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43533 SDValue NewMask = TLO.DAG.getLoad(
43534 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43535 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43536 Load->getAlign());
43537 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43538}
43539
43540 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43541 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43542 TargetLoweringOpt &TLO, unsigned Depth) const {
43543 int NumElts = DemandedElts.getBitWidth();
43544 unsigned Opc = Op.getOpcode();
43545 EVT VT = Op.getValueType();
43546
43547 // Handle special case opcodes.
43548 switch (Opc) {
43549 case X86ISD::PMULDQ:
43550 case X86ISD::PMULUDQ: {
43551 APInt LHSUndef, LHSZero;
43552 APInt RHSUndef, RHSZero;
43553 SDValue LHS = Op.getOperand(0);
43554 SDValue RHS = Op.getOperand(1);
43555 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43556 Depth + 1))
43557 return true;
43558 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43559 Depth + 1))
43560 return true;
43561 // Multiply by zero.
43562 KnownZero = LHSZero | RHSZero;
43563 break;
43564 }
43565 case X86ISD::VPMADDUBSW:
43566 case X86ISD::VPMADDWD: {
43567 APInt LHSUndef, LHSZero;
43568 APInt RHSUndef, RHSZero;
43569 SDValue LHS = Op.getOperand(0);
43570 SDValue RHS = Op.getOperand(1);
43571 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43572
43573 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43574 Depth + 1))
43575 return true;
43576 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43577 Depth + 1))
43578 return true;
43579
43580 // TODO: Multiply by zero.
43581
43582 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43583 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43584 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43585 Depth + 1))
43586 return true;
43587 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43588 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43589 Depth + 1))
43590 return true;
43591 break;
43592 }
43593 case X86ISD::PSADBW: {
43594 SDValue LHS = Op.getOperand(0);
43595 SDValue RHS = Op.getOperand(1);
43596 assert(VT.getScalarType() == MVT::i64 &&
43597 LHS.getValueType() == RHS.getValueType() &&
43598 LHS.getValueType().getScalarType() == MVT::i8 &&
43599 "Unexpected PSADBW types");
43600
43601 // Aggressively peek through ops to get at the demanded elts.
43602 if (!DemandedElts.isAllOnes()) {
43603 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43604 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43605 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43606 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43607 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43608 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43609 if (NewLHS || NewRHS) {
43610 NewLHS = NewLHS ? NewLHS : LHS;
43611 NewRHS = NewRHS ? NewRHS : RHS;
43612 return TLO.CombineTo(
43613 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43614 }
43615 }
43616 break;
43617 }
43618 case X86ISD::VSHL:
43619 case X86ISD::VSRL:
43620 case X86ISD::VSRA: {
43621 // We only need the bottom 64-bits of the (128-bit) shift amount.
43622 SDValue Amt = Op.getOperand(1);
43623 MVT AmtVT = Amt.getSimpleValueType();
43624 assert(AmtVT.is128BitVector() && "Unexpected value type");
43625
43626 // If the shift amount is only ever reused as an SSE shift amount then we
43627 // know that only the bottom 64-bits are ever used.
43628 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43629 unsigned UseOpc = Use->getOpcode();
43630 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43631 UseOpc == X86ISD::VSRA) &&
43632 Use->getOperand(0) != Amt;
43633 });
43634
43635 APInt AmtUndef, AmtZero;
43636 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43637 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43638 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43639 Depth + 1, AssumeSingleUse))
43640 return true;
43641 [[fallthrough]];
43642 }
43643 case X86ISD::VSHLI:
43644 case X86ISD::VSRLI:
43645 case X86ISD::VSRAI: {
43646 SDValue Src = Op.getOperand(0);
43647 APInt SrcUndef;
43648 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43649 Depth + 1))
43650 return true;
43651
43652 // Fold shift(0,x) -> 0
43653 if (DemandedElts.isSubsetOf(KnownZero))
43654 return TLO.CombineTo(
43655 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43656
43657 // Aggressively peek through ops to get at the demanded elts.
43658 if (!DemandedElts.isAllOnes())
43659 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43660 Src, DemandedElts, TLO.DAG, Depth + 1))
43661 return TLO.CombineTo(
43662 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43663 break;
43664 }
43665 case X86ISD::VPSHA:
43666 case X86ISD::VPSHL:
43667 case X86ISD::VSHLV:
43668 case X86ISD::VSRLV:
43669 case X86ISD::VSRAV: {
43670 APInt LHSUndef, LHSZero;
43671 APInt RHSUndef, RHSZero;
43672 SDValue LHS = Op.getOperand(0);
43673 SDValue RHS = Op.getOperand(1);
43674 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43675 Depth + 1))
43676 return true;
43677
43678 // Fold shift(0,x) -> 0
43679 if (DemandedElts.isSubsetOf(LHSZero))
43680 return TLO.CombineTo(
43681 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43682
43683 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43684 Depth + 1))
43685 return true;
43686
43687 KnownZero = LHSZero;
43688 break;
43689 }
43690 case X86ISD::CMPM:
43691 case X86ISD::CMPP: {
43692 // Scalarize packed fp comparison if we only require element 0.
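// Illustrative sketch (not from the original source): if only lane 0 of
// CMPP(v4f32 %a, %b, cc) is demanded, it becomes
// scalar_to_vector(FSETCC(%a[0], %b[0], cc)); the CMPM form instead builds a
// v1i1 FSETCCM and inserts it into an undef mask vector.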
43693 if (DemandedElts == 1) {
43694 SDLoc dl(Op);
43695 MVT VT = Op.getSimpleValueType();
43696 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43697 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43698 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43699 SDValue CC = Op.getOperand(2);
43700 if (Opc == X86ISD::CMPM) {
43701 SDValue Cmp =
43702 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43703 return TLO.CombineTo(
43704 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43705 }
43706 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43707 return TLO.CombineTo(Op,
43708 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43709 }
43710 break;
43711 }
43712 case X86ISD::PCMPEQ:
43713 case X86ISD::PCMPGT: {
43714 APInt LHSUndef, LHSZero;
43715 APInt RHSUndef, RHSZero;
43716 SDValue LHS = Op.getOperand(0);
43717 SDValue RHS = Op.getOperand(1);
43718 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43719 Depth + 1))
43720 return true;
43721 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43722 Depth + 1))
43723 return true;
43724 break;
43725 }
43726 case X86ISD::KSHIFTL: {
43727 SDValue Src = Op.getOperand(0);
43728 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43729 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43730 unsigned ShiftAmt = Amt->getZExtValue();
43731
43732 if (ShiftAmt == 0)
43733 return TLO.CombineTo(Op, Src);
43734
43735 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43736 // single shift. We can do this if the bottom bits (which are shifted
43737 // out) are never demanded.
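// Illustrative sketch (not from the original source): KSHIFTL(KSHIFTR(k, 2), 4)
// becomes KSHIFTL(k, 2) when none of the low 4 result elements are demanded,
// since those are the only elements where the two forms can differ.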
43738 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43739 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43740 unsigned C1 = Src.getConstantOperandVal(1);
43741 unsigned NewOpc = X86ISD::KSHIFTL;
43742 int Diff = ShiftAmt - C1;
43743 if (Diff < 0) {
43744 Diff = -Diff;
43745 NewOpc = X86ISD::KSHIFTR;
43746 }
43747
43748 SDLoc dl(Op);
43749 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43750 return TLO.CombineTo(
43751 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43752 }
43753 }
43754
43755 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43756 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43757 Depth + 1))
43758 return true;
43759
43760 KnownUndef <<= ShiftAmt;
43761 KnownZero <<= ShiftAmt;
43762 KnownZero.setLowBits(ShiftAmt);
43763 break;
43764 }
43765 case X86ISD::KSHIFTR: {
43766 SDValue Src = Op.getOperand(0);
43767 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43768 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43769 unsigned ShiftAmt = Amt->getZExtValue();
43770
43771 if (ShiftAmt == 0)
43772 return TLO.CombineTo(Op, Src);
43773
43774 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43775 // single shift. We can do this if the top bits (which are shifted
43776 // out) are never demanded.
43777 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43778 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43779 unsigned C1 = Src.getConstantOperandVal(1);
43780 unsigned NewOpc = X86ISD::KSHIFTR;
43781 int Diff = ShiftAmt - C1;
43782 if (Diff < 0) {
43783 Diff = -Diff;
43784 NewOpc = X86ISD::KSHIFTL;
43785 }
43786
43787 SDLoc dl(Op);
43788 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43789 return TLO.CombineTo(
43790 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43791 }
43792 }
43793
43794 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43795 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43796 Depth + 1))
43797 return true;
43798
43799 KnownUndef.lshrInPlace(ShiftAmt);
43800 KnownZero.lshrInPlace(ShiftAmt);
43801 KnownZero.setHighBits(ShiftAmt);
43802 break;
43803 }
43804 case X86ISD::ANDNP: {
43805 // ANDNP = (~LHS & RHS);
43806 SDValue LHS = Op.getOperand(0);
43807 SDValue RHS = Op.getOperand(1);
43808
43809 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43810 APInt UndefElts;
43811 SmallVector<APInt> EltBits;
43812 int NumElts = VT.getVectorNumElements();
43813 int EltSizeInBits = VT.getScalarSizeInBits();
43814 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43815 APInt OpElts = DemandedElts;
43816 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43817 EltBits)) {
43818 OpBits.clearAllBits();
43819 OpElts.clearAllBits();
43820 for (int I = 0; I != NumElts; ++I) {
43821 if (!DemandedElts[I])
43822 continue;
43823 if (UndefElts[I]) {
43824 // We can't assume an undef src element gives an undef dst - the
43825 // other src might be zero.
43826 OpBits.setAllBits();
43827 OpElts.setBit(I);
43828 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43829 (!Invert && !EltBits[I].isZero())) {
43830 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43831 OpElts.setBit(I);
43832 }
43833 }
43834 }
43835 return std::make_pair(OpBits, OpElts);
43836 };
43837 APInt BitsLHS, EltsLHS;
43838 APInt BitsRHS, EltsRHS;
43839 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43840 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43841
43842 APInt LHSUndef, LHSZero;
43843 APInt RHSUndef, RHSZero;
43844 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43845 Depth + 1))
43846 return true;
43847 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43848 Depth + 1))
43849 return true;
43850
43851 if (!DemandedElts.isAllOnes()) {
43852 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43853 TLO.DAG, Depth + 1);
43854 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43855 TLO.DAG, Depth + 1);
43856 if (NewLHS || NewRHS) {
43857 NewLHS = NewLHS ? NewLHS : LHS;
43858 NewRHS = NewRHS ? NewRHS : RHS;
43859 return TLO.CombineTo(
43860 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43861 }
43862 }
43863 break;
43864 }
43865 case X86ISD::CVTSI2P:
43866 case X86ISD::CVTUI2P:
43867 case X86ISD::CVTPH2PS:
43868 case X86ISD::CVTPS2PH: {
43869 SDValue Src = Op.getOperand(0);
43870 EVT SrcVT = Src.getValueType();
43871 APInt SrcUndef, SrcZero;
43872 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43873 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43874 Depth + 1))
43875 return true;
43876 break;
43877 }
43878 case X86ISD::PACKSS:
43879 case X86ISD::PACKUS: {
43880 SDValue N0 = Op.getOperand(0);
43881 SDValue N1 = Op.getOperand(1);
43882
43883 APInt DemandedLHS, DemandedRHS;
43884 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43885
43886 APInt LHSUndef, LHSZero;
43887 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43888 Depth + 1))
43889 return true;
43890 APInt RHSUndef, RHSZero;
43891 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43892 Depth + 1))
43893 return true;
43894
43895 // TODO - pass on known zero/undef.
43896
43897 // Aggressively peek through ops to get at the demanded elts.
43898 // TODO - we should do this for all target/faux shuffles ops.
43899 if (!DemandedElts.isAllOnes()) {
43900 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43901 TLO.DAG, Depth + 1);
43902 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43903 TLO.DAG, Depth + 1);
43904 if (NewN0 || NewN1) {
43905 NewN0 = NewN0 ? NewN0 : N0;
43906 NewN1 = NewN1 ? NewN1 : N1;
43907 return TLO.CombineTo(Op,
43908 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43909 }
43910 }
43911 break;
43912 }
43913 case X86ISD::HADD:
43914 case X86ISD::HSUB:
43915 case X86ISD::FHADD:
43916 case X86ISD::FHSUB: {
43917 SDValue N0 = Op.getOperand(0);
43918 SDValue N1 = Op.getOperand(1);
43919
43920 APInt DemandedLHS, DemandedRHS;
43921 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43922
43923 APInt LHSUndef, LHSZero;
43924 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43925 Depth + 1))
43926 return true;
43927 APInt RHSUndef, RHSZero;
43928 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43929 Depth + 1))
43930 return true;
43931
43932 // TODO - pass on known zero/undef.
43933
43934 // Aggressively peek through ops to get at the demanded elts.
43935 // TODO: Handle repeated operands.
43936 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43937 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43938 TLO.DAG, Depth + 1);
43939 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43940 TLO.DAG, Depth + 1);
43941 if (NewN0 || NewN1) {
43942 NewN0 = NewN0 ? NewN0 : N0;
43943 NewN1 = NewN1 ? NewN1 : N1;
43944 return TLO.CombineTo(Op,
43945 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43946 }
43947 }
43948 break;
43949 }
43950 case X86ISD::VTRUNC:
43951 case X86ISD::VTRUNCS:
43952 case X86ISD::VTRUNCUS: {
43953 SDValue Src = Op.getOperand(0);
43954 MVT SrcVT = Src.getSimpleValueType();
43955 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43956 APInt SrcUndef, SrcZero;
43957 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43958 Depth + 1))
43959 return true;
43960 KnownZero = SrcZero.zextOrTrunc(NumElts);
43961 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43962 break;
43963 }
43964 case X86ISD::BLENDI: {
43965 SmallVector<int, 16> BlendMask;
43966 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43967 if (SDValue R = combineBlendOfPermutes(
43968 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43969 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43970 return TLO.CombineTo(Op, R);
43971 break;
43972 }
43973 case X86ISD::BLENDV: {
43974 APInt SelUndef, SelZero;
43975 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43976 SelZero, TLO, Depth + 1))
43977 return true;
43978
43979 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43980 APInt LHSUndef, LHSZero;
43981 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43982 LHSZero, TLO, Depth + 1))
43983 return true;
43984
43985 APInt RHSUndef, RHSZero;
43986 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43987 RHSZero, TLO, Depth + 1))
43988 return true;
43989
43990 KnownZero = LHSZero & RHSZero;
43991 KnownUndef = LHSUndef & RHSUndef;
43992 break;
43993 }
43994 case X86ISD::VZEXT_MOVL: {
43995 // If upper demanded elements are already zero then we have nothing to do.
43996 SDValue Src = Op.getOperand(0);
43997 APInt DemandedUpperElts = DemandedElts;
43998 DemandedUpperElts.clearLowBits(1);
43999 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
44000 return TLO.CombineTo(Op, Src);
44001 break;
44002 }
44003 case X86ISD::VZEXT_LOAD: {
44004 // If upper demanded elements are not demanded then simplify to a
44005 // scalar_to_vector(load()).
44006 MVT SVT = VT.getSimpleVT().getVectorElementType();
44007 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
44008 SDLoc DL(Op);
44009 auto *Mem = cast<MemSDNode>(Op);
44010 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
44011 Mem->getMemOperand());
44012 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
44013 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
44014 }
44015 break;
44016 }
44017 case X86ISD::VBROADCAST: {
44018 SDValue Src = Op.getOperand(0);
44019 MVT SrcVT = Src.getSimpleValueType();
44020 // Don't bother broadcasting if we just need the 0'th element.
44021 if (DemandedElts == 1) {
44022 if (!SrcVT.isVector())
44023 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44024 else if (Src.getValueType() != VT)
44025 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44026 SDLoc(Op));
44027 return TLO.CombineTo(Op, Src);
44028 }
44029 if (!SrcVT.isVector())
44030 break;
44031 APInt SrcUndef, SrcZero;
44032 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44033 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44034 Depth + 1))
44035 return true;
44036 // Aggressively peek through src to get at the demanded elt.
44037 // TODO - we should do this for all target/faux shuffles ops.
44038 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44039 Src, SrcElts, TLO.DAG, Depth + 1))
44040 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44041 break;
44042 }
44043 case X86ISD::VPERMV:
44044 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44045 Depth))
44046 return true;
44047 break;
44048 case X86ISD::PSHUFB:
44049 case X86ISD::VPERMV3:
44050 case X86ISD::VPERMILPV:
44051 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44052 Depth))
44053 return true;
44054 break;
44055 case X86ISD::VPPERM:
44056 case X86ISD::VPERMIL2:
44057 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44058 Depth))
44059 return true;
44060 break;
44061 }
44062
44063 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44064 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44065 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44066 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44067 DemandedElts.lshr(NumElts / 2) == 0) {
44068 unsigned SizeInBits = VT.getSizeInBits();
44069 unsigned ExtSizeInBits = SizeInBits / 2;
44070
44071 // See if 512-bit ops only use the bottom 128-bits.
44072 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44073 ExtSizeInBits = SizeInBits / 4;
44074
44075 switch (Opc) {
44076 // Scalar broadcast.
44077 case X86ISD::VBROADCAST: {
44078 SDLoc DL(Op);
44079 SDValue Src = Op.getOperand(0);
44080 if (Src.getValueSizeInBits() > ExtSizeInBits)
44081 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44082 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44083 ExtSizeInBits / VT.getScalarSizeInBits());
44084 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44085 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44086 TLO.DAG, DL, ExtSizeInBits));
44087 }
44088 case X86ISD::VBROADCAST_LOAD: {
44089 SDLoc DL(Op);
44090 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44091 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44092 ExtSizeInBits / VT.getScalarSizeInBits());
44093 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44094 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44095 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44096 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44097 MemIntr->getMemOperand());
44098 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44099 Bcst.getValue(1));
44100 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44101 TLO.DAG, DL, ExtSizeInBits));
44102 }
44103 // Subvector broadcast.
44104 case X86ISD::SUBV_BROADCAST_LOAD: {
44105 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44106 EVT MemVT = MemIntr->getMemoryVT();
44107 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44108 SDLoc DL(Op);
44109 SDValue Ld =
44110 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44111 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44112 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44113 Ld.getValue(1));
44114 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44115 TLO.DAG, DL, ExtSizeInBits));
44116 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44117 SDLoc DL(Op);
44118 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44119 ExtSizeInBits / VT.getScalarSizeInBits());
44120 if (SDValue BcstLd =
44121 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44122 return TLO.CombineTo(Op,
44123 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44124 TLO.DAG, DL, ExtSizeInBits));
44125 }
44126 break;
44127 }
44128 // Byte shifts by immediate.
44129 case X86ISD::VSHLDQ:
44130 case X86ISD::VSRLDQ:
44131 // Shift by uniform.
44132 case X86ISD::VSHL:
44133 case X86ISD::VSRL:
44134 case X86ISD::VSRA:
44135 // Shift by immediate.
44136 case X86ISD::VSHLI:
44137 case X86ISD::VSRLI:
44138 case X86ISD::VSRAI: {
44139 SDLoc DL(Op);
44140 SDValue Ext0 =
44141 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44142 SDValue ExtOp =
44143 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44144 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44145 SDValue Insert =
44146 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44147 return TLO.CombineTo(Op, Insert);
44148 }
44149 case X86ISD::VPERMI: {
44150 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44151 // TODO: This should be done in shuffle combining.
44152 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44153 SmallVector<int, 8> Mask;
44154 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44155 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44156 SDLoc DL(Op);
44157 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44158 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44159 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44160 return TLO.CombineTo(Op, Insert);
44161 }
44162 }
44163 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44164 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44165 SDLoc DL(Op);
44166 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44167 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44168 Op.getOperand(1));
44169 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44170 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44171 return TLO.CombineTo(Op, Insert);
44172 }
44173 break;
44174 }
44175 case X86ISD::VPERMV: {
44176 SmallVector<SDValue, 2> Ops;
44177 SmallVector<int, 16> Mask;
44178 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44179 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44180 VT == MVT::v16f32) &&
44181 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44182 // For lane-crossing shuffles, only split in half in case we're still
44183 // referencing higher elements.
44184 unsigned HalfElts = NumElts / 2;
44185 unsigned HalfSize = SizeInBits / 2;
44186 Mask.resize(HalfElts);
44187 if (all_of(Mask,
44188 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44189 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44190 SDLoc DL(Op);
44191 SDValue Ext;
44192 SDValue M =
44193 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44194 SDValue V =
44195 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44196 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44197 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44198 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44199 else {
44200 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44201 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44202 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44203 TLO.DAG.getBitcast(ShufVT, V), M);
44204 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44205 }
44206 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44207 Subtarget, TLO.DAG, DL, SizeInBits);
44208 return TLO.CombineTo(Op, Insert);
44209 }
44210 }
44211 break;
44212 }
44213 case X86ISD::VPERMV3: {
44214 SmallVector<SDValue, 2> Ops;
44215 SmallVector<int, 16> Mask;
44216 if (Subtarget.hasVLX() &&
44217 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44218 // For lane-crossing shuffles, only split in half in case we're still
44219 // referencing higher elements.
44220 unsigned HalfElts = NumElts / 2;
44221 unsigned HalfSize = SizeInBits / 2;
44222 Mask.resize(HalfElts);
44223 if (all_of(Mask, [&](int M) {
44224 return isUndefOrInRange(M, 0, HalfElts) ||
44225 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44226 })) {
44227 // Adjust mask elements for 2nd operand to point to half width.
44228 for (int &M : Mask)
44229 M = (M < NumElts) ? M : (M - HalfElts);
44230 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44231 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44232 SDLoc DL(Op);
44233 SDValue Ext = TLO.DAG.getNode(
44234 Opc, DL, HalfVT,
44235 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44236 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44237 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44238 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44239 Subtarget, TLO.DAG, DL, SizeInBits);
44240 return TLO.CombineTo(Op, Insert);
44241 }
44242 }
44243 break;
44244 }
44245 case X86ISD::VPERM2X128: {
44246 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
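// Illustrative sketch (not from the original source): with only the low 128
// bits of the result demanded, an immediate whose low nibble is 0x3 selects
// the high 128-bit half of operand 1, so the node can be rebuilt as
// insert(undef, extract_subvector(Op1, NumElts/2), 0); a nibble with bit 3
// set zeroes the demanded half instead.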
44247 SDLoc DL(Op);
44248 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44249 if (LoMask & 0x8)
44250 return TLO.CombineTo(
44251 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44252 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44253 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44254 SDValue ExtOp =
44255 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44256 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44257 SDValue Insert =
44258 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44259 return TLO.CombineTo(Op, Insert);
44260 }
44261 // Conversions.
44262 // TODO: Add more CVT opcodes when we have test coverage.
44263 case X86ISD::CVTTP2UI: {
44264 if (!Subtarget.hasVLX())
44265 break;
44266 [[fallthrough]];
44267 }
44268 case X86ISD::CVTTP2SI: {
44269 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44270 !Subtarget.hasVLX())
44271 break;
44272 [[fallthrough]];
44273 }
44274 case X86ISD::CVTPH2PS: {
44275 SDLoc DL(Op);
44276 unsigned Scale = SizeInBits / ExtSizeInBits;
44277 SDValue SrcOp = Op.getOperand(0);
44278 MVT SrcVT = SrcOp.getSimpleValueType();
44279 unsigned SrcExtSize =
44280 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44281 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44282 ExtSizeInBits / VT.getScalarSizeInBits());
44283 SDValue ExtOp = TLO.DAG.getNode(
44284 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44285 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44286 SDValue Insert =
44287 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44288 return TLO.CombineTo(Op, Insert);
44289 }
44290 // Zero upper elements.
44291 case X86ISD::VZEXT_MOVL:
44292 // Variable blend.
44293 case X86ISD::BLENDV:
44294 // Target unary shuffles:
44295 case X86ISD::MOVDDUP:
44296 // Target unary shuffles by immediate:
44297 case X86ISD::PSHUFD:
44298 case X86ISD::PSHUFLW:
44299 case X86ISD::PSHUFHW:
44300 case X86ISD::VPERMILPI:
44301 // (Non-Lane Crossing) Target Shuffles.
44302 case X86ISD::VPERMILPV:
44303 case X86ISD::VPERMIL2:
44304 case X86ISD::PSHUFB:
44305 case X86ISD::UNPCKL:
44306 case X86ISD::UNPCKH:
44307 case X86ISD::BLENDI:
44308 // Integer ops.
44309 case X86ISD::PACKSS:
44310 case X86ISD::PACKUS:
44311 case X86ISD::PCMPEQ:
44312 case X86ISD::PCMPGT:
44313 case X86ISD::PMULUDQ:
44314 case X86ISD::PMULDQ:
44315 case X86ISD::VSHLV:
44316 case X86ISD::VSRLV:
44317 case X86ISD::VSRAV:
44318 // Float ops.
44319 case X86ISD::FMAX:
44320 case X86ISD::FMIN:
44321 case X86ISD::FMAXC:
44322 case X86ISD::FMINC:
44323 case X86ISD::FRSQRT:
44324 case X86ISD::FRCP:
44325 // Horizontal Ops.
44326 case X86ISD::HADD:
44327 case X86ISD::HSUB:
44328 case X86ISD::FHADD:
44329 case X86ISD::FHSUB: {
44330 SDLoc DL(Op);
44331 SmallVector<SDValue, 4> Ops;
44332 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44333 SDValue SrcOp = Op.getOperand(i);
44334 EVT SrcVT = SrcOp.getValueType();
44335 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44336 "Unsupported vector size");
44337 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44338 ExtSizeInBits)
44339 : SrcOp);
44340 }
44341 MVT ExtVT = VT.getSimpleVT();
44342 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44343 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44344 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44345 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44346 SDValue Insert =
44347 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44348 return TLO.CombineTo(Op, Insert);
44349 }
44350 }
44351 }
44352
44353 // For splats, unless we *only* demand the 0'th element, stop attempts at
44354 // simplification here - we aren't going to improve things, and the existing
44355 // splat is better than any potential shuffle.
44356 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44357 return false;
44358
44359 // Get target/faux shuffle mask.
44360 APInt OpUndef, OpZero;
44361 SmallVector<int, 64> OpMask;
44362 SmallVector<SDValue, 2> OpInputs;
44363 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44364 OpZero, TLO.DAG, Depth, false))
44365 return false;
44366
44367 // Shuffle inputs must be the same size as the result.
44368 if (OpMask.size() != (unsigned)NumElts ||
44369 llvm::any_of(OpInputs, [VT](SDValue V) {
44370 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44371 !V.getValueType().isVector();
44372 }))
44373 return false;
44374
44375 KnownZero = OpZero;
44376 KnownUndef = OpUndef;
44377
44378 // Check if shuffle mask can be simplified to undef/zero/identity.
44379 int NumSrcs = OpInputs.size();
44380 for (int i = 0; i != NumElts; ++i)
44381 if (!DemandedElts[i])
44382 OpMask[i] = SM_SentinelUndef;
44383
44384 if (isUndefInRange(OpMask, 0, NumElts)) {
44385 KnownUndef.setAllBits();
44386 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44387 }
44388 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44389 KnownZero.setAllBits();
44390 return TLO.CombineTo(
44391 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44392 }
44393 for (int Src = 0; Src != NumSrcs; ++Src)
44394 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44395 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44396
44397 // Attempt to simplify inputs.
44398 for (int Src = 0; Src != NumSrcs; ++Src) {
44399 // TODO: Support inputs of different types.
44400 if (OpInputs[Src].getValueType() != VT)
44401 continue;
44402
44403 int Lo = Src * NumElts;
44404 APInt SrcElts = APInt::getZero(NumElts);
44405 for (int i = 0; i != NumElts; ++i)
44406 if (DemandedElts[i]) {
44407 int M = OpMask[i] - Lo;
44408 if (0 <= M && M < NumElts)
44409 SrcElts.setBit(M);
44410 }
44411
44412 // TODO - Propagate input undef/zero elts.
44413 APInt SrcUndef, SrcZero;
44414 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44415 TLO, Depth + 1))
44416 return true;
44417 }
44418
44419 // If we don't demand all elements, then attempt to combine to a simpler
44420 // shuffle.
44421 // We need to convert the depth to something combineX86ShufflesRecursively
44422 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44423 // to match. This prevents combineX86ShuffleChain from returning a
44424 // combined shuffle that's the same as the original root, causing an
44425 // infinite loop.
44426 if (!DemandedElts.isAllOnes()) {
44427 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44428
44429 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44430 for (int i = 0; i != NumElts; ++i)
44431 if (DemandedElts[i])
44432 DemandedMask[i] = i;
44433
44434 SDValue NewShuffle = combineX86ShufflesRecursively(
44435 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44436 X86::MaxShuffleCombineDepth - Depth,
44437 /*AllowVariableCrossLaneMask=*/true,
44438 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44439 TLO.DAG, SDLoc(Op), Subtarget);
44440 if (NewShuffle)
44441 return TLO.CombineTo(Op, NewShuffle);
44442 }
44443
44444 return false;
44445}
44446
44447 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44448 SDValue Op, const APInt &OriginalDemandedBits,
44449 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44450 unsigned Depth) const {
44451 EVT VT = Op.getValueType();
44452 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44453 unsigned Opc = Op.getOpcode();
44454 switch(Opc) {
44455 case X86ISD::VTRUNC: {
44456 KnownBits KnownOp;
44457 SDValue Src = Op.getOperand(0);
44458 MVT SrcVT = Src.getSimpleValueType();
44459
44460 // Simplify the input, using demanded bit information.
44461 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44462 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44463 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44464 return true;
44465 break;
44466 }
44467 case X86ISD::PMULDQ:
44468 case X86ISD::PMULUDQ: {
44469 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44470 KnownBits KnownLHS, KnownRHS;
44471 SDValue LHS = Op.getOperand(0);
44472 SDValue RHS = Op.getOperand(1);
44473
44474 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44475 // FIXME: Can we bound this better?
44476 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44477 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44478 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44479
44480 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44481 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44482 DemandedMaskLHS = DemandedMask;
44483 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44484 DemandedMaskRHS = DemandedMask;
44485
44486 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44487 KnownLHS, TLO, Depth + 1))
44488 return true;
44489 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44490 KnownRHS, TLO, Depth + 1))
44491 return true;
44492
44493 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44494 KnownRHS = KnownRHS.trunc(32);
44495 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44496 KnownRHS.getConstant().isOne()) {
44497 SDLoc DL(Op);
44498 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44499 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44500 }
44501
44502 // Aggressively peek through ops to get at the demanded low bits.
44503 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44504 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44505 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44506 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44507 if (DemandedLHS || DemandedRHS) {
44508 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44509 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44510 return TLO.CombineTo(
44511 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44512 }
44513 break;
44514 }
44515 case X86ISD::ANDNP: {
44516 KnownBits Known2;
44517 SDValue Op0 = Op.getOperand(0);
44518 SDValue Op1 = Op.getOperand(1);
44519
44520 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44521 Known, TLO, Depth + 1))
44522 return true;
44523
44524 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44525 OriginalDemandedElts, Known2, TLO, Depth + 1))
44526 return true;
44527
44528 // If the RHS is a constant, see if we can simplify it.
44529 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44530 OriginalDemandedElts, TLO))
44531 return true;
44532
44533 // ANDNP = (~Op0 & Op1);
44534 Known.One &= Known2.Zero;
44535 Known.Zero |= Known2.One;
44536 break;
44537 }
44538 case X86ISD::VSHLI: {
44539 SDValue Op0 = Op.getOperand(0);
44540 SDValue Op1 = Op.getOperand(1);
44541
44542 unsigned ShAmt = Op1->getAsZExtVal();
44543 if (ShAmt >= BitWidth)
44544 break;
44545
44546 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44547
44548 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44549 // single shift. We can do this if the bottom bits (which are shifted
44550 // out) are never demanded.
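// Illustrative sketch (not from the original source): for i32 lanes and with
// the low 8 bits of each lane not demanded, VSHLI(VSRLI(x, 8), 8) collapses
// to x, while VSHLI(VSRLI(x, 12), 8) becomes VSRLI(x, 4).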
44551 if (Op0.getOpcode() == X86ISD::VSRLI &&
44552 OriginalDemandedBits.countr_zero() >= ShAmt) {
44553 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44554 if (Shift2Amt < BitWidth) {
44555 int Diff = ShAmt - Shift2Amt;
44556 if (Diff == 0)
44557 return TLO.CombineTo(Op, Op0.getOperand(0));
44558
44559 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44560 SDValue NewShift = TLO.DAG.getNode(
44561 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44562 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44563 return TLO.CombineTo(Op, NewShift);
44564 }
44565 }
44566
44567 // If we are only demanding sign bits then we can use the shift source directly.
44568 unsigned NumSignBits =
44569 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44570 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44571 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44572 return TLO.CombineTo(Op, Op0);
44573
44574 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44575 TLO, Depth + 1))
44576 return true;
44577
44578 Known <<= ShAmt;
44579
44580 // Low bits known zero.
44581 Known.Zero.setLowBits(ShAmt);
44582
44583 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44584 // Attempt to avoid multi-use ops if we don't need anything from them.
44585 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44586 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44587 SDValue NewOp =
44588 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44589 return TLO.CombineTo(Op, NewOp);
44590 }
44591 }
44592 return false;
44593 }
44594 case X86ISD::VSRLI: {
44595 SDValue Op0 = Op.getOperand(0);
44596 SDValue Op1 = Op.getOperand(1);
44597
44598 unsigned ShAmt = Op1->getAsZExtVal();
44599 if (ShAmt >= BitWidth)
44600 break;
44601
44602 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44603
44604 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44605 TLO, Depth + 1))
44606 return true;
44607
44608 Known >>= ShAmt;
44609
44610 // High bits known zero.
44611 Known.Zero.setHighBits(ShAmt);
44612
44613 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44614 // Attempt to avoid multi-use ops if we don't need anything from them.
44615 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44616 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44617 SDValue NewOp =
44618 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44619 return TLO.CombineTo(Op, NewOp);
44620 }
44621 }
44622 return false;
44623 }
44624 case X86ISD::VSRAI: {
44625 SDValue Op0 = Op.getOperand(0);
44626 SDValue Op1 = Op.getOperand(1);
44627
44628 unsigned ShAmt = Op1->getAsZExtVal();
44629 if (ShAmt >= BitWidth)
44630 break;
44631
44632 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44633
44634 // If we just want the sign bit then we don't need to shift it.
44635 if (OriginalDemandedBits.isSignMask())
44636 return TLO.CombineTo(Op, Op0);
44637
44638 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44639 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44640 SDValue Op00 = Op0.getOperand(0);
44641 unsigned NumSignBits =
44642 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44643 if (ShAmt < NumSignBits)
44644 return TLO.CombineTo(Op, Op00);
44645 }
44646
44647 // If any of the demanded bits are produced by the sign extension, we also
44648 // demand the input sign bit.
44649 if (OriginalDemandedBits.countl_zero() < ShAmt)
44650 DemandedMask.setSignBit();
44651
44652 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44653 TLO, Depth + 1))
44654 return true;
44655
44656 Known >>= ShAmt;
44657
44658 // If the input sign bit is known to be zero, or if none of the top bits
44659 // are demanded, turn this into an unsigned shift right.
44660 if (Known.Zero[BitWidth - ShAmt - 1] ||
44661 OriginalDemandedBits.countl_zero() >= ShAmt)
44662 return TLO.CombineTo(
44663 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44664
44665 // High bits are known one.
44666 if (Known.One[BitWidth - ShAmt - 1])
44667 Known.One.setHighBits(ShAmt);
44668
44669 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44670 // Attempt to avoid multi-use ops if we don't need anything from them.
44671 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44672 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44673 SDValue NewOp =
44674 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44675 return TLO.CombineTo(Op, NewOp);
44676 }
44677 }
44678 return false;
44679 }
44680 case X86ISD::BLENDI: {
44681 SDValue LHS = Op.getOperand(0);
44682 SDValue RHS = Op.getOperand(1);
44683 APInt Mask = getBLENDIBlendMask(Op);
44684
44685 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44686 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44687 TLO, Depth + 1))
44688 return true;
44689
44690 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44691 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44692 TLO, Depth + 1))
44693 return true;
44694
44695 // Attempt to avoid multi-use ops if we don't need anything from them.
44696 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44697 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44698 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44699 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44700 if (NewLHS || NewRHS) {
44701 NewLHS = NewLHS ? NewLHS : LHS;
44702 NewRHS = NewRHS ? NewRHS : RHS;
44703 return TLO.CombineTo(Op,
44704 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44705 NewLHS, NewRHS, Op.getOperand(2)));
44706 }
44707 break;
44708 }
44709 case X86ISD::BLENDV: {
44710 SDValue Sel = Op.getOperand(0);
44711 SDValue LHS = Op.getOperand(1);
44712 SDValue RHS = Op.getOperand(2);
44713
44714 APInt SignMask = APInt::getSignMask(BitWidth);
44715 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44716 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44717 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44718 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44719 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44720 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44721
44722 if (NewSel || NewLHS || NewRHS) {
44723 NewSel = NewSel ? NewSel : Sel;
44724 NewLHS = NewLHS ? NewLHS : LHS;
44725 NewRHS = NewRHS ? NewRHS : RHS;
44726 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44727 NewSel, NewLHS, NewRHS));
44728 }
44729 break;
44730 }
44731 case X86ISD::PEXTRB:
44732 case X86ISD::PEXTRW: {
44733 SDValue Vec = Op.getOperand(0);
44734 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44735 MVT VecVT = Vec.getSimpleValueType();
44736 unsigned NumVecElts = VecVT.getVectorNumElements();
44737
44738 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44739 unsigned Idx = CIdx->getZExtValue();
44740 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44741
44742 // If we demand no bits from the vector then we must have demanded
44743 // bits from the implicit zext - simplify to zero.
44744 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44745 if (DemandedVecBits == 0)
44746 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44747
44748 APInt KnownUndef, KnownZero;
44749 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44750 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44751 KnownZero, TLO, Depth + 1))
44752 return true;
44753
44754 KnownBits KnownVec;
44755 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44756 KnownVec, TLO, Depth + 1))
44757 return true;
44758
44759 if (SDValue V = SimplifyMultipleUseDemandedBits(
44760 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44761 return TLO.CombineTo(
44762 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44763
44764 Known = KnownVec.zext(BitWidth);
44765 return false;
44766 }
44767 break;
44768 }
44769 case X86ISD::PINSRB:
44770 case X86ISD::PINSRW: {
44771 SDValue Vec = Op.getOperand(0);
44772 SDValue Scl = Op.getOperand(1);
44773 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44774 MVT VecVT = Vec.getSimpleValueType();
44775
44776 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44777 unsigned Idx = CIdx->getZExtValue();
44778 if (!OriginalDemandedElts[Idx])
44779 return TLO.CombineTo(Op, Vec);
44780
44781 KnownBits KnownVec;
44782 APInt DemandedVecElts(OriginalDemandedElts);
44783 DemandedVecElts.clearBit(Idx);
44784 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44785 KnownVec, TLO, Depth + 1))
44786 return true;
44787
44788 KnownBits KnownScl;
44789 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44790 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44791 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44792 return true;
44793
44794 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44795 Known = KnownVec.intersectWith(KnownScl);
44796 return false;
44797 }
44798 break;
44799 }
44800 case X86ISD::PACKSS:
44801 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44802 // sign bit then we can just ask for the source operands' sign bits.
44803 // TODO - add known bits handling.
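// Illustrative sketch (not from the original source): for
// PACKSS(v8i16 %a, v8i16 %b) producing v16i8, the sign bit of each i8 result
// equals the sign bit of its i16 source element (signed saturation preserves
// the sign), so only the i16 sign bits need to be demanded from %a and %b.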
44804 if (OriginalDemandedBits.isSignMask()) {
44805 APInt DemandedLHS, DemandedRHS;
44806 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44807
44808 KnownBits KnownLHS, KnownRHS;
44809 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44810 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44811 KnownLHS, TLO, Depth + 1))
44812 return true;
44813 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44814 KnownRHS, TLO, Depth + 1))
44815 return true;
44816
44817 // Attempt to avoid multi-use ops if we don't need anything from them.
44818 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44819 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44820 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44821 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44822 if (DemandedOp0 || DemandedOp1) {
44823 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44824 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44825 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44826 }
44827 }
44828 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44829 break;
44830 case X86ISD::VBROADCAST: {
44831 SDValue Src = Op.getOperand(0);
44832 MVT SrcVT = Src.getSimpleValueType();
44833 APInt DemandedElts = APInt::getOneBitSet(
44834 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44835 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44836 TLO, Depth + 1))
44837 return true;
44838 // If we don't need the upper bits, attempt to narrow the broadcast source.
44839 // Don't attempt this on AVX512 as it might affect broadcast folding.
44840 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44841 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44842 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44843 Src->hasOneUse()) {
44844 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44845 SDValue NewSrc =
44846 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44847 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44848 SDValue NewBcst =
44849 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44850 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44851 }
44852 break;
44853 }
44854 case X86ISD::PCMPGT:
44855 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44856 // iff we only need the sign bit then we can use R directly.
44857 if (OriginalDemandedBits.isSignMask() &&
44858 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44859 return TLO.CombineTo(Op, Op.getOperand(1));
44860 break;
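// MOVMSK packs the sign bit of each source element into the low NumElts bits
// of the scalar result; all higher result bits are zero, so only the sign bits
// of the demanded lanes matter.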
44861 case X86ISD::MOVMSK: {
44862 SDValue Src = Op.getOperand(0);
44863 MVT SrcVT = Src.getSimpleValueType();
44864 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44865 unsigned NumElts = SrcVT.getVectorNumElements();
44866
44867 // If we don't need the sign bits at all just return zero.
44868 if (OriginalDemandedBits.countr_zero() >= NumElts)
44869 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44870
44871 // See if we only demand bits from the lower 128-bit vector.
44872 if (SrcVT.is256BitVector() &&
44873 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44874 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44875 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44876 }
44877
44878 // Only demand the vector elements of the sign bits we need.
44879 APInt KnownUndef, KnownZero;
44880 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44881 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44882 TLO, Depth + 1))
44883 return true;
44884
44885 Known.Zero = KnownZero.zext(BitWidth);
44886 Known.Zero.setHighBits(BitWidth - NumElts);
44887
44888 // MOVMSK only uses the MSB from each vector element.
44889 KnownBits KnownSrc;
44890 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44891 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44892 Depth + 1))
44893 return true;
44894
44895 if (KnownSrc.One[SrcBits - 1])
44896 Known.One.setLowBits(NumElts);
44897 else if (KnownSrc.Zero[SrcBits - 1])
44898 Known.Zero.setLowBits(NumElts);
44899
44900 // Attempt to avoid multi-use ops if we don't need anything from them.
44901 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44902 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44903 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44904 return false;
44905 }
44906 case X86ISD::TESTP: {
44907 SDValue Op0 = Op.getOperand(0);
44908 SDValue Op1 = Op.getOperand(1);
44909 MVT OpVT = Op0.getSimpleValueType();
44910 assert((OpVT.getVectorElementType() == MVT::f32 ||
44911 OpVT.getVectorElementType() == MVT::f64) &&
44912 "Illegal vector type for X86ISD::TESTP");
44913
44914 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44915 KnownBits KnownSrc;
44916 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44917 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44918 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44919 AssumeSingleUse) ||
44920 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44921 AssumeSingleUse);
44922 }
44923 case X86ISD::CMOV: {
44924 KnownBits Known2;
44925 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44926 OriginalDemandedElts, Known2, TLO, Depth + 1))
44927 return true;
44928 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44929 OriginalDemandedElts, Known, TLO, Depth + 1))
44930 return true;
44931
44932 // Only known if known in both the LHS and RHS.
44933 Known = Known.intersectWith(Known2);
44934 return false;
44935 }
44936 case X86ISD::BEXTR:
44937 case X86ISD::BEXTRI: {
44938 SDValue Op0 = Op.getOperand(0);
44939 SDValue Op1 = Op.getOperand(1);
44940
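// The BEXTR control operand encodes the start bit index in bits [7:0] and the
// extraction length in bits [15:8]; e.g. a control of 0x0804 extracts 8 bits
// starting at bit 4. Higher control bits are ignored by the instruction.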
44941 // Only bottom 16-bits of the control bits are required.
44942 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44943 // NOTE: SimplifyDemandedBits won't do this for constants.
44944 uint64_t Val1 = Cst1->getZExtValue();
44945 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44946 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44947 SDLoc DL(Op);
44948 return TLO.CombineTo(
44949 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44950 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44951 }
44952
44953 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44954 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44955
44956 // If the length is 0, the result is 0.
44957 if (Length == 0) {
44958 Known.setAllZero();
44959 return false;
44960 }
44961
44962 if ((Shift + Length) <= BitWidth) {
44963 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44964 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44965 return true;
44966
44967 Known = Known.extractBits(Length, Shift);
44968 Known = Known.zextOrTrunc(BitWidth);
44969 return false;
44970 }
44971 } else {
44972 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44973 KnownBits Known1;
44974 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44975 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44976 return true;
44977
44978 // If the length is 0, replace with 0.
44979 KnownBits LengthBits = Known1.extractBits(8, 8);
44980 if (LengthBits.isZero())
44981 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44982 }
44983
44984 break;
44985 }
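// PDEP deposits the low bits of operand 0 at the positions of the set bits in
// the mask (operand 1); all other result bits are zero, so mask bits above the
// highest demanded result bit never affect the result.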
44986 case X86ISD::PDEP: {
44987 SDValue Op0 = Op.getOperand(0);
44988 SDValue Op1 = Op.getOperand(1);
44989
44990 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44991 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44992
44993 // If the demanded bits have leading zeroes, we don't demand those from the
44994 // mask.
44995 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44996 return true;
44997
44998 // The number of possible 1s in the mask determines the number of LSBs of
44999 // operand 0 used. Undemanded bits from the mask don't matter so filter
45000 // them before counting.
45001 KnownBits Known2;
45002 uint64_t Count = (~Known.Zero & LoMask).popcount();
45003 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
45004 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
45005 return true;
45006
45007 // Zeroes are retained from the mask, but not ones.
45008 Known.One.clearAllBits();
45009 // The result will have at least as many trailing zeros as the non-mask
45010 // operand since bits can only map to the same or higher bit position.
45011 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
45012 return false;
45013 }
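// VPMADD52L/VPMADD52H multiply the low 52 bits of operands 0 and 1 to form a
// 104-bit product and add its low (L) or high (H) 52 bits to the 64-bit
// accumulator in operand 2, so only 52 bits of each multiplicand are read.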
45014 case X86ISD::VPMADD52L:
45015 case X86ISD::VPMADD52H: {
45016 KnownBits KnownOp0, KnownOp1, KnownOp2;
45017 SDValue Op0 = Op.getOperand(0);
45018 SDValue Op1 = Op.getOperand(1);
45019 SDValue Op2 = Op.getOperand(2);
45020 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45021 // operand 2).
45022 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45023 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45024 TLO, Depth + 1))
45025 return true;
45026
45027 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45028 TLO, Depth + 1))
45029 return true;
45030
45031 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45032 KnownOp2, TLO, Depth + 1))
45033 return true;
45034
45035 KnownBits KnownMul;
45036 KnownOp0 = KnownOp0.trunc(52);
45037 KnownOp1 = KnownOp1.trunc(52);
45038 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45039 : KnownBits::mulhu(KnownOp0, KnownOp1);
45040 KnownMul = KnownMul.zext(64);
45041
45042 // lo/hi(X * Y) + Z --> C + Z
45043 if (KnownMul.isConstant()) {
45044 SDLoc DL(Op);
45045 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45046 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45047 }
45048
45049 Known = KnownBits::add(KnownMul, KnownOp2);
45050 return false;
45051 }
45052 }
45053
45054 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45055 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45056}
45057
45058 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45059 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45060 SelectionDAG &DAG, unsigned Depth) const {
45061 int NumElts = DemandedElts.getBitWidth();
45062 unsigned Opc = Op.getOpcode();
45063 EVT VT = Op.getValueType();
45064
45065 switch (Opc) {
45066 case X86ISD::PINSRB:
45067 case X86ISD::PINSRW: {
45068 // If we don't demand the inserted element, return the base vector.
45069 SDValue Vec = Op.getOperand(0);
45070 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45071 MVT VecVT = Vec.getSimpleValueType();
45072 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45073 !DemandedElts[CIdx->getZExtValue()])
45074 return Vec;
45075 break;
45076 }
45077 case X86ISD::VSHLI: {
45078 // If we are only demanding sign bits then we can use the shift source
45079 // directly.
45080 SDValue Op0 = Op.getOperand(0);
45081 unsigned ShAmt = Op.getConstantOperandVal(1);
45082 unsigned BitWidth = DemandedBits.getBitWidth();
45083 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45084 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45085 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45086 return Op0;
45087 break;
45088 }
45089 case X86ISD::VSRAI:
45090 // iff we only need the sign bit then we can use the source directly.
45091 // TODO: generalize where we only demand extended signbits.
45092 if (DemandedBits.isSignMask())
45093 return Op.getOperand(0);
45094 break;
45095 case X86ISD::PCMPGT:
45096 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45097 // iff we only need the sign bit then we can use R directly.
45098 if (DemandedBits.isSignMask() &&
45099 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45100 return Op.getOperand(1);
45101 break;
45102 case X86ISD::BLENDV: {
45103 // BLENDV: Cond (MSB) ? LHS : RHS
45104 SDValue Cond = Op.getOperand(0);
45105 SDValue LHS = Op.getOperand(1);
45106 SDValue RHS = Op.getOperand(2);
45107
45108 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45109 if (CondKnown.isNegative())
45110 return LHS;
45111 if (CondKnown.isNonNegative())
45112 return RHS;
45113 break;
45114 }
45115 case X86ISD::ANDNP: {
45116 // ANDNP = (~LHS & RHS);
45117 SDValue LHS = Op.getOperand(0);
45118 SDValue RHS = Op.getOperand(1);
45119
45120 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45121 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45122
45123 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45124 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45125 // this context, so return RHS.
45126 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45127 return RHS;
45128 break;
45129 }
45130 }
45131
45132 APInt ShuffleUndef, ShuffleZero;
45133 SmallVector<int, 16> ShuffleMask;
45134 SmallVector<SDValue, 16> ShuffleOps;
45135 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45136 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45137 // If all the demanded elts are from one operand and are inline,
45138 // then we can use the operand directly.
45139 int NumOps = ShuffleOps.size();
45140 if (ShuffleMask.size() == (unsigned)NumElts &&
45141 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45142 return VT.getSizeInBits() == V.getValueSizeInBits();
45143 })) {
45144
45145 if (DemandedElts.isSubsetOf(ShuffleUndef))
45146 return DAG.getUNDEF(VT);
45147 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45148 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45149
45150 // Bitmask that indicates which ops have only been accessed 'inline'.
45151 APInt IdentityOp = APInt::getAllOnes(NumOps);
45152 for (int i = 0; i != NumElts; ++i) {
45153 int M = ShuffleMask[i];
45154 if (!DemandedElts[i] || ShuffleUndef[i])
45155 continue;
45156 int OpIdx = M / NumElts;
45157 int EltIdx = M % NumElts;
45158 if (M < 0 || EltIdx != i) {
45159 IdentityOp.clearAllBits();
45160 break;
45161 }
45162 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45163 if (IdentityOp == 0)
45164 break;
45165 }
45166 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45167 "Multiple identity shuffles detected");
45168
45169 if (IdentityOp != 0)
45170 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45171 }
45172 }
45173
45174 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45175 Op, DemandedBits, DemandedElts, DAG, Depth);
45176}
45177
45178 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45179 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45180 bool PoisonOnly, unsigned Depth) const {
45181 unsigned NumElts = DemandedElts.getBitWidth();
45182
45183 switch (Op.getOpcode()) {
45185 case X86ISD::Wrapper:
45186 case X86ISD::WrapperRIP:
45187 return true;
45188 case X86ISD::BLENDI:
45189 case X86ISD::PSHUFB:
45190 case X86ISD::PSHUFD:
45191 case X86ISD::UNPCKL:
45192 case X86ISD::UNPCKH:
45193 case X86ISD::VPERMILPV:
45194 case X86ISD::VPERMILPI:
45195 case X86ISD::VPERMV:
45196 case X86ISD::VPERMV3: {
45197 SmallVector<int, 8> Mask;
45198 SmallVector<SDValue, 2> Ops;
45199 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45200 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45201 APInt::getZero(NumElts));
45202 for (auto M : enumerate(Mask)) {
45203 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45204 continue;
45205 if (M.value() == SM_SentinelUndef)
45206 return false;
45207 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45208 "Shuffle mask index out of range");
45209 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45210 }
45211 for (auto Op : enumerate(Ops))
45212 if (!DemandedSrcElts[Op.index()].isZero() &&
45213 !DAG.isGuaranteedNotToBeUndefOrPoison(
45214 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45215 return false;
45216 return true;
45217 }
45218 break;
45219 }
45220 }
45221 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45222 Op, DemandedElts, DAG, PoisonOnly, Depth);
45223}
45224
45225 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45226 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45227 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45228
45229 switch (Op.getOpcode()) {
45230 // SSE bit logic.
45231 case X86ISD::FAND:
45232 case X86ISD::FOR:
45233 case X86ISD::FXOR:
45234 case X86ISD::FANDN:
45235 case X86ISD::ANDNP:
45236 case X86ISD::VPTERNLOG:
45237 return false;
45238 // SSE vector insert/extracts use modulo indices.
45239 case X86ISD::PINSRB:
45240 case X86ISD::PINSRW:
45241 case X86ISD::PEXTRB:
45242 case X86ISD::PEXTRW:
45243 return false;
45244 // SSE vector multiplies are either inbounds or saturate.
45245 case X86ISD::VPMADDUBSW:
45246 case X86ISD::VPMADDWD:
45247 return false;
45248 // SSE vector shifts handle out of bounds shift amounts.
45249 case X86ISD::VSHLI:
45250 case X86ISD::VSRLI:
45251 case X86ISD::VSRAI:
45252 return false;
45253 // SSE blends.
45254 case X86ISD::BLENDI:
45255 case X86ISD::BLENDV:
45256 return false;
45257 // SSE target shuffles.
45258 case X86ISD::PSHUFB:
45259 case X86ISD::PSHUFD:
45260 case X86ISD::UNPCKL:
45261 case X86ISD::UNPCKH:
45262 case X86ISD::VPERMILPV:
45263 case X86ISD::VPERMILPI:
45264 case X86ISD::VPERMV:
45265 case X86ISD::VPERMV3:
45266 return false;
45267 // SSE comparisons handle all icmp/fcmp cases.
45268 // TODO: Add CMPM/MM with test coverage.
45269 case X86ISD::CMPP:
45270 case X86ISD::PCMPEQ:
45271 case X86ISD::PCMPGT:
45272 return false;
45273 // SSE signbit extraction.
45274 case X86ISD::MOVMSK:
45275 return false;
45276 // GFNI instructions.
45277 case X86ISD::GF2P8AFFINEINVQB:
45278 case X86ISD::GF2P8AFFINEQB:
45279 case X86ISD::GF2P8MULB:
45280 return false;
45281 case ISD::INTRINSIC_WO_CHAIN:
45282 switch (Op->getConstantOperandVal(0)) {
45283 case Intrinsic::x86_sse2_pmadd_wd:
45284 case Intrinsic::x86_avx2_pmadd_wd:
45285 case Intrinsic::x86_avx512_pmaddw_d_512:
45286 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45287 case Intrinsic::x86_avx2_pmadd_ub_sw:
45288 case Intrinsic::x86_avx512_pmaddubs_w_512:
45289 return false;
45290 case Intrinsic::x86_avx512_vpermi2var_d_128:
45291 case Intrinsic::x86_avx512_vpermi2var_d_256:
45292 case Intrinsic::x86_avx512_vpermi2var_d_512:
45293 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45294 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45295 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45296 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45297 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45298 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45299 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45300 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45301 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45302 case Intrinsic::x86_avx512_vpermi2var_q_128:
45303 case Intrinsic::x86_avx512_vpermi2var_q_256:
45304 case Intrinsic::x86_avx512_vpermi2var_q_512:
45305 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45306 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45307 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45308 return false;
45309 }
45310 }
45311 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45312 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45313}
45314
45315 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45316 const APInt &DemandedElts,
45317 APInt &UndefElts,
45318 const SelectionDAG &DAG,
45319 unsigned Depth) const {
45320 unsigned NumElts = DemandedElts.getBitWidth();
45321 unsigned Opc = Op.getOpcode();
45322
45323 switch (Opc) {
45324 case X86ISD::VBROADCAST:
45325 case X86ISD::VBROADCAST_LOAD:
45326 UndefElts = APInt::getZero(NumElts);
45327 return true;
45328 }
45329
45330 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45331 DAG, Depth);
45332}
45333
45334// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45335// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45336static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45337 bool AllowTruncate, unsigned Depth) {
45338 // Limit recursion.
45339 if (Depth >= SelectionDAG::MaxRecursionDepth)
45340 return false;
45341 switch (Src.getOpcode()) {
45342 case ISD::TRUNCATE:
45343 if (!AllowTruncate)
45344 return false;
45345 [[fallthrough]];
45346 case ISD::SETCC:
45347 return Src.getOperand(0).getValueSizeInBits() == Size;
45348 case ISD::FREEZE:
45349 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45350 Depth + 1);
45351 case ISD::AND:
45352 case ISD::XOR:
45353 case ISD::OR:
45354 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45355 Depth + 1) &&
45356 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45357 Depth + 1);
45358 case ISD::SELECT:
45359 case ISD::VSELECT:
45360 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45361 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45362 Depth + 1) &&
45363 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45364 Depth + 1);
45365 case ISD::BUILD_VECTOR:
45366 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45367 ISD::isBuildVectorAllOnes(Src.getNode());
45368 }
45369 return false;
45370}
45371
45372// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45373static unsigned getAltBitOpcode(unsigned Opcode) {
45374 switch(Opcode) {
45375 // clang-format off
45376 case ISD::AND: return X86ISD::FAND;
45377 case ISD::OR: return X86ISD::FOR;
45378 case ISD::XOR: return X86ISD::FXOR;
45379 case X86ISD::ANDNP: return X86ISD::FANDN;
45380 // clang-format on
45381 }
45382 llvm_unreachable("Unknown bitwise opcode");
45383}
45384
45385// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45386 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45387 const SDLoc &DL) {
45388 EVT SrcVT = Src.getValueType();
45389 if (SrcVT != MVT::v4i1)
45390 return SDValue();
45391
45392 switch (Src.getOpcode()) {
45393 case ISD::SETCC:
45394 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45395 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45396 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45397 SDValue Op0 = Src.getOperand(0);
45398 if (ISD::isNormalLoad(Op0.getNode()))
45399 return DAG.getBitcast(MVT::v4f32, Op0);
45400 if (Op0.getOpcode() == ISD::BITCAST &&
45401 Op0.getOperand(0).getValueType() == MVT::v4f32)
45402 return Op0.getOperand(0);
45403 }
45404 break;
45405 case ISD::AND:
45406 case ISD::XOR:
45407 case ISD::OR: {
45408 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45409 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45410 if (Op0 && Op1)
45411 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45412 Op1);
45413 break;
45414 }
45415 }
45416 return SDValue();
45417}
45418
45419// Helper to push sign extension of vXi1 SETCC result through bitops.
45420 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45421 SDValue Src, const SDLoc &DL) {
45422 switch (Src.getOpcode()) {
45423 case ISD::SETCC:
45424 case ISD::FREEZE:
45425 case ISD::TRUNCATE:
45426 case ISD::BUILD_VECTOR:
45427 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45428 case ISD::AND:
45429 case ISD::XOR:
45430 case ISD::OR:
45431 return DAG.getNode(
45432 Src.getOpcode(), DL, SExtVT,
45433 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45434 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45435 case ISD::SELECT:
45436 case ISD::VSELECT:
45437 return DAG.getSelect(
45438 DL, SExtVT, Src.getOperand(0),
45439 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45440 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45441 }
45442 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45443}
45444
45445// Try to match patterns such as
45446// (i16 bitcast (v16i1 x))
45447// ->
45448// (i16 movmsk (16i8 sext (v16i1 x)))
45449// before the illegal vector is scalarized on subtargets that don't have legal
45450// vxi1 types.
45451 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45452 const SDLoc &DL,
45453 const X86Subtarget &Subtarget) {
45454 EVT SrcVT = Src.getValueType();
45455 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45456 return SDValue();
45457
45458 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45459 // legalization destroys the v4i32 type.
45460 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45461 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45462 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45463 DAG.getBitcast(MVT::v4f32, V));
45464 return DAG.getZExtOrTrunc(V, DL, VT);
45465 }
45466 }
45467
45468 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45469 // movmskb even with avx512. This will be better than truncating to vXi1 and
45470 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45471 // vpcmpeqb/vpcmpgtb.
45472 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45473 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45474 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45475 Src.getOperand(0).getValueType() == MVT::v64i8);
45476
45477 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45478 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45479 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45480 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45481 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45482 EVT CmpVT = Src.getOperand(0).getValueType();
45483 EVT EltVT = CmpVT.getVectorElementType();
45484 if (CmpVT.getSizeInBits() <= 256 &&
45485 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45486 PreferMovMsk = true;
45487 }
45488
45489 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45490 // MOVMSK is supported in SSE2 or later.
45491 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45492 return SDValue();
45493
45494 // If the upper ops of a concatenation are undef, then try to bitcast the
45495 // lower op and extend.
45496 SmallVector<SDValue, 4> SubSrcOps;
45497 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45498 SubSrcOps.size() >= 2) {
45499 SDValue LowerOp = SubSrcOps[0];
45500 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45501 if (LowerOp.getOpcode() == ISD::SETCC &&
45502 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45503 EVT SubVT = VT.getIntegerVT(
45504 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45505 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45506 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45507 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45508 }
45509 }
45510 }
45511
45512 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45513 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45514 // v8i16 and v16i16.
45515 // For these two cases, we can shuffle the upper element bytes to a
45516 // consecutive sequence at the start of the vector and treat the results as
45517 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45518 // for v16i16 this is not the case, because the shuffle is expensive, so we
45519 // avoid sign-extending to this type entirely.
45520 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45521 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45522 MVT SExtVT;
45523 bool PropagateSExt = false;
45524 switch (SrcVT.getSimpleVT().SimpleTy) {
45525 default:
45526 return SDValue();
45527 case MVT::v2i1:
45528 SExtVT = MVT::v2i64;
45529 break;
45530 case MVT::v4i1:
45531 SExtVT = MVT::v4i32;
45532 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45533 // sign-extend to a 256-bit operation to avoid truncation.
45534 if (Subtarget.hasAVX() &&
45535 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45536 SExtVT = MVT::v4i64;
45537 PropagateSExt = true;
45538 }
45539 break;
45540 case MVT::v8i1:
45541 SExtVT = MVT::v8i16;
45542 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45543 // sign-extend to a 256-bit operation to match the compare.
45544 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45545 // 256-bit because the shuffle is cheaper than sign extending the result of
45546 // the compare.
45547 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45548 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45549 SExtVT = MVT::v8i32;
45550 PropagateSExt = true;
45551 }
45552 break;
45553 case MVT::v16i1:
45554 SExtVT = MVT::v16i8;
45555 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45556 // it is not profitable to sign-extend to 256-bit because this will
45557 // require an extra cross-lane shuffle which is more expensive than
45558 // truncating the result of the compare to 128-bits.
45559 break;
45560 case MVT::v32i1:
45561 SExtVT = MVT::v32i8;
45562 break;
45563 case MVT::v64i1:
45564 // If we have AVX512F, but not AVX512BW, and the input is truncated from
45565 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45566 if (Subtarget.hasAVX512()) {
45567 if (Subtarget.hasBWI())
45568 return SDValue();
45569 SExtVT = MVT::v64i8;
45570 break;
45571 }
45572 // Split if this is a <64 x i8> comparison result.
45573 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45574 SExtVT = MVT::v64i8;
45575 break;
45576 }
45577 return SDValue();
45578 };
45579
45580 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45581 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45582
45583 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45584 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45585 } else {
45586 if (SExtVT == MVT::v8i16) {
45587 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45588 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45589 }
45590 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45591 }
45592
45593 EVT IntVT =
45594 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45595 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45596 return DAG.getBitcast(VT, V);
45597}
45598
45599// Convert a vXi1 constant build vector to the same width scalar integer.
45600 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45601 EVT SrcVT = Op.getValueType();
45602 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45603 "Expected a vXi1 vector");
45605 "Expected a constant build vector");
45606
45607 APInt Imm(SrcVT.getVectorNumElements(), 0);
45608 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45609 SDValue In = Op.getOperand(Idx);
45610 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45611 Imm.setBit(Idx);
45612 }
45613 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45614 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45615}
45616
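// Try to move bitwise logic between the mask (vXi1) and scalar integer domains
// so that the surrounding bitcasts fold away, avoiding GPR<->k-register
// transfers.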
45617 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45618 TargetLowering::DAGCombinerInfo &DCI,
45619 const X86Subtarget &Subtarget) {
45620 using namespace SDPatternMatch;
45621 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45622
45623 if (!DCI.isBeforeLegalizeOps())
45624 return SDValue();
45625
45626 // Only do this if we have k-registers.
45627 if (!Subtarget.hasAVX512())
45628 return SDValue();
45629
45630 EVT DstVT = N->getValueType(0);
45631 SDValue Op = N->getOperand(0);
45632 EVT SrcVT = Op.getValueType();
45633
45634 // Make sure we have a bitcast between mask registers and a scalar type.
45635 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45636 DstVT.isScalarInteger()) &&
45637 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45638 SrcVT.isScalarInteger()))
45639 return SDValue();
45640
45641 SDValue LHS, RHS;
45642
45643 // Look for logic ops.
45645 return SDValue();
45646
45647 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45648 // least one of the getBitcast() will fold away).
45649 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45650 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45651 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45652 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45653
45654 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45655 // Most of these have to move a constant from the scalar domain anyway.
45656 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45657 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45658 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45659 DAG.getBitcast(DstVT, LHS), RHS);
45660 }
45661
45662 return SDValue();
45663}
45664
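// Lower a 64-bit build vector to an x86mmx value: each element is moved into
// an MMX register via MMX_MOVW2D/MOVDQ2Q, splats are broadcast with
// PUNPCKL+PSHUFW, and general vectors are assembled with a tree of PUNPCKL
// intrinsics.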
45665 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45666 const X86Subtarget &Subtarget) {
45667 SDLoc DL(BV);
45668 unsigned NumElts = BV->getNumOperands();
45669 SDValue Splat = BV->getSplatValue();
45670
45671 // Build MMX element from integer GPR or SSE float values.
45672 auto CreateMMXElement = [&](SDValue V) {
45673 if (V.isUndef())
45674 return DAG.getUNDEF(MVT::x86mmx);
45675 if (V.getValueType().isFloatingPoint()) {
45676 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45677 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45678 V = DAG.getBitcast(MVT::v2i64, V);
45679 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45680 }
45681 V = DAG.getBitcast(MVT::i32, V);
45682 } else {
45683 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45684 }
45685 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45686 };
45687
45688 // Convert build vector ops to MMX data in the bottom elements.
45689 SmallVector<SDValue, 8> Ops;
45690
45691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45692
45693 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45694 if (Splat) {
45695 if (Splat.isUndef())
45696 return DAG.getUNDEF(MVT::x86mmx);
45697
45698 Splat = CreateMMXElement(Splat);
45699
45700 if (Subtarget.hasSSE1()) {
45701 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45702 if (NumElts == 8)
45703 Splat = DAG.getNode(
45704 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45705 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45706 TLI.getPointerTy(DAG.getDataLayout())),
45707 Splat, Splat);
45708
45709 // Use PSHUFW to repeat 16-bit elements.
45710 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45711 return DAG.getNode(
45712 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45713 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45714 TLI.getPointerTy(DAG.getDataLayout())),
45715 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45716 }
45717 Ops.append(NumElts, Splat);
45718 } else {
45719 for (unsigned i = 0; i != NumElts; ++i)
45720 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45721 }
45722
45723 // Use tree of PUNPCKLs to build up general MMX vector.
45724 while (Ops.size() > 1) {
45725 unsigned NumOps = Ops.size();
45726 unsigned IntrinOp =
45727 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45728 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45729 : Intrinsic::x86_mmx_punpcklbw));
45730 SDValue Intrin = DAG.getTargetConstant(
45731 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45732 for (unsigned i = 0; i != NumOps; i += 2)
45733 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45734 Ops[i], Ops[i + 1]);
45735 Ops.resize(NumOps / 2);
45736 }
45737
45738 return Ops[0];
45739}
45740
45741// Recursive function that attempts to find if a bool vector node was originally
45742// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45743// integer. If so, replace the scalar ops with bool vector equivalents back down
45744// the chain.
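// For example, (v16i1 bitcast (shl (i16 bitcast (v16i1 X)), C)) can be
// rewritten as a KSHIFTL of X by C, keeping the value in a mask register.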
45745 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45746 SelectionDAG &DAG,
45747 const X86Subtarget &Subtarget,
45748 unsigned Depth = 0) {
45749 if (Depth >= SelectionDAG::MaxRecursionDepth)
45750 return SDValue(); // Limit search depth.
45751
45752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45753 unsigned Opc = V.getOpcode();
45754 switch (Opc) {
45755 case ISD::BITCAST: {
45756 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45757 SDValue Src = V.getOperand(0);
45758 EVT SrcVT = Src.getValueType();
45759 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45760 return DAG.getBitcast(VT, Src);
45761 break;
45762 }
45763 case ISD::Constant: {
45764 auto *C = cast<ConstantSDNode>(V);
45765 if (C->isZero())
45766 return DAG.getConstant(0, DL, VT);
45767 if (C->isAllOnes())
45768 return DAG.getAllOnesConstant(DL, VT);
45769 break;
45770 }
45771 case ISD::TRUNCATE: {
45772 // If we find a suitable source, a truncated scalar becomes a subvector.
45773 SDValue Src = V.getOperand(0);
45774 EVT NewSrcVT =
45775 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45776 if (TLI.isTypeLegal(NewSrcVT))
45777 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45778 Subtarget, Depth + 1))
45779 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45780 DAG.getVectorIdxConstant(0, DL));
45781 break;
45782 }
45783 case ISD::ANY_EXTEND:
45784 case ISD::ZERO_EXTEND: {
45785 // If we find a suitable source, an extended scalar becomes a subvector.
45786 SDValue Src = V.getOperand(0);
45787 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45788 Src.getScalarValueSizeInBits());
45789 if (TLI.isTypeLegal(NewSrcVT))
45790 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45791 Subtarget, Depth + 1))
45792 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45793 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45794 : DAG.getConstant(0, DL, VT),
45795 N0, DAG.getVectorIdxConstant(0, DL));
45796 break;
45797 }
45798 case ISD::OR:
45799 case ISD::XOR: {
45800 // If we find suitable sources, we can just move the op to the vector
45801 // domain.
45802 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45803 Subtarget, Depth + 1))
45804 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45805 Subtarget, Depth + 1))
45806 return DAG.getNode(Opc, DL, VT, N0, N1);
45807 break;
45808 }
45809 case ISD::SHL: {
45810 // If we find a suitable source, a SHL becomes a KSHIFTL.
45811 SDValue Src0 = V.getOperand(0);
45812 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45813 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45814 break;
45815
45816 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45817 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45818 Depth + 1))
45819 return DAG.getNode(
45820 X86ISD::KSHIFTL, DL, VT, N0,
45821 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45822 break;
45823 }
45824 }
45825
45826 // Does the inner bitcast already exist?
45827 if (Depth > 0)
45828 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45829 return SDValue(Alt, 0);
45830
45831 return SDValue();
45832}
45833
45834 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45835 TargetLowering::DAGCombinerInfo &DCI,
45836 const X86Subtarget &Subtarget) {
45837 SDValue N0 = N->getOperand(0);
45838 EVT VT = N->getValueType(0);
45839 EVT SrcVT = N0.getValueType();
45840 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45841
45842 // Try to match patterns such as
45843 // (i16 bitcast (v16i1 x))
45844 // ->
45845 // (i16 movmsk (16i8 sext (v16i1 x)))
45846 // before the setcc result is scalarized on subtargets that don't have legal
45847 // vxi1 types.
45848 if (DCI.isBeforeLegalize()) {
45849 SDLoc dl(N);
45850 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45851 return V;
45852
45853 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45854 // type, widen both sides to avoid a trip through memory.
45855 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45856 Subtarget.hasAVX512()) {
45857 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45858 N0 = DAG.getBitcast(MVT::v8i1, N0);
45859 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45860 DAG.getVectorIdxConstant(0, dl));
45861 }
45862
45863 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45864 // type, widen both sides to avoid a trip through memory.
45865 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45866 Subtarget.hasAVX512()) {
45867 // Use zeros for the widening if we already have some zeroes. This can
45868 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45869 // stream of this.
45870 // FIXME: It might make sense to detect a concat_vectors with a mix of
45871 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45872 // a separate combine. What we can't do is canonicalize the operands of
45873 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45874 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45875 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45876 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45877 SrcVT = LastOp.getValueType();
45878 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45879 SmallVector<SDValue, 4> Ops(N0->ops());
45880 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45881 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45882 N0 = DAG.getBitcast(MVT::i8, N0);
45883 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45884 }
45885 }
45886
45887 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45888 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45889 Ops[0] = N0;
45890 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45891 N0 = DAG.getBitcast(MVT::i8, N0);
45892 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45893 }
45894 } else if (DCI.isAfterLegalizeDAG()) {
45895 // If we're bitcasting from iX to vXi1, see if the integer originally
45896 // began as a vXi1 and whether we can remove the bitcast entirely.
45897 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45898 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45899 if (SDValue V =
45900 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45901 return V;
45902 }
45903 }
45904
45905 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45906 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45907 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45908 // we can help with known bits propagation from the vXi1 domain to the
45909 // scalar domain.
45910 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45911 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45912 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45913 isNullConstant(N0.getOperand(1)))
45914 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45915 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45916
45917 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45918 // and the vbroadcast_load are both integer or both fp. In some cases this
45919 // will remove the bitcast entirely.
45920 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45921 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45922 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45923 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45924 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45925 // Don't swap i8/i16 since we don't have fp types of that size.
45926 if (MemSize >= 32) {
45927 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45928 : MVT::getIntegerVT(MemSize);
45929 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45930 : MVT::getIntegerVT(SrcVTSize);
45931 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45932
45933 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45934 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45935 SDValue ResNode =
45936 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45937 MemVT, BCast->getMemOperand());
45938 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45939 return DAG.getBitcast(VT, ResNode);
45940 }
45941 }
45942
45943 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45944 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45945 SDValue Src = peekThroughTruncates(N0);
45946 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45947 Src.getOperand(0).getValueSizeInBits() == 128 &&
45948 isNullConstant(Src.getOperand(1))) {
45949 SDLoc DL(N);
45950 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45951 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45952 DAG.getVectorIdxConstant(0, DL));
45953 }
45954 }
45955
45956 // Since MMX types are special and don't usually play with other vector types,
45957 // it's better to handle them early to be sure we emit efficient code by
45958 // avoiding store-load conversions.
45959 if (VT == MVT::x86mmx) {
45960 // Detect MMX constant vectors.
45961 APInt UndefElts;
45962 SmallVector<APInt, 1> EltBits;
45963 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45964 /*AllowWholeUndefs*/ true,
45965 /*AllowPartialUndefs*/ true)) {
45966 SDLoc DL(N0);
45967 // Handle zero-extension of i32 with MOVD.
45968 if (EltBits[0].countl_zero() >= 32)
45969 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45970 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45971 // Else, bitcast to a double.
45972 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45973 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45974 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45975 }
45976
45977 // Detect bitcasts to x86mmx low word.
45978 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45979 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45980 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45981 bool LowUndef = true, AllUndefOrZero = true;
45982 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45983 SDValue Op = N0.getOperand(i);
45984 LowUndef &= Op.isUndef() || (i >= e/2);
45985 AllUndefOrZero &= isNullConstantOrUndef(Op);
45986 }
45987 if (AllUndefOrZero) {
45988 SDValue N00 = N0.getOperand(0);
45989 SDLoc dl(N00);
45990 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45991 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45992 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45993 }
45994 }
45995
45996 // Detect bitcasts of 64-bit build vectors and convert to a
45997 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45998 // lowest element.
45999 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
46000 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
46001 SrcVT == MVT::v8i8))
46002 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
46003
46004 // Detect bitcasts between element or subvector extraction to x86mmx.
46005 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
46006 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
46007 isNullConstant(N0.getOperand(1))) {
46008 SDValue N00 = N0.getOperand(0);
46009 if (N00.getValueType().is128BitVector())
46010 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
46011 DAG.getBitcast(MVT::v2i64, N00));
46012 }
46013
46014 // Detect bitcasts from FP_TO_SINT to x86mmx.
46015 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46016 SDLoc DL(N0);
46017 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46018 DAG.getUNDEF(MVT::v2i32));
46019 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46020 DAG.getBitcast(MVT::v2i64, Res));
46021 }
46022 }
46023
46024 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46025 // most of these to scalar anyway.
46026 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46027 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46028 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46029 return combinevXi1ConstantToInteger(N0, DAG);
46030 }
46031
46032 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46033 VT.getVectorElementType() == MVT::i1) {
46034 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46035 if (C->isAllOnes())
46036 return DAG.getConstant(1, SDLoc(N0), VT);
46037 if (C->isZero())
46038 return DAG.getConstant(0, SDLoc(N0), VT);
46039 }
46040 }
46041
46042 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46043 // Turn it into a sign bit compare that produces a k-register. This avoids
46044 // a trip through a GPR.
46045 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46046 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46047 isPowerOf2_32(VT.getVectorNumElements())) {
46048 unsigned NumElts = VT.getVectorNumElements();
46049 SDValue Src = N0;
46050
46051 // Peek through truncate.
46052 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46053 Src = N0.getOperand(0);
46054
46055 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46056 SDValue MovmskIn = Src.getOperand(0);
46057 MVT MovmskVT = MovmskIn.getSimpleValueType();
46058 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46059
46060 // We allow extra bits of the movmsk to be used since they are known zero.
46061 // We can't convert a VPMOVMSKB without avx512bw.
46062 if (MovMskElts <= NumElts &&
46063 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46064 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46065 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46066 SDLoc dl(N);
46067 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46068 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46069 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46070 if (EVT(CmpVT) == VT)
46071 return Cmp;
46072
46073 // Pad with zeroes up to original VT to replace the zeroes that were
46074 // being used from the MOVMSK.
46075 unsigned NumConcats = NumElts / MovMskElts;
46076 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46077 Ops[0] = Cmp;
46078 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46079 }
46080 }
46081 }
46082
46083 // Try to remove bitcasts from input and output of mask arithmetic to
46084 // remove GPR<->K-register crossings.
46085 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46086 return V;
46087
46088 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46089 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46090 SrcVT.getVectorNumElements() == 1)
46091 return N0.getOperand(1);
46092
46093 // Convert a bitcasted integer logic operation that has one bitcasted
46094 // floating-point operand into a floating-point logic operation. This may
46095 // create a load of a constant, but that is cheaper than materializing the
46096 // constant in an integer register and transferring it to an SSE register or
46097 // transferring the SSE operand to integer register and back.
46098 unsigned FPOpcode;
46099 switch (N0.getOpcode()) {
46100 // clang-format off
46101 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46102 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46103 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46104 default: return SDValue();
46105 // clang-format on
46106 }
46107
46108 // Check if we have a bitcast from another integer type as well.
46109 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46110 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46111 (Subtarget.hasFP16() && VT == MVT::f16) ||
46112 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46113 TLI.isTypeLegal(VT))))
46114 return SDValue();
46115
46116 SDValue LogicOp0 = N0.getOperand(0);
46117 SDValue LogicOp1 = N0.getOperand(1);
46118 SDLoc DL0(N0);
46119
46120 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46121 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46122 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46123 LogicOp0.getOperand(0).getValueType() == VT &&
46124 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46125 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46126 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46127 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46128 }
46129 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46130 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46131 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46132 LogicOp1.getOperand(0).getValueType() == VT &&
46133 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46134 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46135 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46136 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46137 }
46138
46139 return SDValue();
46140}
46141
46142 // (mul (zext a), (sext b))
46143static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46144 SDValue &Op1) {
46145 Op0 = Mul.getOperand(0);
46146 Op1 = Mul.getOperand(1);
46147
46148 // The sign-extended operand should be Op1.
46149 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46150 std::swap(Op0, Op1);
46151
46152 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46153 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46154 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46155 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46156 return true;
46157
46158 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46159 return (BV && BV->isConstant());
46160 };
46161
46162 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46163 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46164 // signed value, so we just check its significant bits.
46165 if ((IsFreeTruncation(Op0) &&
46166 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46167 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46168 return true;
46169
46170 return false;
46171}
46172
46173 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46174 unsigned &LogBias, const SDLoc &DL,
46175 const X86Subtarget &Subtarget) {
46176 // Extend or truncate to MVT::i8 first.
46177 MVT Vi8VT =
46178 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46179 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46180 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46181
46182 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46183 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46184 // The src A, B element type is i8, but the dst C element type is i32.
46185 // When we calculate the reduce stage, we use src vector type vXi8 for it
46186 // so we need logbias 2 to avoid extra 2 stages.
46187 LogBias = 2;
46188
46189 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46190 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46191 RegSize = std::max(512u, RegSize);
46192
46193 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46194 // fill in the missing vector elements with 0.
46195 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46196 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46197 Ops[0] = LHS;
46198 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46199 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46200 Ops[0] = RHS;
46201 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46202
46203 // Actually build the DotProduct, split as 256/512 bits for
46204 // AVXVNNI/AVX512VNNI.
46205 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46206 ArrayRef<SDValue> Ops) {
46207 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46208 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46209 };
46210 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46211 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46212
46213 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46214 DpBuilder, false);
46215}
46216
46217// Create a PSADBW given two sources representable as zexts of vXi8.
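// PSADBW computes, per 64-bit lane, the sum of absolute differences of eight
// unsigned byte pairs, zero-extended to 64 bits, which reduces the widened
// vXi8 inputs to a handful of horizontal sums.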
46218 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46219 const SDLoc &DL, const X86Subtarget &Subtarget) {
46220 // Find the appropriate width for the PSADBW.
46221 EVT DstVT = N0.getValueType();
46222 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46223 DstVT.getVectorElementCount());
46224 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46225
46226 // Widen the vXi8 vectors, padding with zero vector elements.
46227 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46228 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46229 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46230 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46231 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46232 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46233 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46234
46235 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46236 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46237 ArrayRef<SDValue> Ops) {
46238 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46239 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46240 };
46241 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46242 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46243 PSADBWBuilder);
46244}
46245
46246 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46247// PHMINPOSUW.
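// PHMINPOSUW returns the minimum unsigned i16 element of a v8i16 vector in
// element 0 (with its index in the next word). SMIN/SMAX/UMAX reductions are
// mapped onto it by XORing with a mask that flips them into a UMIN problem.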
46248 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46249 const X86Subtarget &Subtarget) {
46250 // Bail without SSE41.
46251 if (!Subtarget.hasSSE41())
46252 return SDValue();
46253
46254 EVT ExtractVT = Extract->getValueType(0);
46255 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46256 return SDValue();
46257
46258 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46259 ISD::NodeType BinOp;
46260 SDValue Src = DAG.matchBinOpReduction(
46261 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46262 if (!Src)
46263 return SDValue();
46264
46265 EVT SrcVT = Src.getValueType();
46266 EVT SrcSVT = SrcVT.getScalarType();
46267 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46268 return SDValue();
46269
46270 SDLoc DL(Extract);
46271 SDValue MinPos = Src;
46272
46273 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46274 while (SrcVT.getSizeInBits() > 128) {
46275 SDValue Lo, Hi;
46276 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46277 SrcVT = Lo.getValueType();
46278 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46279 }
46280 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46281 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46282 "Unexpected value type");
46283
46284 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46285 // to flip the value accordingly.
46286 SDValue Mask;
46287 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46288 if (BinOp == ISD::SMAX)
46289 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46290 else if (BinOp == ISD::SMIN)
46291 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46292 else if (BinOp == ISD::UMAX)
46293 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46294
46295 if (Mask)
46296 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46297
46298 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46299 // shuffling each upper element down and inserting zeros. This means that the
46300 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46301 // ready for the PHMINPOS.
46302 if (ExtractVT == MVT::i8) {
46303 SDValue Upper = DAG.getVectorShuffle(
46304 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46305 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46306 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46307 }
46308
46309 // Perform the PHMINPOS on a v8i16 vector,
46310 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46311 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46312 MinPos = DAG.getBitcast(SrcVT, MinPos);
46313
46314 if (Mask)
46315 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46316
46317 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46318 DAG.getVectorIdxConstant(0, DL));
46319}
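// Illustrative note (annotation, not part of the lowering): PHMINPOSUW only
// performs an unsigned v8i16 minimum, so the XOR masks above remap the other
// reductions onto it. E.g. for an SMAX reduction of {3, -1, 7}, XOR with
// 0x7FFF gives {0x7FFC, 0x8000, 0x7FF8}, whose unsigned minimum is 0x7FF8;
// XORing that with 0x7FFF again recovers the signed maximum 7.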
46320
46321// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46322 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46323 const X86Subtarget &Subtarget) {
46324 // Bail without SSE2.
46325 if (!Subtarget.hasSSE2())
46326 return SDValue();
46327
46328 EVT ExtractVT = Extract->getValueType(0);
46329 unsigned BitWidth = ExtractVT.getSizeInBits();
46330 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46331 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46332 return SDValue();
46333
46334 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46335 ISD::NodeType BinOp;
46336 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46337 if (!Match && ExtractVT == MVT::i1)
46338 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46339 if (!Match)
46340 return SDValue();
46341
46342 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46343 // which we can't support here for now.
46344 if (Match.getScalarValueSizeInBits() != BitWidth)
46345 return SDValue();
46346
46347 SDValue Movmsk;
46348 SDLoc DL(Extract);
46349 EVT MatchVT = Match.getValueType();
46350 unsigned NumElts = MatchVT.getVectorNumElements();
46351 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46352 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46353 LLVMContext &Ctx = *DAG.getContext();
46354
46355 if (ExtractVT == MVT::i1) {
46356 // Special case for (pre-legalization) vXi1 reductions.
46357 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46358 return SDValue();
46359 if (Match.getOpcode() == ISD::SETCC) {
46360 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46361 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46362 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46363 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46364 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46365 X86::CondCode X86CC;
46366 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46367 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46368 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46369 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46370 DAG, X86CC))
46371 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46372 getSETCC(X86CC, V, DL, DAG));
46373 }
46374 }
46375 if (TLI.isTypeLegal(MatchVT)) {
46376 // If this is a legal AVX512 predicate type then we can just bitcast.
46377 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46378 Movmsk = DAG.getBitcast(MovmskVT, Match);
46379 } else {
46380 // Use combineBitcastvxi1 to create the MOVMSK.
46381 while (NumElts > MaxElts) {
46382 SDValue Lo, Hi;
46383 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46384 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46385 NumElts /= 2;
46386 }
46387 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46388 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46389 }
46390 if (!Movmsk)
46391 return SDValue();
46392 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46393 } else {
46394 // FIXME: Better handling of k-registers or 512-bit vectors?
46395 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46396 if (!(MatchSizeInBits == 128 ||
46397 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46398 return SDValue();
46399
46400 // Make sure this isn't a vector of 1 element. The perf win from using
46401 // MOVMSK diminishes with fewer elements in the reduction, but it is
46402 // generally better to get the comparison over to the GPRs as soon as
46403 // possible to reduce the number of vector ops.
46404 if (Match.getValueType().getVectorNumElements() < 2)
46405 return SDValue();
46406
46407 // Check that we are extracting a reduction of all sign bits.
46408 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46409 return SDValue();
46410
46411 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46412 SDValue Lo, Hi;
46413 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46414 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46415 MatchSizeInBits = Match.getValueSizeInBits();
46416 }
46417
46418 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46419 MVT MaskSrcVT;
46420 if (64 == BitWidth || 32 == BitWidth)
46421 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46422 MatchSizeInBits / BitWidth);
46423 else
46424 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46425
46426 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46427 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46428 NumElts = MaskSrcVT.getVectorNumElements();
46429 }
46430 assert((NumElts <= 32 || NumElts == 64) &&
46431 "Not expecting more than 64 elements");
46432
46433 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46434 if (BinOp == ISD::XOR) {
46435 // parity -> (PARITY(MOVMSK X))
46436 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46437 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46438 }
46439
46440 SDValue CmpC;
46441 ISD::CondCode CondCode;
46442 if (BinOp == ISD::OR) {
46443 // any_of -> MOVMSK != 0
46444 CmpC = DAG.getConstant(0, DL, CmpVT);
46445 CondCode = ISD::CondCode::SETNE;
46446 } else {
46447 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46448 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46449 DL, CmpVT);
46450 CondCode = ISD::CondCode::SETEQ;
46451 }
46452
46453 // The setcc produces an i8 of 0/1, so extend that to the result width and
46454 // negate to get the final 0/-1 mask value.
46455 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46456 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46457 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46458 return DAG.getNegative(Zext, DL, ExtractVT);
46459}
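// Illustrative note (annotation, not part of the lowering): for a v4i32
// all_of/any_of reduction of sign-bit masks such as {-1, -1, 0, -1}, MOVMSKPS
// collects the four sign bits into 0b1011, so all_of compares against 0b1111
// (false here), any_of compares against 0 (true here), and a parity reduction
// is just PARITY of the same mask.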
46460
46461 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46462 const X86Subtarget &Subtarget) {
46463 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46464 return SDValue();
46465
46466 EVT ExtractVT = Extract->getValueType(0);
46467 // Verify the type we're extracting is i32, as the output element type of
46468 // vpdpbusd is i32.
46469 if (ExtractVT != MVT::i32)
46470 return SDValue();
46471
46472 EVT VT = Extract->getOperand(0).getValueType();
46473 if (!isPowerOf2_32(VT.getVectorNumElements()))
46474 return SDValue();
46475
46476 // Match shuffle + add pyramid.
46477 ISD::NodeType BinOp;
46478 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46479
46480 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46481 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46482 // before adding into the accumulator.
46483 // TODO:
46484 // We also need to verify that the multiply has at least 2x the number of bits
46485 // of the input. We shouldn't match
46486 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46487 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46488 // Root = Root.getOperand(0);
46489
46490 // If there was a match, we want Root to be a mul.
46491 if (!Root || Root.getOpcode() != ISD::MUL)
46492 return SDValue();
46493
46494 // Check whether we have an extend and mul pattern
46495 SDValue LHS, RHS;
46496 if (!detectExtMul(DAG, Root, LHS, RHS))
46497 return SDValue();
46498
46499 // Create the dot product instruction.
46500 SDLoc DL(Extract);
46501 unsigned StageBias;
46502 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46503
46504 // If the original vector was wider than 4 elements, sum over the results
46505 // in the DP vector.
46506 unsigned Stages = Log2_32(VT.getVectorNumElements());
46507 EVT DpVT = DP.getValueType();
46508
46509 if (Stages > StageBias) {
46510 unsigned DpElems = DpVT.getVectorNumElements();
46511
46512 for (unsigned i = Stages - StageBias; i > 0; --i) {
46513 SmallVector<int, 16> Mask(DpElems, -1);
46514 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46515 Mask[j] = MaskEnd + j;
46516
46517 SDValue Shuffle =
46518 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46519 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46520 }
46521 }
46522
46523 // Return the lowest ExtractSizeInBits bits.
46524 EVT ResVT =
46525 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46526 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46527 DP = DAG.getBitcast(ResVT, DP);
46528 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46529 Extract->getOperand(1));
46530}
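// Illustrative note (annotation, not part of the lowering): VPDPBUSD
// multiplies unsigned bytes from one source with signed bytes from the other,
// sums each group of 4 adjacent products and accumulates the sum into an i32
// lane. E.g. bytes {1,2,3,4} (unsigned) and {5,-6,7,8} (signed) contribute
// 1*5 + 2*(-6) + 3*7 + 4*8 = 46 to their lane, which is why the operand
// extension kinds are checked via detectExtMul above.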
46531
46532 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46533 const X86Subtarget &Subtarget) {
46534 using namespace SDPatternMatch;
46535
46536 // PSADBW is only supported on SSE2 and up.
46537 if (!Subtarget.hasSSE2())
46538 return SDValue();
46539
46540 EVT ExtractVT = Extract->getValueType(0);
46541 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46542 ExtractVT != MVT::i64)
46543 return SDValue();
46544
46545 EVT VT = Extract->getOperand(0).getValueType();
46546 if (!isPowerOf2_32(VT.getVectorNumElements()))
46547 return SDValue();
46548
46549 // Match shuffle + add pyramid.
46550 ISD::NodeType BinOp;
46551 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46552 if (!Root)
46553 return SDValue();
46554
46555 // The operand is expected to be zero extended from i8.
46556 // In order to convert to i64 and above, an additional any/zero/sign
46557 // extend is expected.
46558 // The zero extend from 32 bits has no mathematical effect on the result.
46559 // Also, the sign extend is effectively a zero extend
46560 // (it extends the sign bit, which is zero).
46561 // So it is correct to skip the sign/zero extend instruction.
46562 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46563 Root.getOpcode() == ISD::ZERO_EXTEND ||
46564 Root.getOpcode() == ISD::ANY_EXTEND)
46565 Root = Root.getOperand(0);
46566
46567 // Check whether we have a vXi8 abdu pattern.
46568 // TODO: Just match ISD::ABDU once the DAG is topologically sorted.
46569 SDValue Src0, Src1;
46570 if (!sd_match(
46571 Root,
46572 m_AnyOf(
46573 m_SpecificVectorElementVT(
46574 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46575 m_SpecificVectorElementVT(
46576 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46577 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46578 m_Abs(
46579 m_Sub(m_AllOf(m_Value(Src0),
46580 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46581 m_AllOf(m_Value(Src1),
46582 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46583 return SDValue();
46584
46585 // Create the SAD instruction.
46586 SDLoc DL(Extract);
46587 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46588
46589 // If the original vector was wider than 8 elements, sum over the results
46590 // in the SAD vector.
46591 unsigned Stages = Log2_32(VT.getVectorNumElements());
46592 EVT SadVT = SAD.getValueType();
46593 if (Stages > 3) {
46594 unsigned SadElems = SadVT.getVectorNumElements();
46595
46596 for (unsigned i = Stages - 3; i > 0; --i) {
46597 SmallVector<int, 16> Mask(SadElems, -1);
46598 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46599 Mask[j] = MaskEnd + j;
46600
46601 SDValue Shuffle =
46602 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46603 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46604 }
46605 }
46606
46607 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46608 // Return the lowest ExtractSizeInBits bits.
46609 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46610 SadVT.getSizeInBits() / ExtractSizeInBits);
46611 SAD = DAG.getBitcast(ResVT, SAD);
46612 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46613 Extract->getOperand(1));
46614}
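// Illustrative note (annotation, not part of the lowering): the matched
// reduction is sum(|a[i] - b[i]|) over vXi8 inputs. E.g. for a = {10, 3} and
// b = {4, 9} (remaining lanes zero), PSADBW produces |10-4| + |3-9| = 12 in
// its low i64 lane, and the shuffle+add pyramid above folds multiple PSADBW
// lanes together when the source vector is wider than 8 elements.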
46615
46616// If this extract is from a loaded vector value and will be used as an
46617// integer, that requires a potentially expensive XMM -> GPR transfer.
46618// Additionally, if we can convert to a scalar integer load, that will likely
46619// be folded into a subsequent integer op.
46620// Note: SrcVec might not have a VecVT type, but it must be the same size.
46621// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46622// to a single-use of the loaded vector. For the reasons above, we
46623// expect this to be profitable even if it creates an extra load.
46624static SDValue
46625 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46626 const SDLoc &dl, SelectionDAG &DAG,
46627 TargetLowering::DAGCombinerInfo &DCI) {
46628 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46629 "Only EXTRACT_VECTOR_ELT supported so far");
46630
46631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46632 EVT VT = N->getValueType(0);
46633
46634 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46635 return Use->getOpcode() == ISD::STORE ||
46636 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46637 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46638 });
46639
46640 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46641 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46642 VecVT.getVectorElementType() == VT &&
46643 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46644 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46645 SDValue NewPtr = TLI.getVectorElementPointer(
46646 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46647 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46648 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46649 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46650 SDValue Load =
46651 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46652 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46653 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46654 return Load;
46655 }
46656
46657 return SDValue();
46658}
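// Illustrative note (annotation, not part of the lowering): the fold above
// turns e.g. (i32 (extract_vector_elt (v4i32 (load addr)), 2)) into
// (i32 (load addr+8)), trading an XMM->GPR transfer for a scalar load that
// can usually be folded into the consuming integer instruction.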
46659
46660// Attempt to peek through a target shuffle and extract the scalar from the
46661// source.
46662 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46663 TargetLowering::DAGCombinerInfo &DCI,
46664 const X86Subtarget &Subtarget) {
46665 if (DCI.isBeforeLegalizeOps())
46666 return SDValue();
46667
46668 SDLoc dl(N);
46669 SDValue Src = N->getOperand(0);
46670 SDValue Idx = N->getOperand(1);
46671
46672 EVT VT = N->getValueType(0);
46673 EVT SrcVT = Src.getValueType();
46674 EVT SrcSVT = SrcVT.getVectorElementType();
46675 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46676 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46677
46678 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46679 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46680 return SDValue();
46681
46682 const APInt &IdxC = N->getConstantOperandAPInt(1);
46683 if (IdxC.uge(NumSrcElts))
46684 return SDValue();
46685
46686 SDValue SrcBC = peekThroughBitcasts(Src);
46687
46688 // Handle extract(bitcast(broadcast(scalar_value))).
46689 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46690 SDValue SrcOp = SrcBC.getOperand(0);
46691 EVT SrcOpVT = SrcOp.getValueType();
46692 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46693 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46694 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46695 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46696 // TODO support non-zero offsets.
46697 if (Offset == 0) {
46698 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46699 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46700 return SrcOp;
46701 }
46702 }
46703 }
46704
46705 // If we're extracting a single element from a broadcast load and there are
46706 // no other users, just create a single load.
46707 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46708 SrcBC.hasOneUse()) {
46709 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46710 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46711 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46712 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46713 SDValue Load =
46714 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46715 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46716 MemIntr->getMemOperand()->getFlags());
46717 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46718 return Load;
46719 }
46720 }
46721
46722 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46723 // TODO: Move to DAGCombine?
46724 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46725 SrcBC.getValueType().isInteger() &&
46726 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46727 SrcBC.getScalarValueSizeInBits() ==
46728 SrcBC.getOperand(0).getValueSizeInBits()) {
46729 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46730 if (IdxC.ult(Scale)) {
46731 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46732 SDValue Scl = SrcBC.getOperand(0);
46733 EVT SclVT = Scl.getValueType();
46734 if (Offset) {
46735 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46736 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46737 }
46738 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46739 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46740 return Scl;
46741 }
46742 }
46743
46744 // Handle extract(truncate(x)) for 0'th index.
46745 // TODO: Treat this as a faux shuffle?
46746 // TODO: When can we use this for general indices?
46747 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46748 (SrcVT.getSizeInBits() % 128) == 0) {
46749 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46750 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46751 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46752 Idx);
46753 }
46754
46755 // We can only legally extract other elements from 128-bit vectors and in
46756 // certain circumstances, depending on SSE-level.
46757 // TODO: Investigate float/double extraction if it will be just stored.
46758 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46759 unsigned Idx) {
46760 EVT VecSVT = VecVT.getScalarType();
46761 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46762 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46763 VecSVT == MVT::i64)) {
46764 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46765 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46766 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46767 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46768 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46769 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46770 Idx &= (NumEltsPerLane - 1);
46771 }
46772 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46773 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46774 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46775 DAG.getBitcast(VecVT, Vec),
46776 DAG.getVectorIdxConstant(Idx, dl));
46777 }
46778 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46779 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46780 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46781 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46782 DAG.getTargetConstant(Idx, dl, MVT::i8));
46783 }
46784 return SDValue();
46785 };
46786
46787 // Resolve the target shuffle inputs and mask.
46788 SmallVector<int, 16> Mask;
46789 SmallVector<SDValue, 2> Ops;
46790 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46791 return SDValue();
46792
46793 // Shuffle inputs must be the same size as the result.
46794 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46795 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46796 }))
46797 return SDValue();
46798
46799 // Attempt to narrow/widen the shuffle mask to the correct size.
46800 if (Mask.size() != NumSrcElts) {
46801 if ((NumSrcElts % Mask.size()) == 0) {
46802 SmallVector<int, 16> ScaledMask;
46803 int Scale = NumSrcElts / Mask.size();
46804 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46805 Mask = std::move(ScaledMask);
46806 } else if ((Mask.size() % NumSrcElts) == 0) {
46807 // Simplify Mask based on demanded element.
46808 int ExtractIdx = (int)IdxC.getZExtValue();
46809 int Scale = Mask.size() / NumSrcElts;
46810 int Lo = Scale * ExtractIdx;
46811 int Hi = Scale * (ExtractIdx + 1);
46812 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46813 if (i < Lo || Hi <= i)
46814 Mask[i] = SM_SentinelUndef;
46815
46816 SmallVector<int, 16> WidenedMask;
46817 while (Mask.size() > NumSrcElts &&
46818 canWidenShuffleElements(Mask, WidenedMask))
46819 Mask = std::move(WidenedMask);
46820 }
46821 }
46822
46823 // If narrowing/widening failed, see if we can extract+zero-extend.
46824 int ExtractIdx;
46825 EVT ExtractVT;
46826 if (Mask.size() == NumSrcElts) {
46827 ExtractIdx = Mask[IdxC.getZExtValue()];
46828 ExtractVT = SrcVT;
46829 } else {
46830 unsigned Scale = Mask.size() / NumSrcElts;
46831 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46832 return SDValue();
46833 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46834 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46835 return SDValue();
46836 ExtractIdx = Mask[ScaledIdx];
46837 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46838 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46839 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46840 "Failed to widen vector type");
46841 }
46842
46843 // If the shuffle source element is undef/zero then we can just accept it.
46844 if (ExtractIdx == SM_SentinelUndef)
46845 return DAG.getUNDEF(VT);
46846
46847 if (ExtractIdx == SM_SentinelZero)
46848 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46849 : DAG.getConstant(0, dl, VT);
46850
46851 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46852 ExtractIdx = ExtractIdx % Mask.size();
46853 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46854 return DAG.getZExtOrTrunc(V, dl, VT);
46855
46856 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46857 if (SDValue V = combineExtractFromVectorLoad(
46858 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46859 return V;
46860
46861 return SDValue();
46862}
46863
46864/// Extracting a scalar FP value from vector element 0 is free, so extract each
46865/// operand first, then perform the math as a scalar op.
46866 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46867 const X86Subtarget &Subtarget,
46868 TargetLowering::DAGCombinerInfo &DCI) {
46869 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46870 SDValue Vec = ExtElt->getOperand(0);
46871 SDValue Index = ExtElt->getOperand(1);
46872 EVT VT = ExtElt->getValueType(0);
46873 EVT VecVT = Vec.getValueType();
46874
46875 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46876 // non-zero element because the shuffle+scalar op will be cheaper?
46877 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46878 return SDValue();
46879
46880 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46881 // extract, the condition code), so deal with those as a special-case.
46882 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46883 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46884 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46885 return SDValue();
46886
46887 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46888 SDLoc DL(ExtElt);
46889 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46890 Vec.getOperand(0), Index);
46891 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46892 Vec.getOperand(1), Index);
46893 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46894 }
46895
46896 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46897 VT != MVT::f64)
46898 return SDValue();
46899
46900 // Vector FP selects don't fit the pattern of FP math ops (because the
46901 // condition has a different type and we have to change the opcode), so deal
46902 // with those here.
46903 // FIXME: This is restricted to pre type legalization. If we loosen this we
46904 // need to convert vector bool to a scalar bool.
46905 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46906 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46907 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46908 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46909 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46910 SDLoc DL(ExtElt);
46913 Vec.getOperand(0), Index);
46914 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46915 Vec.getOperand(1), Index);
46916 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46917 Vec.getOperand(2), Index);
46918 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46919 }
46920
46921 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46922 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46923 // missed load folding and fma+fneg combining.
46924 switch (Vec.getOpcode()) {
46925 case ISD::FMA: // Begin 3 operands
46926 case ISD::FMAD:
46927 case ISD::FADD: // Begin 2 operands
46928 case ISD::FSUB:
46929 case ISD::FMUL:
46930 case ISD::FDIV:
46931 case ISD::FREM:
46932 case ISD::FCOPYSIGN:
46933 case ISD::FMINNUM:
46934 case ISD::FMAXNUM:
46935 case ISD::FMINNUM_IEEE:
46936 case ISD::FMAXNUM_IEEE:
46937 case ISD::FMAXIMUM:
46938 case ISD::FMINIMUM:
46939 case ISD::FMAXIMUMNUM:
46940 case ISD::FMINIMUMNUM:
46941 case X86ISD::FMAX:
46942 case X86ISD::FMIN:
46943 case ISD::FABS: // Begin 1 operand
46944 case ISD::FSQRT:
46945 case ISD::FRINT:
46946 case ISD::FCEIL:
46947 case ISD::FTRUNC:
46948 case ISD::FNEARBYINT:
46949 case ISD::FROUNDEVEN:
46950 case ISD::FROUND:
46951 case ISD::FFLOOR:
46952 case X86ISD::FRCP:
46953 case X86ISD::FRSQRT: {
46954 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46955 SDLoc DL(ExtElt);
46956 SmallVector<SDValue, 4> ExtOps;
46957 for (SDValue Op : Vec->ops())
46958 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46959 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46960 }
46961 default:
46962 return SDValue();
46963 }
46964 llvm_unreachable("All opcodes should return within switch");
46965}
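// Illustrative note (annotation, not part of the lowering): extracting lane 0
// of an XMM register is free, so e.g.
//   (f32 (extract_vector_elt (fadd V0, V1), 0))
// becomes (fadd (extract_vector_elt V0, 0), (extract_vector_elt V1, 0)),
// letting the math run as a scalar op when only lane 0 is ever used.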
46966
46967/// Try to convert a vector reduction sequence composed of binops and shuffles
46968/// into horizontal ops.
46969 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46970 const X86Subtarget &Subtarget) {
46971 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46972
46973 // We need at least SSE2 to do anything here.
46974 if (!Subtarget.hasSSE2())
46975 return SDValue();
46976
46977 ISD::NodeType Opc;
46978 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46979 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46980 if (!Rdx)
46981 return SDValue();
46982
46983 SDValue Index = ExtElt->getOperand(1);
46984 assert(isNullConstant(Index) &&
46985 "Reduction doesn't end in an extract from index 0");
46986
46987 EVT VT = ExtElt->getValueType(0);
46988 EVT VecVT = Rdx.getValueType();
46989 if (VecVT.getScalarType() != VT)
46990 return SDValue();
46991
46992 SDLoc DL(ExtElt);
46993 unsigned NumElts = VecVT.getVectorNumElements();
46994 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46995
46996 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46997 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46998 if (V.getValueType() == MVT::v4i8) {
46999 if (ZeroExtend && Subtarget.hasSSE41()) {
47000 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
47001 DAG.getConstant(0, DL, MVT::v4i32),
47002 DAG.getBitcast(MVT::i32, V),
47003 DAG.getVectorIdxConstant(0, DL));
47004 return DAG.getBitcast(MVT::v16i8, V);
47005 }
47006 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
47007 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
47008 : DAG.getUNDEF(MVT::v4i8));
47009 }
47010 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
47011 DAG.getUNDEF(MVT::v8i8));
47012 };
47013
47014 // vXi8 mul reduction - promote to vXi16 mul reduction.
47015 if (Opc == ISD::MUL) {
47016 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47017 return SDValue();
47018 if (VecVT.getSizeInBits() >= 128) {
47019 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47020 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47021 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47022 Lo = DAG.getBitcast(WideVT, Lo);
47023 Hi = DAG.getBitcast(WideVT, Hi);
47024 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47025 while (Rdx.getValueSizeInBits() > 128) {
47026 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47027 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47028 }
47029 } else {
47030 Rdx = WidenToV16I8(Rdx, false);
47031 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47032 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47033 }
47034 if (NumElts >= 8)
47035 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47036 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47037 {4, 5, 6, 7, -1, -1, -1, -1}));
47038 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47039 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47040 {2, 3, -1, -1, -1, -1, -1, -1}));
47041 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47042 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47043 {1, -1, -1, -1, -1, -1, -1, -1}));
47044 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47045 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47046 }
47047
47048 // vXi8 add reduction - sub 128-bit vector.
47049 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47050 Rdx = WidenToV16I8(Rdx, true);
47051 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47052 DAG.getConstant(0, DL, MVT::v16i8));
47053 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47054 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47055 }
47056
47057 // Must be a >=128-bit vector with pow2 elements.
47058 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47059 return SDValue();
47060
47061 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47062 if (VT == MVT::i8) {
47063 while (Rdx.getValueSizeInBits() > 128) {
47064 SDValue Lo, Hi;
47065 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47066 VecVT = Lo.getValueType();
47067 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47068 }
47069 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47070
47071 SDValue Hi = DAG.getVectorShuffle(
47072 MVT::v16i8, DL, Rdx, Rdx,
47073 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47074 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47075 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47076 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47077 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47078 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47079 }
47080
47081 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47082 // If the source vector values are 0-255, then we can use PSADBW to
47083 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47084 // TODO: See if its worth avoiding vXi16/i32 truncations?
47085 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47086 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47087 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47088 Subtarget.hasAVX512())) {
47089 if (Rdx.getValueType() == MVT::v8i16) {
47090 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47091 DAG.getUNDEF(MVT::v8i16));
47092 } else {
47093 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47094 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47095 if (ByteVT.getSizeInBits() < 128)
47096 Rdx = WidenToV16I8(Rdx, true);
47097 }
47098
47099 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47100 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47101 ArrayRef<SDValue> Ops) {
47102 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47103 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47104 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47105 };
47106 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47107 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47108
47109 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47110 while (Rdx.getValueSizeInBits() > 128) {
47111 SDValue Lo, Hi;
47112 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47113 VecVT = Lo.getValueType();
47114 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47115 }
47116 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47117
47118 if (NumElts > 8) {
47119 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47120 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47121 }
47122
47123 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47124 Rdx = DAG.getBitcast(VecVT, Rdx);
47125 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47126 }
47127
47128 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
47129 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47130 return SDValue();
47131
47132 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47133
47134 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47135 // across the whole vector, so we need an extract + hop preliminary stage.
47136 // This is the only step where the operands of the hop are not the same value.
47137 // TODO: We could extend this to handle 512-bit or even longer vectors.
47138 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47139 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47140 unsigned NumElts = VecVT.getVectorNumElements();
47141 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47142 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47143 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47144 VecVT = Rdx.getValueType();
47145 }
47146 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47147 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47148 return SDValue();
47149
47150 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47151 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47152 for (unsigned i = 0; i != ReductionSteps; ++i)
47153 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47154
47155 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47156}
47157
47158/// Detect vector gather/scatter index generation and convert it from being a
47159/// bunch of shuffles and extracts into a somewhat faster sequence.
47160/// For i686, the best sequence is apparently storing the value and loading
47161/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47162 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47163 TargetLowering::DAGCombinerInfo &DCI,
47164 const X86Subtarget &Subtarget) {
47165 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47166 return NewOp;
47167
47168 SDValue InputVector = N->getOperand(0);
47169 SDValue EltIdx = N->getOperand(1);
47170 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47171
47172 EVT SrcVT = InputVector.getValueType();
47173 EVT VT = N->getValueType(0);
47174 SDLoc dl(InputVector);
47175 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47176 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47177 unsigned NumEltBits = VT.getScalarSizeInBits();
47178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47179
47180 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47181 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47182
47183 // Integer Constant Folding.
47184 if (CIdx && VT.isInteger()) {
47185 APInt UndefVecElts;
47186 SmallVector<APInt, 16> EltBits;
47187 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47188 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47189 EltBits, /*AllowWholeUndefs*/ true,
47190 /*AllowPartialUndefs*/ false)) {
47191 uint64_t Idx = CIdx->getZExtValue();
47192 if (UndefVecElts[Idx])
47193 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47194 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47195 }
47196
47197 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
47198 // Improves lowering of bool masks on rust which splits them into byte array.
47199 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47200 SDValue Src = peekThroughBitcasts(InputVector);
47201 if (Src.getValueType().getScalarType() == MVT::i1 &&
47202 TLI.isTypeLegal(Src.getValueType())) {
47203 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47204 SDValue Sub = DAG.getNode(
47205 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47206 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47207 return DAG.getBitcast(VT, Sub);
47208 }
47209 }
47210 }
47211
47212 if (IsPextr) {
47213 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47214 DCI))
47215 return SDValue(N, 0);
47216
47217 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47218 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47219 InputVector.getOpcode() == X86ISD::PINSRW) &&
47220 InputVector.getOperand(2) == EltIdx) {
47221 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47222 "Vector type mismatch");
47223 SDValue Scl = InputVector.getOperand(1);
47224 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47225 return DAG.getZExtOrTrunc(Scl, dl, VT);
47226 }
47227
47228 // TODO - Remove this once we can handle the implicit zero-extension of
47229 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47230 // combineBasicSADPattern.
47231 return SDValue();
47232 }
47233
47234 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47235 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47236 InputVector.getOpcode() == ISD::BITCAST &&
47237 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47238 isNullConstant(EltIdx) && InputVector.hasOneUse())
47239 return DAG.getBitcast(VT, InputVector);
47240
47241 // Detect mmx to i32 conversion through a v2i32 elt extract.
47242 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47243 InputVector.getOpcode() == ISD::BITCAST &&
47244 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47245 isNullConstant(EltIdx) && InputVector.hasOneUse())
47246 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47247 InputVector.getOperand(0));
47248
47249 // Check whether this extract is the root of a sum of absolute differences
47250 // pattern. This has to be done here because we really want it to happen
47251 // pre-legalization.
47252 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47253 return SAD;
47254
47255 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47256 return VPDPBUSD;
47257
47258 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47259 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47260 return Cmp;
47261
47262 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47263 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47264 return MinMax;
47265
47266 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47267 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47268 return V;
47269
47270 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47271 return V;
47272
47273 if (CIdx)
47274 if (SDValue V = combineExtractFromVectorLoad(
47275 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47276 dl, DAG, DCI))
47277 return V;
47278
47279 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47280 // and then testing the relevant element.
47281 //
47282 // Note that we only combine extracts on the *same* result number, i.e.
47283 // t0 = merge_values a0, a1, a2, a3
47284 // i1 = extract_vector_elt t0, Constant:i64<2>
47285 // i1 = extract_vector_elt t0, Constant:i64<3>
47286 // but not
47287 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47288 // since the latter would need its own MOVMSK.
47289 if (SrcVT.getScalarType() == MVT::i1) {
47290 bool IsVar = !CIdx;
47291 SmallVector<SDNode *, 16> BoolExtracts;
47292 unsigned ResNo = InputVector.getResNo();
47293 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47294 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47295 Use->getOperand(0).getResNo() == ResNo &&
47296 Use->getValueType(0) == MVT::i1) {
47297 BoolExtracts.push_back(Use);
47298 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47299 return true;
47300 }
47301 return false;
47302 };
47303 // TODO: Can we drop the oneuse check for constant extracts?
47304 if (all_of(InputVector->users(), IsBoolExtract) &&
47305 (IsVar || BoolExtracts.size() > 1)) {
47306 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47307 if (SDValue BC =
47308 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47309 for (SDNode *Use : BoolExtracts) {
47310 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47311 // Mask = 1 << MaskIdx
47312 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47313 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47314 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47315 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47316 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47317 DCI.CombineTo(Use, Res);
47318 }
47319 return SDValue(N, 0);
47320 }
47321 }
47322 }
47323
47324 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47325 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47326 SDValue TruncSrc = InputVector.getOperand(0);
47327 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47328 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47329 SDValue NewExt =
47330 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47331 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47332 }
47333 }
47334
47335 return SDValue();
47336}
47337
47338// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47339// This is more or less the reverse of combineBitcastvxi1.
47340 static SDValue combineToExtendBoolVectorInReg(
47341 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47342 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47343 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47344 Opcode != ISD::ANY_EXTEND)
47345 return SDValue();
47346 if (!DCI.isBeforeLegalizeOps())
47347 return SDValue();
47348 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47349 return SDValue();
47350
47351 EVT SVT = VT.getScalarType();
47352 EVT InSVT = N0.getValueType().getScalarType();
47353 unsigned EltSizeInBits = SVT.getSizeInBits();
47354
47355 // Input type must be extending a bool vector (bit-casted from a scalar
47356 // integer) to legal integer types.
47357 if (!VT.isVector())
47358 return SDValue();
47359 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47360 return SDValue();
47361 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47362 return SDValue();
47363
47364 SDValue N00 = N0.getOperand(0);
47365 EVT SclVT = N00.getValueType();
47366 if (!SclVT.isScalarInteger())
47367 return SDValue();
47368
47369 SDValue Vec;
47370 SmallVector<int> ShuffleMask;
47371 unsigned NumElts = VT.getVectorNumElements();
47372 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47373
47374 // Broadcast the scalar integer to the vector elements.
47375 if (NumElts > EltSizeInBits) {
47376 // If the scalar integer is greater than the vector element size, then we
47377 // must split it down into sub-sections for broadcasting. For example:
47378 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47379 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47380 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47381 unsigned Scale = NumElts / EltSizeInBits;
47382 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47383 bool UseBroadcast = Subtarget.hasInt256() &&
47384 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47385 Vec = UseBroadcast
47386 ? DAG.getSplat(BroadcastVT, DL, N00)
47387 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47388 Vec = DAG.getBitcast(VT, Vec);
47389
47390 for (unsigned i = 0; i != Scale; ++i) {
47391 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47392 ShuffleMask.append(EltSizeInBits, i + Offset);
47393 }
47394 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47395 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47396 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47397 // If we have register broadcast instructions, use the scalar size as the
47398 // element type for the shuffle. Then cast to the wider element type. The
47399 // widened bits won't be used, and this might allow the use of a broadcast
47400 // load.
47401 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47402 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47403 (NumElts * EltSizeInBits) / NumElts);
47404 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47405 } else {
47406 // For smaller scalar integers, we can simply any-extend it to the vector
47407 // element size (we don't care about the upper bits) and broadcast it to all
47408 // elements.
47409 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47410 }
47411
47412 // Now, mask the relevant bit in each element.
47413 SmallVector<SDValue, 8> Bits;
47414 for (unsigned i = 0; i != NumElts; ++i) {
47415 int BitIdx = (i % EltSizeInBits);
47416 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47417 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47418 }
47419 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47420 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47421
47422 // Compare against the bitmask and extend the result.
47423 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47424 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47425 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47426
47427 // For SEXT, this is now done, otherwise shift the result down for
47428 // zero-extension.
47429 if (Opcode == ISD::SIGN_EXTEND)
47430 return Vec;
47431 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47432 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47433}
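// Illustrative note (annotation, not part of the lowering): for e.g.
// (v8i16 (sext (v8i1 (bitcast (i8 X))))) the sequence above broadcasts X to
// every lane, ANDs lane i with (1 << i) and compares for equality, so with
// X = 0b00000101 the result is {-1, 0, -1, 0, 0, 0, 0, 0}; the zero-extend
// variant additionally shifts each lane right by 15 to leave 0/1.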
47434
47435/// If both arms of a vector select are concatenated vectors, split the select,
47436/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47437/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47438/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47439 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47440 const X86Subtarget &Subtarget) {
47441 unsigned Opcode = N->getOpcode();
47442 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47443 return SDValue();
47444
47445 // TODO: Split 512-bit vectors too?
47446 EVT VT = N->getValueType(0);
47447 if (!VT.is256BitVector())
47448 return SDValue();
47449
47450 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47451 SDValue Cond = N->getOperand(0);
47452 SDValue TVal = N->getOperand(1);
47453 SDValue FVal = N->getOperand(2);
47454 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47455 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47456 return SDValue();
47457
47458 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47460 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47461 };
47462 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47463 /*CheckBWI*/ false);
47464}
47465
47466 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47467 const SDLoc &DL) {
47468 SDValue Cond = N->getOperand(0);
47469 SDValue LHS = N->getOperand(1);
47470 SDValue RHS = N->getOperand(2);
47471
47472 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47473 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47474 if (!TrueC || !FalseC)
47475 return SDValue();
47476
47477 // Don't do this for crazy integer types.
47478 EVT VT = N->getValueType(0);
47479 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47480 return SDValue();
47481
47482 // We're going to use the condition bit in math or logic ops. We could allow
47483 // this with a wider condition value (post-legalization it becomes an i8),
47484 // but if nothing is creating selects that late, it doesn't matter.
47485 if (Cond.getValueType() != MVT::i1)
47486 return SDValue();
47487
47488 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47489 // 3, 5, or 9 with i32/i64, so those get transformed too.
47490 // TODO: For constants that overflow or do not differ by power-of-2 or small
47491 // multiplier, convert to 'and' + 'add'.
47492 const APInt &TrueVal = TrueC->getAPIntValue();
47493 const APInt &FalseVal = FalseC->getAPIntValue();
47494
47495 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47496 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47497 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47498 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47499 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47500 return SDValue();
47501 }
47502
47503 bool OV;
47504 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47505 if (OV)
47506 return SDValue();
47507
47508 APInt AbsDiff = Diff.abs();
47509 if (AbsDiff.isPowerOf2() ||
47510 ((VT == MVT::i32 || VT == MVT::i64) &&
47511 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47512
47513 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47514 // of the condition can usually be folded into a compare predicate, but even
47515 // without that, the sequence should be cheaper than a CMOV alternative.
47516 if (TrueVal.slt(FalseVal)) {
47517 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47518 std::swap(TrueC, FalseC);
47519 }
47520
47521 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47522 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47523
47524 // Multiply condition by the difference if non-one.
47525 if (!AbsDiff.isOne())
47526 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47527
47528 // Add the base if non-zero.
47529 if (!FalseC->isZero())
47530 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47531
47532 return R;
47533 }
47534
47535 return SDValue();
47536}
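// Illustrative note (annotation, not part of the lowering): e.g.
// (select Cond, 7, 2) has a difference of 5, so it becomes
// (zext(Cond) * 5) + 2, which fits a single LEA; for (select Cond, 2, 7) the
// condition is inverted first so the multiplier stays positive.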
47537
47538/// If this is a *dynamic* select (non-constant condition) and we can match
47539/// this node with one of the variable blend instructions, restructure the
47540/// condition so that blends can use the high (sign) bit of each element.
47541/// This function will also call SimplifyDemandedBits on already created
47542/// BLENDV to perform additional simplifications.
47543 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47544 const SDLoc &DL,
47545 TargetLowering::DAGCombinerInfo &DCI,
47546 const X86Subtarget &Subtarget) {
47547 SDValue Cond = N->getOperand(0);
47548 if ((N->getOpcode() != ISD::VSELECT &&
47549 N->getOpcode() != X86ISD::BLENDV) ||
47550 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47551 return SDValue();
47552
47553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47554 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47555 EVT VT = N->getValueType(0);
47556
47557 // We can only handle the cases where VSELECT is directly legal on the
47558 // subtarget. We custom lower VSELECT nodes with constant conditions and
47559 // this makes it hard to see whether a dynamic VSELECT will correctly
47560 // lower, so we both check the operation's status and explicitly handle the
47561 // cases where a *dynamic* blend will fail even though a constant-condition
47562 // blend could be custom lowered.
47563 // FIXME: We should find a better way to handle this class of problems.
47564 // Potentially, we should combine constant-condition vselect nodes
47565 // pre-legalization into shuffles and not mark as many types as custom
47566 // lowered.
47568 return SDValue();
47569 // FIXME: We don't support i16-element blends currently. We could and
47570 // should support them by making *all* the bits in the condition be set
47571 // rather than just the high bit and using an i8-element blend.
47572 if (VT.getVectorElementType() == MVT::i16)
47573 return SDValue();
47574 // Dynamic blending was only available from SSE4.1 onward.
47575 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47576 return SDValue();
47577 // Byte blends are only available in AVX2
47578 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47579 return SDValue();
47580 // There are no 512-bit blend instructions that use sign bits.
47581 if (VT.is512BitVector())
47582 return SDValue();
47583
47584 // Don't optimize before the condition has been transformed to a legal type
47585 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47586 if (BitWidth < 8 || BitWidth > 64)
47587 return SDValue();
47588
47589 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47590 for (SDUse &Use : Cond->uses())
47591 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47592 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47593 Use.getOperandNo() != 0)
47594 return false;
47595
47596 return true;
47597 };
47598
47599 APInt DemandedBits(APInt::getSignMask(BitWidth));
47600
47601 if (OnlyUsedAsSelectCond(Cond)) {
47602 KnownBits Known;
47603 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47604 !DCI.isBeforeLegalizeOps());
47605 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47606 return SDValue();
47607
47608 // If we changed the computation somewhere in the DAG, this change will
47609 // affect all users of Cond. Update all the nodes so that we do not use
47610 // the generic VSELECT anymore. Otherwise, we may perform wrong
47611 // optimizations as we messed with the actual expectation for the vector
47612 // boolean values.
47613 for (SDNode *U : Cond->users()) {
47614 if (U->getOpcode() == X86ISD::BLENDV)
47615 continue;
47616
47617 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47618 Cond, U->getOperand(1), U->getOperand(2));
47619 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47620 DCI.AddToWorklist(U);
47621 }
47622 DCI.CommitTargetLoweringOpt(TLO);
47623 return SDValue(N, 0);
47624 }
47625
47626 // Otherwise we can still at least try to simplify multiple use bits.
47627 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47628 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47629 N->getOperand(1), N->getOperand(2));
47630
47631 return SDValue();
47632}
47633
47634// Try to match:
47635// (or (and (M, (sub 0, X)), (pandn M, X)))
47636// which is a special case of:
47637// (select M, (sub 0, X), X)
47638// Per:
47639// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47640// We know that, if fNegate is 0 or 1:
47641// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47642//
47643// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47644// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47645// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47646// This lets us transform our vselect to:
47647// (add (xor X, M), (and M, 1))
47648// And further to:
47649// (sub (xor X, M), M)
47650 static SDValue combineLogicBlendIntoConditionalNegate(
47651 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47652 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47653 using namespace SDPatternMatch;
47654 EVT MaskVT = Mask.getValueType();
47655 assert(MaskVT.isInteger() &&
47656 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47657 "Mask must be zero/all-bits");
47658
47659 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47661 return SDValue();
47662
47663 SDValue V;
47664 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47665 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47666 return SDValue();
47667
47668 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47669 SDValue SubOp2 = Mask;
47670
47671 // If the negate was on the false side of the select, then
47672 // the operands of the SUB need to be swapped. PR 27251.
47673 // This is because the pattern being matched above is
47674 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47675 // but if the pattern matched was
47676 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47677 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47678 // pattern also needs to be a negation of the replacement pattern above.
47679 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47680 // sub accomplishes the negation of the replacement pattern.
47681 if (V == Y)
47682 std::swap(SubOp1, SubOp2);
47683
47684 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47685 return DAG.getBitcast(VT, Res);
47686}
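// Illustrative note (annotation, not part of the lowering): with an all-ones
// or all-zeros mask M, (X ^ M) - M computes the conditional negate directly:
// for X = 5 and M = -1 it is (~5) - (-1) = -5, and for M = 0 it is 5 - 0 = 5.
// When the negated value sits on the false side of the select, swapping the
// SUB operands (M - (X ^ M)) yields the negation of that result instead.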
47687
47688 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47689 const X86Subtarget &Subtarget) {
47690 using namespace SDPatternMatch;
47691 if (!Subtarget.hasAVX512())
47692 return SDValue();
47693
47694 ISD::CondCode CC;
47695 SDValue Cond, X, Y, LHS, RHS;
47698 m_CondCode(CC)))),
47699 m_Value(LHS), m_Value(RHS))))
47700 return SDValue();
47701
47702 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47703 !canCombineAsMaskOperation(RHS, Subtarget))
47704 return SDValue();
47705
47706 // Commute LHS and RHS to create opportunity to select mask instruction.
47707 // (vselect M, L, R) -> (vselect ~M, R, L)
47708 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47709 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47710 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47711}
47712
47713/// Do target-specific dag combines on SELECT and VSELECT nodes.
47714 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47715 TargetLowering::DAGCombinerInfo &DCI,
47716 const X86Subtarget &Subtarget) {
47717 SDLoc DL(N);
47718 SDValue Cond = N->getOperand(0);
47719 SDValue LHS = N->getOperand(1);
47720 SDValue RHS = N->getOperand(2);
47721
47722 // Try simplification again because we use this function to optimize
47723 // BLENDV nodes that are not handled by the generic combiner.
47724 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47725 return V;
47726
47727 // When avx512 is available, the lhs operand of a select instruction can be
47728 // folded with a mask instruction, while the rhs operand cannot. Commute the
47729 // lhs and rhs of the select instruction to create the opportunity for
47730 // folding.
47731 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47732 return V;
47733
47734 EVT VT = LHS.getValueType();
47735 EVT CondVT = Cond.getValueType();
47736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47737 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47738
47739 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47740 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47741 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47742 if (CondVT.isVector() && CondVT.isInteger() &&
47743 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47744 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47745 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47746 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47747 DL, DAG, Subtarget))
47748 return V;
47749
47750 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47751 SmallVector<int, 64> CondMask;
47752 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47753 N->getOpcode() == X86ISD::BLENDV)) {
47754 // Convert vselects with constant condition into shuffles.
47755 if (DCI.isBeforeLegalizeOps())
47756 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47757
47758 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47759 // by forcing the unselected elements to zero.
47760 // TODO: Can we handle more shuffles with this?
47761 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47762 SmallVector<SDValue, 1> LHSOps, RHSOps;
47763 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47764 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47765 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47766 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47767 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47768 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47769 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47770 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47771 assert(ByteMask.size() == LHSMask.size() &&
47772 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47773 for (auto [I, M] : enumerate(ByteMask)) {
47774 // getConstVector sets negative shuffle mask values as undef, so
47775 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47776 if (M < (int)ByteMask.size()) {
47777 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47778 RHSMask[I] = 0x80;
47779 } else {
47780 LHSMask[I] = 0x80;
47781 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47782 }
47783 }
47784 MVT ByteVT = LHSShuf.getSimpleValueType();
47785 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47786 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47787 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47788 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47789 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47790 }
47791 }
47792
47793 // Attempt to combine as shuffle.
47794 SDValue Op(N, 0);
47795 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47796 return Res;
47797 }
47798 }
47799
47800 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47801 // instructions match the semantics of the common C idiom x<y?x:y but not
47802 // x<=y?x:y, because of how they handle negative zero (which can be
47803 // ignored in unsafe-math mode).
47804 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
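// For reference, a sketch of the hardware semantics being matched: MINPS/MINSS
// compute "a < b ? a : b" per lane, so
//   min(-0.0, +0.0) --> +0.0  (the second operand; -0.0 < +0.0 is false)
//   min( NaN,    x) --> x     (any comparison with NaN is false)
// which is why only the exact "x < y ? x : y" shape (or shapes that become it
// after swapping operands) can be mapped directly onto FMIN/FMAX.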
47805 if ((Cond.getOpcode() == ISD::SETCC ||
47806 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47807 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47808 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47809 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47810 (Subtarget.hasSSE2() ||
47811 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47812 bool IsStrict = Cond->isStrictFPOpcode();
47813 ISD::CondCode CC =
47814 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47815 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47816 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47817
47818 unsigned Opcode = 0;
47819 // Check for x CC y ? x : y.
47820 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47821 switch (CC) {
47822 default: break;
47823 case ISD::SETULT:
47824 // Converting this to a min would handle NaNs incorrectly, and swapping
47825 // the operands would cause it to handle comparisons between positive
47826 // and negative zero incorrectly.
47827 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47828 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47829 !(DAG.isKnownNeverZeroFloat(LHS) ||
47830 DAG.isKnownNeverZeroFloat(RHS)))
47831 break;
47832 std::swap(LHS, RHS);
47833 }
47834 Opcode = X86ISD::FMIN;
47835 break;
47836 case ISD::SETOLE:
47837 // Converting this to a min would handle comparisons between positive
47838 // and negative zero incorrectly.
47839 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47840 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47841 break;
47842 Opcode = X86ISD::FMIN;
47843 break;
47844 case ISD::SETULE:
47845 // Converting this to a min would handle both negative zeros and NaNs
47846 // incorrectly, but we can swap the operands to fix both.
47847 std::swap(LHS, RHS);
47848 [[fallthrough]];
47849 case ISD::SETOLT:
47850 case ISD::SETLT:
47851 case ISD::SETLE:
47852 Opcode = X86ISD::FMIN;
47853 break;
47854
47855 case ISD::SETOGE:
47856 // Converting this to a max would handle comparisons between positive
47857 // and negative zero incorrectly.
47858 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47859 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47860 break;
47861 Opcode = X86ISD::FMAX;
47862 break;
47863 case ISD::SETUGT:
47864 // Converting this to a max would handle NaNs incorrectly, and swapping
47865 // the operands would cause it to handle comparisons between positive
47866 // and negative zero incorrectly.
47867 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47868 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47869 !(DAG.isKnownNeverZeroFloat(LHS) ||
47870 DAG.isKnownNeverZeroFloat(RHS)))
47871 break;
47872 std::swap(LHS, RHS);
47873 }
47874 Opcode = X86ISD::FMAX;
47875 break;
47876 case ISD::SETUGE:
47877 // Converting this to a max would handle both negative zeros and NaNs
47878 // incorrectly, but we can swap the operands to fix both.
47879 std::swap(LHS, RHS);
47880 [[fallthrough]];
47881 case ISD::SETOGT:
47882 case ISD::SETGT:
47883 case ISD::SETGE:
47884 Opcode = X86ISD::FMAX;
47885 break;
47886 }
47887 // Check for x CC y ? y : x -- a min/max with reversed arms.
47888 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47889 switch (CC) {
47890 default: break;
47891 case ISD::SETOGE:
47892 // Converting this to a min would handle comparisons between positive
47893 // and negative zero incorrectly, and swapping the operands would
47894 // cause it to handle NaNs incorrectly.
47895 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47896 !(DAG.isKnownNeverZeroFloat(LHS) ||
47897 DAG.isKnownNeverZeroFloat(RHS))) {
47898 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47899 break;
47900 std::swap(LHS, RHS);
47901 }
47902 Opcode = X86ISD::FMIN;
47903 break;
47904 case ISD::SETUGT:
47905 // Converting this to a min would handle NaNs incorrectly.
47906 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47907 break;
47908 Opcode = X86ISD::FMIN;
47909 break;
47910 case ISD::SETUGE:
47911 // Converting this to a min would handle both negative zeros and NaNs
47912 // incorrectly, but we can swap the operands to fix both.
47913 std::swap(LHS, RHS);
47914 [[fallthrough]];
47915 case ISD::SETOGT:
47916 case ISD::SETGT:
47917 case ISD::SETGE:
47918 Opcode = X86ISD::FMIN;
47919 break;
47920
47921 case ISD::SETULT:
47922 // Converting this to a max would handle NaNs incorrectly.
47923 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47924 break;
47925 Opcode = X86ISD::FMAX;
47926 break;
47927 case ISD::SETOLE:
47928 // Converting this to a max would handle comparisons between positive
47929 // and negative zero incorrectly, and swapping the operands would
47930 // cause it to handle NaNs incorrectly.
47931 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47932 !DAG.isKnownNeverZeroFloat(LHS) &&
47933 !DAG.isKnownNeverZeroFloat(RHS)) {
47934 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47935 break;
47936 std::swap(LHS, RHS);
47937 }
47938 Opcode = X86ISD::FMAX;
47939 break;
47940 case ISD::SETULE:
47941 // Converting this to a max would handle both negative zeros and NaNs
47942 // incorrectly, but we can swap the operands to fix both.
47943 std::swap(LHS, RHS);
47944 [[fallthrough]];
47945 case ISD::SETOLT:
47946 case ISD::SETLT:
47947 case ISD::SETLE:
47948 Opcode = X86ISD::FMAX;
47949 break;
47950 }
47951 }
47952
47953 if (Opcode) {
47954 if (IsStrict) {
47955 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47956 : X86ISD::STRICT_FMAX,
47957 DL, {N->getValueType(0), MVT::Other},
47958 {Cond.getOperand(0), LHS, RHS});
47959 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47960 return Ret;
47961 }
47962 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47963 }
47964 }
47965
47966 // Some mask scalar intrinsics rely on checking if only one bit is set
47967 // and implement it in C code like this:
47968 // A[0] = (U & 1) ? A[0] : W[0];
47969 // This creates some redundant instructions that break pattern matching.
47970 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47971 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47972 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47973 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47974 SDValue AndNode = Cond.getOperand(0);
47975 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47976 isNullConstant(Cond.getOperand(1)) &&
47977 isOneConstant(AndNode.getOperand(1))) {
47978 // LHS and RHS swapped due to
47979 // setcc outputting 1 when AND resulted in 0 and vice versa.
47980 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47981 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47982 }
47983 }
47984
47985 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47986 // lowering on KNL. In this case we convert it to
47987 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
47988 // The same situation applies to all vectors of i8 and i16 elements without BWI.
47989 // Make sure we extend these even before type legalization gets a chance to
47990 // split wide vectors.
47991 // Since SKX these selects have a proper lowering.
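// For example (a rough sketch): on an AVX512F-only target,
//   v16i8 (vselect v16i1 %m, v16i8 %a, v16i8 %b)
// has no byte-granular mask lowering, so %m is sign-extended to an
// all-ones/all-zeros v16i8 mask and the select is lowered with a
// VPBLENDVB-style vector blend instead.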
47992 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47993 CondVT.getVectorElementType() == MVT::i1 &&
47994 (VT.getVectorElementType() == MVT::i8 ||
47995 VT.getVectorElementType() == MVT::i16)) {
47996 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47997 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47998 }
47999
48000 // AVX512 - Extend select to merge with target shuffle.
48001 // select(mask, extract_subvector(shuffle(x)), y) -->
48002 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
48003 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
48004 if (Subtarget.hasAVX512() && CondVT.isVector() &&
48005 CondVT.getVectorElementType() == MVT::i1) {
48006 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48007 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48008 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48009 isNullConstant(Op.getOperand(1)) &&
48010 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48011 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48012 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48013 ISD::isBuildVectorAllZeros(Alt.getNode()));
48014 };
48015
48016 bool SelectableLHS = SelectableOp(LHS, RHS);
48017 bool SelectableRHS = SelectableOp(RHS, LHS);
48018 if (SelectableLHS || SelectableRHS) {
48019 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48020 : RHS.getOperand(0).getValueType();
48021 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48022 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48023 VT.getSizeInBits());
48024 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48025 VT.getSizeInBits());
48026 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48027 DAG.getUNDEF(SrcCondVT), Cond,
48028 DAG.getVectorIdxConstant(0, DL));
48029 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48030 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48031 }
48032 }
48033
48034 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48035 return V;
48036
48037 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48038 Cond.hasOneUse()) {
48039 EVT CondVT = Cond.getValueType();
48040 SDValue Cond0 = Cond.getOperand(0);
48041 SDValue Cond1 = Cond.getOperand(1);
48042 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48043
48044 // Canonicalize min/max:
48045 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48046 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48047 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48048 // the need for an extra compare against zero. e.g.
48049 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48050 // subl %esi, %edi
48051 // testl %edi, %edi
48052 // movl $0, %eax
48053 // cmovgl %edi, %eax
48054 // =>
48055 // xorl %eax, %eax
48056 // subl %esi, %edi
48057 // cmovsl %eax, %edi
48058 //
48059 // We can also canonicalize
48060 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48061 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48062 // This allows the use of a test instruction for the compare.
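// A sketch of the unsigned case above: for unsigned x,
// (x > 1 ? x : 1) == (x != 0 ? x : 1), so roughly
//   cmpl   $1, %edi                testl   %edi, %edi
//   cmoval %edi, %eax     =>       cmovnel %edi, %eax
// and the immediate compare becomes a plain register test.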
48063 if (LHS == Cond0 && RHS == Cond1) {
48064 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48065 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48066 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48067 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48068 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48069 }
48070 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48071 ISD::CondCode NewCC = ISD::SETUGE;
48072 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48073 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48074 }
48075 }
48076
48077 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48078 // fold eq + gt/lt nested selects into ge/le selects
48079 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48080 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48081 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48082 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48083 // .. etc ..
48084 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48085 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48086 SDValue InnerSetCC = RHS.getOperand(0);
48087 ISD::CondCode InnerCC =
48088 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48089 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48090 Cond0 == InnerSetCC.getOperand(0) &&
48091 Cond1 == InnerSetCC.getOperand(1)) {
48092 ISD::CondCode NewCC;
48093 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48094 // clang-format off
48095 case ISD::SETGT: NewCC = ISD::SETGE; break;
48096 case ISD::SETLT: NewCC = ISD::SETLE; break;
48097 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48098 case ISD::SETULT: NewCC = ISD::SETULE; break;
48099 default: NewCC = ISD::SETCC_INVALID; break;
48100 // clang-format on
48101 }
48102 if (NewCC != ISD::SETCC_INVALID) {
48103 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48104 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48105 }
48106 }
48107 }
48108 }
48109
48110 // Check if the first operand is all zeros and Cond type is vXi1.
48111 // If this an avx512 target we can improve the use of zero masking by
48112 // swapping the operands and inverting the condition.
48113 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48114 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48115 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48116 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48117 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48118 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48119 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48120 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48121 }
48122
48123 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48124 // get split by legalization.
48125 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48126 CondVT.getVectorElementType() == MVT::i1 &&
48127 TLI.isTypeLegal(VT.getScalarType())) {
48128 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48129 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48130 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48131 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48132 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48133 }
48134 }
48135
48136 // Exploit AVX2's VSHLV/VSRLV instructions for efficient unsigned vector
48137 // shifts whose shift amounts may be out of bounds.
48138
48139 // Unlike the general shift nodes (SHL/SRL), AVX2's VSHLV/VSRLV are well
48140 // defined when a lane's shift amount is greater than or equal to the element
48141 // bitwidth: that lane is simply written with zero, for both the left (VSHLV)
48142 // and right (VSRLV) logical forms, which matches the zero operand of the
48143 // select patterns handled below.
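// For example, with i32 lanes (a sketch of the per-lane semantics relied on):
//   VPSLLVD: lane amt >= 32  ==>  lane result 0, i.e. select(amt u< 32, x << amt, 0)
//   VPSRLVD: lane amt >= 32  ==>  lane result 0, i.e. select(amt u< 32, x >> amt, 0)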
48144 if (N->getOpcode() == ISD::VSELECT) {
48145 using namespace llvm::SDPatternMatch;
48146 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48147 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48148 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48149 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48150 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48151 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48152 m_SpecificInt(VT.getScalarSizeInBits()),
48153 m_SpecificCondCode(ISD::SETULT)))) {
48154 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48155 : X86ISD::VSHLV,
48156 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48157 }
48158 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48159 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48160 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48161 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48162 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
48163 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48164 m_SpecificInt(VT.getScalarSizeInBits()),
48165 m_SpecificCondCode(ISD::SETUGE)))) {
48166 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48167 : X86ISD::VSHLV,
48168 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48169 }
48170 }
48171
48172 // Early exit check
48173 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48174 return SDValue();
48175
48176 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48177 return V;
48178
48179 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48180 return V;
48181
48182 // select(~Cond, X, Y) -> select(Cond, Y, X)
48183 if (CondVT.getScalarType() != MVT::i1) {
48184 if (SDValue CondNot = IsNOT(Cond, DAG))
48185 return DAG.getNode(N->getOpcode(), DL, VT,
48186 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48187
48188 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48189 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48190 Cond.getOperand(0).getOpcode() == ISD::AND &&
48191 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48192 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48193 Cond.getScalarValueSizeInBits(),
48194 /*AllowUndefs=*/true) &&
48195 Cond.hasOneUse()) {
48196 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48197 Cond.getOperand(0).getOperand(1));
48198 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48199 }
48200
48201 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48202 // signbit.
48203 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48204 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48205 Cond.hasOneUse()) {
48206 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48207 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48208 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48209 }
48210 }
48211
48212 // Try to optimize vXi1 selects if both operands are either all constants or
48213 // bitcasts from scalar integer type. In that case we can convert the operands
48214 // to integer and use an integer select which will be converted to a CMOV.
48215 // We need to take a little bit of care to avoid creating an i64 type after
48216 // type legalization.
48217 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48218 VT.getVectorElementType() == MVT::i1 &&
48219 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48220 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48221 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48222 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48223 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48224
48225 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48226 LHS.getOperand(0).getValueType() == IntVT)) &&
48227 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48228 RHS.getOperand(0).getValueType() == IntVT))) {
48229 if (LHSIsConst)
48230 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48231 else
48232 LHS = LHS.getOperand(0);
48233
48234 if (RHSIsConst)
48235 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48236 else
48237 RHS = RHS.getOperand(0);
48238
48239 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48240 return DAG.getBitcast(VT, Select);
48241 }
48242 }
48243 }
48244
48245 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48246 // single bits, then invert the predicate and swap the select operands.
48247 // This can lower using a vector shift bit-hack rather than mask and compare.
48248 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48249 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48250 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48251 Cond.getOperand(0).getOpcode() == ISD::AND &&
48252 isNullOrNullSplat(Cond.getOperand(1)) &&
48253 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48254 Cond.getOperand(0).getValueType() == VT) {
48255 // The 'and' mask must be composed of power-of-2 constants.
48256 SDValue And = Cond.getOperand(0);
48257 auto *C = isConstOrConstSplat(And.getOperand(1));
48258 if (C && C->getAPIntValue().isPowerOf2()) {
48259 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48260 SDValue NotCond =
48261 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48262 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48263 }
48264
48265 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48266 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48267 // 16-bit lacks a proper blendv.
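// A sketch of the non-splat case: with VT = v4i32 and mask constants {1,2,4,8},
// the shift amounts become {31,30,29,28}, moving each tested bit into its
// lane's sign bit, so
//   vselect ((X & {1,2,4,8}) == 0), LHS, RHS
// becomes
//   vselect ((X << {31,30,29,28}) < 0), RHS, LHS
// and the sign bit can drive a blend (e.g. BLENDVPS) directly.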
48268 unsigned EltBitWidth = VT.getScalarSizeInBits();
48269 bool CanShiftBlend =
48270 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48271 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48272 (Subtarget.hasXOP()));
48273 if (CanShiftBlend &&
48274 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48275 return C->getAPIntValue().isPowerOf2();
48276 })) {
48277 // Create a left-shift constant to get the mask bits over to the sign-bit.
48278 SDValue Mask = And.getOperand(1);
48279 SmallVector<int, 32> ShlVals;
48280 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48281 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48282 ShlVals.push_back(EltBitWidth - 1 -
48283 MaskVal->getAPIntValue().exactLogBase2());
48284 }
48285 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48286 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48287 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48288 SDValue NewCond =
48289 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48290 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48291 }
48292 }
48293
48294 return SDValue();
48295}
48296
48297/// Combine:
48298/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48299/// to:
48300/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48301/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48302/// Note that this is only legal for some op/cc combinations.
48303 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48304 SelectionDAG &DAG,
48305 const X86Subtarget &Subtarget) {
48306 // This combine only operates on CMP-like nodes.
48307 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48308 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48309 return SDValue();
48310
48311 // Can't replace the cmp if it has more uses than the one we're looking at.
48312 // FIXME: We would like to be able to handle this, but would need to make sure
48313 // all uses were updated.
48314 if (!Cmp.hasOneUse())
48315 return SDValue();
48316
48317 // This only applies to variations of the common case:
48318 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48319 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48320 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48321 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48322 // Using the proper condcodes (see below), overflow is checked for.
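// For example, a source-level sketch of the common case:
//   if (__atomic_fetch_add(&x, 1, __ATOMIC_SEQ_CST) < 0) { ... }
// tests the sign of the *old* value. Mathematically old < 0 iff old + 1 <= 0,
// and the signed flag condition COND_LE on the locked add's result evaluates
// that without being fooled by wraparound (it consults OF), so the branch can
// reuse the EFLAGS of "lock addl $1, (mem)" instead of reloading the value and
// comparing it against zero separately.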
48323
48324 // FIXME: We can generalize both constraints:
48325 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48326 // - LHS != 1
48327 // if the result is compared.
48328
48329 SDValue CmpLHS = Cmp.getOperand(0);
48330 SDValue CmpRHS = Cmp.getOperand(1);
48331 EVT CmpVT = CmpLHS.getValueType();
48332
48333 if (!CmpLHS.hasOneUse())
48334 return SDValue();
48335
48336 unsigned Opc = CmpLHS.getOpcode();
48337 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48338 return SDValue();
48339
48340 SDValue OpRHS = CmpLHS.getOperand(2);
48341 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48342 if (!OpRHSC)
48343 return SDValue();
48344
48345 APInt Addend = OpRHSC->getAPIntValue();
48346 if (Opc == ISD::ATOMIC_LOAD_SUB)
48347 Addend = -Addend;
48348
48349 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48350 if (!CmpRHSC)
48351 return SDValue();
48352
48353 APInt Comparison = CmpRHSC->getAPIntValue();
48354 APInt NegAddend = -Addend;
48355
48356 // See if we can adjust the CC to make the comparison match the negated
48357 // addend.
48358 if (Comparison != NegAddend) {
48359 APInt IncComparison = Comparison + 1;
48360 if (IncComparison == NegAddend) {
48361 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48362 Comparison = IncComparison;
48363 CC = X86::COND_AE;
48364 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48365 Comparison = IncComparison;
48366 CC = X86::COND_L;
48367 }
48368 }
48369 APInt DecComparison = Comparison - 1;
48370 if (DecComparison == NegAddend) {
48371 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48372 Comparison = DecComparison;
48373 CC = X86::COND_A;
48374 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48375 Comparison = DecComparison;
48376 CC = X86::COND_LE;
48377 }
48378 }
48379 }
48380
48381 // If the addend is the negation of the comparison value, then we can do
48382 // a full comparison by emitting the atomic arithmetic as a locked sub.
48383 if (Comparison == NegAddend) {
48384 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48385 // atomic sub.
48386 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48387 auto AtomicSub = DAG.getAtomic(
48388 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48389 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48390 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48391 AN->getMemOperand());
48392 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48393 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48394 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48395 return LockOp;
48396 }
48397
48398 // We can handle comparisons with zero in a number of cases by manipulating
48399 // the CC used.
48400 if (!Comparison.isZero())
48401 return SDValue();
48402
48403 if (CC == X86::COND_S && Addend == 1)
48404 CC = X86::COND_LE;
48405 else if (CC == X86::COND_NS && Addend == 1)
48406 CC = X86::COND_G;
48407 else if (CC == X86::COND_G && Addend == -1)
48408 CC = X86::COND_GE;
48409 else if (CC == X86::COND_LE && Addend == -1)
48410 CC = X86::COND_L;
48411 else
48412 return SDValue();
48413
48414 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48415 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48416 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48417 return LockOp;
48418}
48419
48420// Check whether we're just testing the signbit, and whether we can simplify
48421// this by tracking where the signbit came from.
48422 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48423 SelectionDAG &DAG) {
48424 if (CC != X86::COND_S && CC != X86::COND_NS)
48425 return SDValue();
48426
48427 if (!Cmp.hasOneUse())
48428 return SDValue();
48429
48430 SDValue Src;
48431 if (Cmp.getOpcode() == X86ISD::CMP) {
48432 // CMP(X,0) -> signbit test
48433 if (!isNullConstant(Cmp.getOperand(1)))
48434 return SDValue();
48435 Src = Cmp.getOperand(0);
48436 // Peek through a SRA node as we just need the signbit.
48437 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48438 // TODO: Use SimplifyDemandedBits instead of just SRA?
48439 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48440 return SDValue();
48441 Src = Src.getOperand(0);
48442 } else if (Cmp.getOpcode() == X86ISD::OR) {
48443 // OR(X,Y) -> see if only one operand contributes to the signbit.
48444 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48445 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48446 Src = Cmp.getOperand(1);
48447 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48448 Src = Cmp.getOperand(0);
48449 else
48450 return SDValue();
48451 } else {
48452 return SDValue();
48453 }
48454
48455 // Replace with a TEST on the MSB.
48456 SDLoc DL(Cmp);
48457 MVT SrcVT = Src.getSimpleValueType();
48458 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48459
48460 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48461 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48462 if (Src.getOpcode() == ISD::SHL) {
48463 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48464 Src = Src.getOperand(0);
48465 BitMask.lshrInPlace(*ShiftAmt);
48466 }
48467 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48468 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48469 Src = Src.getOperand(0);
48470 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48471 }
48472
48473 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48474 DAG.getConstant(BitMask, DL, SrcVT));
48475 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48476 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48477 DAG.getConstant(0, DL, SrcVT));
48478}
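// For example (a sketch of the fold above): the signbit test
//   CMP (sra X, 31), 0   with COND_S
// depends only on the sign bit of X, so it becomes
//   CMP (and X, 0x80000000), 0   with COND_NE
// and COND_NS is likewise rewritten to COND_E.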
48479
48480// Check whether a boolean test is testing a boolean value generated by
48481// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48482// code.
48483//
48484// Simplify the following patterns:
48485// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48486// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48487// to (Op EFLAGS Cond)
48488//
48489// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48490// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48491// to (Op EFLAGS !Cond)
48492//
48493// where Op could be BRCOND or CMOV.
48494//
48495 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48496 // This combine only operates on CMP-like nodes.
48497 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48498 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48499 return SDValue();
48500
48501 // Quit if not used as a boolean value.
48502 if (CC != X86::COND_E && CC != X86::COND_NE)
48503 return SDValue();
48504
48505 // Check CMP operands. One of them should be 0 or 1 and the other should be
48506 // an SetCC or extended from it.
48507 SDValue Op1 = Cmp.getOperand(0);
48508 SDValue Op2 = Cmp.getOperand(1);
48509
48510 SDValue SetCC;
48511 const ConstantSDNode* C = nullptr;
48512 bool needOppositeCond = (CC == X86::COND_E);
48513 bool checkAgainstTrue = false; // Is it a comparison against 1?
48514
48515 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48516 SetCC = Op2;
48517 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48518 SetCC = Op1;
48519 else // Quit if neither operand is a constant.
48520 return SDValue();
48521
48522 if (C->getZExtValue() == 1) {
48523 needOppositeCond = !needOppositeCond;
48524 checkAgainstTrue = true;
48525 } else if (C->getZExtValue() != 0)
48526 // Quit if the constant is neither 0 nor 1.
48527 return SDValue();
48528
48529 bool truncatedToBoolWithAnd = false;
48530 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48531 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48532 SetCC.getOpcode() == ISD::TRUNCATE ||
48533 SetCC.getOpcode() == ISD::AND) {
48534 if (SetCC.getOpcode() == ISD::AND) {
48535 int OpIdx = -1;
48536 if (isOneConstant(SetCC.getOperand(0)))
48537 OpIdx = 1;
48538 if (isOneConstant(SetCC.getOperand(1)))
48539 OpIdx = 0;
48540 if (OpIdx < 0)
48541 break;
48542 SetCC = SetCC.getOperand(OpIdx);
48543 truncatedToBoolWithAnd = true;
48544 } else
48545 SetCC = SetCC.getOperand(0);
48546 }
48547
48548 switch (SetCC.getOpcode()) {
48549 case X86ISD::SETCC_CARRY:
48550 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48551 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48552 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48553 // truncated to i1 using 'and'.
48554 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48555 break;
48557 "Invalid use of SETCC_CARRY!");
48558 [[fallthrough]];
48559 case X86ISD::SETCC:
48560 // Set the condition code or opposite one if necessary.
48561 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48562 if (needOppositeCond)
48563 CC = X86::GetOppositeBranchCondition(CC);
48564 return SetCC.getOperand(1);
48565 case X86ISD::CMOV: {
48566 // Check whether false/true value has canonical one, i.e. 0 or 1.
48567 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48568 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48569 // Quit if true value is not a constant.
48570 if (!TVal)
48571 return SDValue();
48572 // Quit if false value is not a constant.
48573 if (!FVal) {
48574 SDValue Op = SetCC.getOperand(0);
48575 // Skip 'zext' or 'trunc' node.
48576 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48577 Op.getOpcode() == ISD::TRUNCATE)
48578 Op = Op.getOperand(0);
48579 // A special case for rdrand/rdseed, where 0 is set if false cond is
48580 // found.
48581 if ((Op.getOpcode() != X86ISD::RDRAND &&
48582 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48583 return SDValue();
48584 }
48585 // Quit if false value is not the constant 0 or 1.
48586 bool FValIsFalse = true;
48587 if (FVal && FVal->getZExtValue() != 0) {
48588 if (FVal->getZExtValue() != 1)
48589 return SDValue();
48590 // If FVal is 1, opposite cond is needed.
48591 needOppositeCond = !needOppositeCond;
48592 FValIsFalse = false;
48593 }
48594 // Quit if TVal is not the constant opposite of FVal.
48595 if (FValIsFalse && TVal->getZExtValue() != 1)
48596 return SDValue();
48597 if (!FValIsFalse && TVal->getZExtValue() != 0)
48598 return SDValue();
48599 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48600 if (needOppositeCond)
48601 CC = X86::GetOppositeBranchCondition(CC);
48602 return SetCC.getOperand(3);
48603 }
48604 }
48605
48606 return SDValue();
48607}
48608
48609/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48610/// Match:
48611/// (X86or (X86setcc) (X86setcc))
48612/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48613 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48614 X86::CondCode &CC1, SDValue &Flags,
48615 bool &isAnd) {
48616 if (Cond->getOpcode() == X86ISD::CMP) {
48617 if (!isNullConstant(Cond->getOperand(1)))
48618 return false;
48619
48620 Cond = Cond->getOperand(0);
48621 }
48622
48623 isAnd = false;
48624
48625 SDValue SetCC0, SetCC1;
48626 switch (Cond->getOpcode()) {
48627 default: return false;
48628 case ISD::AND:
48629 case X86ISD::AND:
48630 isAnd = true;
48631 [[fallthrough]];
48632 case ISD::OR:
48633 case X86ISD::OR:
48634 SetCC0 = Cond->getOperand(0);
48635 SetCC1 = Cond->getOperand(1);
48636 break;
48637 };
48638
48639 // Make sure we have SETCC nodes, using the same flags value.
48640 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48641 SetCC1.getOpcode() != X86ISD::SETCC ||
48642 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48643 return false;
48644
48645 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48646 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48647 Flags = SetCC0->getOperand(1);
48648 return true;
48649}
48650
48651// When legalizing carry, we create carries via add X, -1
48652// If that comes from an actual carry, via setcc, we use the
48653// carry directly.
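// For example (sketch): "add X, -1" sets CF exactly when X != 0, so if X is a
// 0/1 value produced by a SETB, the freshly produced CF equals the original
// carry and the flag producer found below can be used directly.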
48654 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48655 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48656 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48657 bool FoundAndLSB = false;
48658 SDValue Carry = EFLAGS.getOperand(0);
48659 while (Carry.getOpcode() == ISD::TRUNCATE ||
48660 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48661 (Carry.getOpcode() == ISD::AND &&
48662 isOneConstant(Carry.getOperand(1)))) {
48663 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48664 Carry = Carry.getOperand(0);
48665 }
48666 if (Carry.getOpcode() == X86ISD::SETCC ||
48667 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48668 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48669 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48670 SDValue CarryOp1 = Carry.getOperand(1);
48671 if (CarryCC == X86::COND_B)
48672 return CarryOp1;
48673 if (CarryCC == X86::COND_A) {
48674 // Try to convert COND_A into COND_B in an attempt to facilitate
48675 // materializing "setb reg".
48676 //
48677 // Do not flip "e > c", where "c" is a constant, because Cmp
48678 // instruction cannot take an immediate as its first operand.
48679 //
48680 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48681 CarryOp1.getNode()->hasOneUse() &&
48682 CarryOp1.getValueType().isInteger() &&
48683 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48684 SDValue SubCommute =
48685 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48686 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48687 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48688 }
48689 }
48690 // If this is a check of the z flag of an add with 1, switch to the
48691 // C flag.
48692 if (CarryCC == X86::COND_E &&
48693 CarryOp1.getOpcode() == X86ISD::ADD &&
48694 isOneConstant(CarryOp1.getOperand(1)))
48695 return CarryOp1;
48696 } else if (FoundAndLSB) {
48697 SDLoc DL(Carry);
48698 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48699 if (Carry.getOpcode() == ISD::SRL) {
48700 BitNo = Carry.getOperand(1);
48701 Carry = Carry.getOperand(0);
48702 }
48703 return getBT(Carry, BitNo, DL, DAG);
48704 }
48705 }
48706 }
48707
48708 return SDValue();
48709}
48710
48711/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48712/// to avoid the inversion.
48713 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48714 SelectionDAG &DAG,
48715 const X86Subtarget &Subtarget) {
48716 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48717 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48718 EFLAGS.getOpcode() != X86ISD::TESTP)
48719 return SDValue();
48720
48721 // PTEST/TESTP sets EFLAGS as:
48722 // TESTZ: ZF = (Op0 & Op1) == 0
48723 // TESTC: CF = (~Op0 & Op1) == 0
48724 // TESTNZC: ZF == 0 && CF == 0
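// For example (sketch): checking CF (COND_B) of PTEST(~X, Y) asks whether
// (~(~X) & Y) == (X & Y) is zero, which is exactly the ZF (COND_E) result of
// PTEST(X, Y) - hence the testc <-> testz condition swaps below when one
// operand is known to be a NOT.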
48725 MVT VT = EFLAGS.getSimpleValueType();
48726 SDValue Op0 = EFLAGS.getOperand(0);
48727 SDValue Op1 = EFLAGS.getOperand(1);
48728 MVT OpVT = Op0.getSimpleValueType();
48729 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48730
48731 // TEST*(~X,Y) == TEST*(X,Y)
48732 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48733 X86::CondCode InvCC;
48734 switch (CC) {
48735 case X86::COND_B:
48736 // testc -> testz.
48737 InvCC = X86::COND_E;
48738 break;
48739 case X86::COND_AE:
48740 // !testc -> !testz.
48741 InvCC = X86::COND_NE;
48742 break;
48743 case X86::COND_E:
48744 // testz -> testc.
48745 InvCC = X86::COND_B;
48746 break;
48747 case X86::COND_NE:
48748 // !testz -> !testc.
48749 InvCC = X86::COND_AE;
48750 break;
48751 case X86::COND_A:
48752 case X86::COND_BE:
48753 // testnzc -> testnzc (no change).
48754 InvCC = CC;
48755 break;
48756 default:
48757 InvCC = X86::COND_INVALID;
48758 break;
48759 }
48760
48761 if (InvCC != X86::COND_INVALID) {
48762 CC = InvCC;
48763 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48764 DAG.getBitcast(OpVT, NotOp0), Op1);
48765 }
48766 }
48767
48768 if (CC == X86::COND_B || CC == X86::COND_AE) {
48769 // TESTC(X,~X) == TESTC(X,-1)
48770 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48771 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48772 SDLoc DL(EFLAGS);
48773 return DAG.getNode(
48774 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48775 DAG.getBitcast(OpVT,
48776 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48777 }
48778 }
48779 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48780 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48781 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48782 SDValue BC0 = peekThroughBitcasts(Op0);
48783 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48784 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48785 SDLoc DL(EFLAGS);
48786 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48787 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48788 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48789 }
48790 }
48791 }
48792
48793 if (CC == X86::COND_E || CC == X86::COND_NE) {
48794 // TESTZ(X,~Y) == TESTC(Y,X)
48795 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48796 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48797 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48798 DAG.getBitcast(OpVT, NotOp1), Op0);
48799 }
48800
48801 if (Op0 == Op1) {
48802 SDValue BC = peekThroughBitcasts(Op0);
48803 EVT BCVT = BC.getValueType();
48804
48805 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48806 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48807 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48808 DAG.getBitcast(OpVT, BC.getOperand(0)),
48809 DAG.getBitcast(OpVT, BC.getOperand(1)));
48810 }
48811
48812 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48813 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48814 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48815 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48816 DAG.getBitcast(OpVT, BC.getOperand(0)),
48817 DAG.getBitcast(OpVT, BC.getOperand(1)));
48818 }
48819
48820 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48821 // to more efficiently extract the sign bits and compare that.
48822 // TODO: Handle TESTC with comparison inversion.
48823 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48824 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48825 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48826 unsigned EltBits = BCVT.getScalarSizeInBits();
48827 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48828 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48829 APInt SignMask = APInt::getSignMask(EltBits);
48830 if (SDValue Res =
48831 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48832 // For vXi16 cases we need to use pmovmksb and extract every other
48833 // sign bit.
48834 SDLoc DL(EFLAGS);
48835 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48836 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48837 MVT FloatVT =
48838 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48839 Res = DAG.getBitcast(FloatVT, Res);
48840 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48841 } else if (EltBits == 16) {
48842 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48843 Res = DAG.getBitcast(MovmskVT, Res);
48844 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48845 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48846 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48847 } else {
48848 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48849 }
48850 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48851 DAG.getConstant(0, DL, MVT::i32));
48852 }
48853 }
48854 }
48855 }
48856
48857 // TESTZ(-1,X) == TESTZ(X,X)
48858 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48859 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48860
48861 // TESTZ(X,-1) == TESTZ(X,X)
48862 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48863 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48864
48865 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48866 // TODO: Add COND_NE handling?
48867 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48868 SDValue Src0 = peekThroughBitcasts(Op0);
48869 SDValue Src1 = peekThroughBitcasts(Op1);
48870 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48871 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48872 peekThroughBitcasts(Src0.getOperand(1)), true);
48873 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48874 peekThroughBitcasts(Src1.getOperand(1)), true);
48875 if (Src0 && Src1) {
48876 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48877 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48878 DAG.getBitcast(OpVT2, Src0),
48879 DAG.getBitcast(OpVT2, Src1));
48880 }
48881 }
48882 }
48883 }
48884
48885 return SDValue();
48886}
48887
48888// Attempt to simplify the MOVMSK input based on the comparison type.
48889 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48890 SelectionDAG &DAG,
48891 const X86Subtarget &Subtarget) {
48892 // Handle eq/ne against zero (any_of).
48893 // Handle eq/ne against -1 (all_of).
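// For example, at the intrinsics level (a sketch, for a v16i8 compare mask M):
//   any_of:  _mm_movemask_epi8(M) != 0
//   all_of:  _mm_movemask_epi8(M) == 0xFFFF
// Both arrive here as a CMP/SUB of a MOVMSK against 0 or an all-ones mask,
// which the rewrites below try to turn into cheaper PTEST/TESTP style checks.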
48894 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48895 return SDValue();
48896 if (EFLAGS.getValueType() != MVT::i32)
48897 return SDValue();
48898 unsigned CmpOpcode = EFLAGS.getOpcode();
48899 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48900 return SDValue();
48901 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48902 if (!CmpConstant)
48903 return SDValue();
48904 const APInt &CmpVal = CmpConstant->getAPIntValue();
48905
48906 SDValue CmpOp = EFLAGS.getOperand(0);
48907 unsigned CmpBits = CmpOp.getValueSizeInBits();
48908 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48909
48910 // Peek through any truncate.
48911 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48912 CmpOp = CmpOp.getOperand(0);
48913
48914 // Bail if we don't find a MOVMSK.
48915 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48916 return SDValue();
48917
48918 SDValue Vec = CmpOp.getOperand(0);
48919 MVT VecVT = Vec.getSimpleValueType();
48920 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48921 "Unexpected MOVMSK operand");
48922 unsigned NumElts = VecVT.getVectorNumElements();
48923 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48924
48925 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48926 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48927 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48928 if (!IsAnyOf && !IsAllOf)
48929 return SDValue();
48930
48931 // TODO: Check for more combining cases.
48932 // The number of uses of the MOVMSK result decides whether we combine below.
48933 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds are
48934 // restricted to the single-use case.
48935 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48936
48937 // See if we can peek through to a vector with a wider element type, if the
48938 // signbits extend down to all the sub-elements as well.
48939 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48940 // potential SimplifyDemandedBits/Elts cases.
48941 // If we looked through a truncate that discard bits, we can't do this
48942 // transform.
48943 // FIXME: We could do this transform for truncates that discarded bits by
48944 // inserting an AND mask between the new MOVMSK and the CMP.
48945 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48946 SDValue BC = peekThroughBitcasts(Vec);
48947 MVT BCVT = BC.getSimpleValueType();
48948 unsigned BCNumElts = BCVT.getVectorNumElements();
48949 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48950 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48951 BCNumEltBits > NumEltBits &&
48952 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48953 SDLoc DL(EFLAGS);
48954 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48955 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48956 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48957 DAG.getConstant(CmpMask, DL, MVT::i32));
48958 }
48959 }
48960
48961 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48962 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48963 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48964 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48965 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48966 SmallVector<SDValue> Ops;
48967 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48968 Ops.size() == 2) {
48969 SDLoc DL(EFLAGS);
48970 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48971 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48972 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48973 DAG.getBitcast(SubVT, Ops[0]),
48974 DAG.getBitcast(SubVT, Ops[1]));
48975 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48976 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48977 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48978 DAG.getConstant(CmpMask, DL, MVT::i32));
48979 }
48980 }
48981
48982 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48983 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48984 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48985 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48986 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48987 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48988 SDValue BC = peekThroughBitcasts(Vec);
48989 // Ensure MOVMSK was testing every signbit of BC.
48990 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48991 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48992 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48993 BC.getOperand(0), BC.getOperand(1));
48994 V = DAG.getBitcast(TestVT, V);
48995 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48996 }
48997 // Check for 256-bit split vector cases.
48998 if (BC.getOpcode() == ISD::AND &&
48999 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
49000 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
49001 SDValue LHS = BC.getOperand(0);
49002 SDValue RHS = BC.getOperand(1);
49003 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
49004 LHS.getOperand(0), LHS.getOperand(1));
49005 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49006 RHS.getOperand(0), RHS.getOperand(1));
49007 LHS = DAG.getBitcast(TestVT, LHS);
49008 RHS = DAG.getBitcast(TestVT, RHS);
49009 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49010 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49011 }
49012 }
49013 }
49014
49015 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49016 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49017 // sign bits prior to the comparison with zero unless we know that
49018 // the vXi16 splats the sign bit down to the lower i8 half.
49019 // TODO: Handle all_of patterns.
49020 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49021 SDValue VecOp0 = Vec.getOperand(0);
49022 SDValue VecOp1 = Vec.getOperand(1);
49023 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49024 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49025 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49026 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49027 SDLoc DL(EFLAGS);
49028 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49029 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49030 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49031 if (!SignExt0) {
49032 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49033 DAG.getConstant(0xAAAA, DL, MVT::i16));
49034 }
49035 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49036 DAG.getConstant(0, DL, MVT::i16));
49037 }
49038 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49039 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49040 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49041 (IsAnyOf || (SignExt0 && SignExt1))) {
49042 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49043 SDLoc DL(EFLAGS);
49044 SDValue Result = peekThroughBitcasts(Src);
49045 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49046 Result.getValueType().getVectorNumElements() <= NumElts) {
49047 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49048 Result.getOperand(0), Result.getOperand(1));
49049 V = DAG.getBitcast(MVT::v4i64, V);
49050 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49051 }
49052 Result = DAG.getBitcast(MVT::v32i8, Result);
49053 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49054 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49055 if (!SignExt0 || !SignExt1) {
49056 assert(IsAnyOf &&
49057 "Only perform v16i16 signmasks for any_of patterns");
49058 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49059 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49060 }
49061 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49062 DAG.getConstant(CmpMask, DL, MVT::i32));
49063 }
49064 }
49065 }
49066
49067 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49068 // Since we peek through a bitcast, we need to be careful if the base vector
49069 // type has smaller elements than the MOVMSK type. In that case, even if
49070 // all the elements are demanded by the shuffle mask, only the "high"
49071 // elements which have highbits that align with highbits in the MOVMSK vec
49072 // elements are actually demanded. A simplification of spurious operations
49073 // on the "low" elements takes place during other simplifications.
49074 //
49075 // For example:
49076 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all of the elements
49077 // are demanded, the shuffle swaps them around, so the MOVMSK result can change.
49078 //
49079 // To address this, we check that we can scale the shuffle mask to MOVMSK
49080 // element width (this will ensure "high" elements match). It's slightly overly
49081 // conservative, but fine for an edge case fold.
49082 SmallVector<int, 32> ShuffleMask;
49083 SmallVector<SDValue, 2> ShuffleInputs;
49084 if (NumElts <= CmpBits &&
49085 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49086 ShuffleMask, DAG) &&
49087 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49088 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49089 canScaleShuffleElements(ShuffleMask, NumElts)) {
49090 SDLoc DL(EFLAGS);
49091 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49092 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49093 Result =
49094 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49095 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49096 }
49097
49098 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49099 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49100 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49101 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49102 // iff every element is referenced.
49103 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49104 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49105 (NumEltBits == 32 || NumEltBits == 64)) {
49106 SDLoc DL(EFLAGS);
49107 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49108 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49109 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49110 SDValue LHS = Vec;
49111 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49112 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49113 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49114 DAG.getBitcast(FloatVT, LHS),
49115 DAG.getBitcast(FloatVT, RHS));
49116 }
49117
49118 return SDValue();
49119}
49120
49121/// Optimize an EFLAGS definition used according to the condition code \p CC
49122/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49123/// uses of chain values.
49124 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49125 SelectionDAG &DAG,
49126 const X86Subtarget &Subtarget) {
49127 if (CC == X86::COND_B)
49128 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49129 return Flags;
49130
49131 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49132 return R;
49133
49134 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49135 return R;
49136
49137 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49138 return R;
49139
49140 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49141 return R;
49142
49143 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49144}
49145
49146/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49147 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49148 TargetLowering::DAGCombinerInfo &DCI,
49149 const X86Subtarget &Subtarget) {
49150 SDLoc DL(N);
49151 EVT VT = N->getValueType(0);
49152 SDValue FalseOp = N->getOperand(0);
49153 SDValue TrueOp = N->getOperand(1);
49154 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49155 SDValue Cond = N->getOperand(3);
49156
49157 // cmov X, X, ?, ? --> X
49158 if (TrueOp == FalseOp)
49159 return TrueOp;
49160
49161 // Try to simplify the EFLAGS and condition code operands.
49162 // We can't always do this as FCMOV only supports a subset of X86 cond.
49163 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49164 if (!(FalseOp.getValueType() == MVT::f80 ||
49165 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49166 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49167 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49168 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49169 Flags};
49170 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49171 }
49172 }
49173
49174 // If this is a select between two integer constants, try to do some
49175 // optimizations. Note that the operands are ordered the opposite of SELECT
49176 // operands.
49177 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49178 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49179 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49180 // larger than FalseC (the false value).
49181 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49182         CC = X86::GetOppositeBranchCondition(CC);
49183         std::swap(TrueC, FalseC);
49184 std::swap(TrueOp, FalseOp);
49185 }
49186
49187 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49188 // This is efficient for any integer data type (including i8/i16) and
49189 // shift amount.
49190 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49191 Cond = getSETCC(CC, Cond, DL, DAG);
49192
49193 // Zero extend the condition if needed.
49194 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49195
49196 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49197 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49198 DAG.getConstant(ShAmt, DL, MVT::i8));
49199 return Cond;
49200 }
49201
49202       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49203 // for any integer data type, including i8/i16.
49204 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49205 Cond = getSETCC(CC, Cond, DL, DAG);
49206
49207 // Zero extend the condition if needed.
49208         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49209                            FalseC->getValueType(0), Cond);
49210 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49211 SDValue(FalseC, 0));
49212 return Cond;
49213 }
49214
49215 // Optimize cases that will turn into an LEA instruction. This requires
49216 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49217 if (VT == MVT::i32 || VT == MVT::i64) {
49218 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49219 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49220 "Implicit constant truncation");
49221
49222 bool isFastMultiplier = false;
49223 if (Diff.ult(10)) {
49224 switch (Diff.getZExtValue()) {
49225 default: break;
49226 case 1: // result = add base, cond
49227 case 2: // result = lea base( , cond*2)
49228 case 3: // result = lea base(cond, cond*2)
49229 case 4: // result = lea base( , cond*4)
49230 case 5: // result = lea base(cond, cond*4)
49231 case 8: // result = lea base( , cond*8)
49232 case 9: // result = lea base(cond, cond*8)
49233 isFastMultiplier = true;
49234 break;
49235 }
49236 }
49237
49238 if (isFastMultiplier) {
49239 Cond = getSETCC(CC, Cond, DL ,DAG);
49240 // Zero extend the condition if needed.
49241 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49242 Cond);
49243 // Scale the condition by the difference.
49244 if (Diff != 1)
49245 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49246 DAG.getConstant(Diff, DL, Cond.getValueType()));
49247
49248 // Add the base if non-zero.
49249 if (FalseC->getAPIntValue() != 0)
49250 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49251 SDValue(FalseC, 0));
49252 return Cond;
49253 }
49254 }
49255 }
49256 }
49257
49258 // Handle these cases:
49259   // (select (x != c), e, c) -> (select (x != c), e, x),
49260   // (select (x == c), c, e) -> (select (x == c), x, e)
49261 // where the c is an integer constant, and the "select" is the combination
49262 // of CMOV and CMP.
49263 //
49264   // The rationale for this change is that a conditional move from a constant
49265   // needs two instructions, whereas a conditional move from a register needs
49266   // only one instruction.
49267 //
49268 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49269 // some instruction-combining opportunities. This opt needs to be
49270 // postponed as late as possible.
49271 //
49272 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49273 // the DCI.xxxx conditions are provided to postpone the optimization as
49274 // late as possible.
49275
49276 ConstantSDNode *CmpAgainst = nullptr;
49277 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49278 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49279 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49280
49281 if (CC == X86::COND_NE &&
49282 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49283       CC = X86::COND_E;
49284       std::swap(TrueOp, FalseOp);
49285 }
49286
49287 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49288 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49289 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49290 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49291 }
49292 }
49293 }
49294
49295 // Transform:
49296 //
49297 // (cmov 1 T (uge T 2))
49298 //
49299 // to:
49300 //
49301 // (adc T 0 (sub T 1))
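  // (Editorial note: FalseOp is 1 and the condition is T u>= 2, so the CMOV
  // yields T for T >= 2 and 1 for T in {0,1}. The replacement computes T - 1,
  // which sets CF only when T == 0, and ADC T, 0, CF then produces T for
  // T >= 1 and 0 + 1 == 1 for T == 0, matching the CMOV in every case.)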
49302 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49303 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49304 SDValue Cond0 = Cond.getOperand(0);
49305 if (Cond0.getOpcode() == ISD::TRUNCATE)
49306 Cond0 = Cond0.getOperand(0);
49307 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49308 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49309 EVT CondVT = Cond->getValueType(0);
49310 // Subtract 1 and generate a carry.
49311 SDValue NewSub =
49312 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49313 DAG.getConstant(1, DL, CondVT));
49314 SDValue EFLAGS(NewSub.getNode(), 1);
49315 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49316 DAG.getConstant(0, DL, VT), EFLAGS);
49317 }
49318 }
49319
49320 // Fold and/or of setcc's to double CMOV:
49321 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49322 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49323 //
49324 // This combine lets us generate:
49325 // cmovcc1 (jcc1 if we don't have CMOV)
49326 // cmovcc2 (same)
49327 // instead of:
49328 // setcc1
49329 // setcc2
49330 // and/or
49331 // cmovne (jne if we don't have CMOV)
49332 // When we can't use the CMOV instruction, it might increase branch
49333 // mispredicts.
49334 // When we can use CMOV, or when there is no mispredict, this improves
49335 // throughput and reduces register pressure.
49336 //
49337 if (CC == X86::COND_NE) {
49338 SDValue Flags;
49339 X86::CondCode CC0, CC1;
49340 bool isAndSetCC;
49341 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49342 if (isAndSetCC) {
49343 std::swap(FalseOp, TrueOp);
49344         CC0 = X86::GetOppositeBranchCondition(CC0);
49345         CC1 = X86::GetOppositeBranchCondition(CC1);
49346       }
49347
49348 SDValue LOps[] = {FalseOp, TrueOp,
49349 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49350 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49351 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49352 Flags};
49353 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49354 return CMOV;
49355 }
49356 }
49357
49358 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49359 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49360 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49361 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49362 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49363 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49364 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49365 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49366 SDValue Add = TrueOp;
49367 SDValue Const = FalseOp;
49368 // Canonicalize the condition code for easier matching and output.
49369 if (CC == X86::COND_E)
49370 std::swap(Add, Const);
49371
49372 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49373 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49374 Add.getResNo() == 0 && Add.hasOneUse() &&
49375 Add.getOperand(1) == Cond.getOperand(0)) {
49376 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49377 Add.getOperand(1));
49378 }
49379
49380 // We might have replaced the constant in the cmov with the LHS of the
49381 // compare. If so change it to the RHS of the compare.
49382 if (Const == Cond.getOperand(0))
49383 Const = Cond.getOperand(1);
49384
49385 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49386 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49387 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49388 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49389 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49390 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49391 // This should constant fold.
49392 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49393 SDValue CMov =
49394 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49395 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49396 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49397 }
49398 }
49399
49400 return SDValue();
49401}
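// Editorial sketch (not part of the original file): compile-time checks of the
// scalar identities behind the constant-CMOV folds in combineCMov above, with
// 'Cond' standing for the zero-extended setcc result (0 or 1).
namespace {
constexpr unsigned cmovPow2(bool Cond) { return Cond ? 8u : 0u; }
constexpr unsigned cmovAdjacent(bool Cond, unsigned C) { return Cond ? C + 1 : C; }
constexpr unsigned cmovLea(bool Cond, unsigned F, unsigned D) {
  return Cond ? F + D : F;
}
static_assert(cmovPow2(true) == (1u << 3) && cmovPow2(false) == 0u,
              "C ? 8 : 0  ==  zext(C) << 3");
static_assert(cmovAdjacent(true, 41) == 41u + 1u &&
                  cmovAdjacent(false, 41) == 41u + 0u,
              "C ? cst+1 : cst  ==  zext(C) + cst");
static_assert(cmovLea(true, 100, 9) == 100u + 9u * 1u &&
                  cmovLea(false, 100, 9) == 100u + 9u * 0u,
              "C ? f+d : f  ==  f + d * zext(C), with d a fast LEA multiplier");
} // namespace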
49402
49403/// Different mul shrinking modes.
49405
49407 EVT VT = N->getOperand(0).getValueType();
49408 if (VT.getScalarSizeInBits() != 32)
49409 return false;
49410
49411 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49412 unsigned SignBits[2] = {1, 1};
49413 bool IsPositive[2] = {false, false};
49414 for (unsigned i = 0; i < 2; i++) {
49415 SDValue Opd = N->getOperand(i);
49416
49417 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49418 IsPositive[i] = DAG.SignBitIsZero(Opd);
49419 }
49420
49421 bool AllPositive = IsPositive[0] && IsPositive[1];
49422 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49423 // When ranges are from -128 ~ 127, use MULS8 mode.
49424   if (MinSignBits >= 25)
49425     Mode = ShrinkMode::MULS8;
49426   // When ranges are from 0 ~ 255, use MULU8 mode.
49427   else if (AllPositive && MinSignBits >= 24)
49428     Mode = ShrinkMode::MULU8;
49429   // When ranges are from -32768 ~ 32767, use MULS16 mode.
49430   else if (MinSignBits >= 17)
49431     Mode = ShrinkMode::MULS16;
49432   // When ranges are from 0 ~ 65535, use MULU16 mode.
49433   else if (AllPositive && MinSignBits >= 16)
49434     Mode = ShrinkMode::MULU16;
49435   else
49436 return false;
49437 return true;
49438}
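// Editorial sketch (not part of the original file): a constexpr model of the
// sign-bit thresholds used above. For an i32 value, N sign bits means the top
// N bits are all copies of the sign bit, so >= 25 sign bits is exactly "fits
// in i8" and >= 17 sign bits is exactly "fits in i16"; values known to be
// non-negative only need 24/16 sign bits for the unsigned i8/i16 ranges.
namespace {
constexpr unsigned modelNumSignBits32(int V) {
  unsigned U = static_cast<unsigned>(V);
  unsigned Sign = U >> 31; // 0 or 1
  unsigned Bits = 1;
  while (Bits < 32 && ((U >> (31 - Bits)) & 1u) == Sign)
    ++Bits;
  return Bits;
}
static_assert(modelNumSignBits32(127) == 25 && modelNumSignBits32(-128) == 25,
              "i8 range <=> at least 25 sign bits (MULS8)");
static_assert(modelNumSignBits32(255) == 24,
              "u8 range only guarantees 24 sign bits, hence the AllPositive check");
static_assert(modelNumSignBits32(32767) == 17 && modelNumSignBits32(-32768) == 17,
              "i16 range <=> at least 17 sign bits (MULS16)");
} // namespace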
49439
49440/// When the operands of vector mul are extended from smaller size values,
49441 /// like i8 and i16, the type of mul may be shrunk to generate more
49442/// efficient code. Two typical patterns are handled:
49443/// Pattern1:
49444/// %2 = sext/zext <N x i8> %1 to <N x i32>
49445/// %4 = sext/zext <N x i8> %3 to <N x i32>
49446 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49447/// %5 = mul <N x i32> %2, %4
49448///
49449/// Pattern2:
49450/// %2 = zext/sext <N x i16> %1 to <N x i32>
49451/// %4 = zext/sext <N x i16> %3 to <N x i32>
49452/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49453/// %5 = mul <N x i32> %2, %4
49454///
49455/// There are four mul shrinking modes:
49456/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49457 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49458/// generate pmullw+sext32 for it (MULS8 mode).
49459/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49460/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49461/// generate pmullw+zext32 for it (MULU8 mode).
49462/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49463/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49464/// generate pmullw+pmulhw for it (MULS16 mode).
49465/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49466/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49467/// generate pmullw+pmulhuw for it (MULU16 mode).
49468 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49469                                const X86Subtarget &Subtarget) {
49470 // Check for legality
49471   // pmullw/pmulhw on XMM registers require SSE2.
49472 if (!Subtarget.hasSSE2())
49473 return SDValue();
49474
49475 // Check for profitability
49476 // pmulld is supported since SSE41. It is better to use pmulld
49477 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49478 // the expansion.
49479 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49480 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49481 return SDValue();
49482
49483   ShrinkMode Mode;
49484   if (!canReduceVMulWidth(N, DAG, Mode))
49485 return SDValue();
49486
49487 SDValue N0 = N->getOperand(0);
49488 SDValue N1 = N->getOperand(1);
49489 EVT VT = N->getOperand(0).getValueType();
49490 unsigned NumElts = VT.getVectorNumElements();
49491 if ((NumElts % 2) != 0)
49492 return SDValue();
49493
49494 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49495
49496 // Shrink the operands of mul.
49497 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49498 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49499
49500 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49501 // lower part is needed.
49502 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49503   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49504     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49505                                                    : ISD::SIGN_EXTEND,
49506                        DL, VT, MulLo);
49507
49508 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49509 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49510 // the higher part is also needed.
49511 SDValue MulHi =
49512       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49513                   ReducedVT, NewN0, NewN1);
49514
49515 // Repack the lower part and higher part result of mul into a wider
49516 // result.
49517 // Generate shuffle functioning as punpcklwd.
49518 SmallVector<int, 16> ShuffleMask(NumElts);
49519 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49520 ShuffleMask[2 * i] = i;
49521 ShuffleMask[2 * i + 1] = i + NumElts;
49522 }
49523 SDValue ResLo =
49524 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49525 ResLo = DAG.getBitcast(ResVT, ResLo);
49526 // Generate shuffle functioning as punpckhwd.
49527 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49528 ShuffleMask[2 * i] = i + NumElts / 2;
49529 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49530 }
49531 SDValue ResHi =
49532 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49533 ResHi = DAG.getBitcast(ResVT, ResHi);
49534 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49535}
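// Editorial sketch (not part of the original file): a scalar model of the
// MULU16 path above. pmullw produces the low 16 bits of each 16x16 product and
// pmulhuw the high 16 bits; the punpcklwd/punpckhwd-style shuffles interleave
// them back into full 32-bit products, so the narrowed multiply is lossless.
namespace {
constexpr unsigned modelMulU16(unsigned short A, unsigned short B) {
  unsigned Full = static_cast<unsigned>(A) * B;
  unsigned Lo = Full & 0xFFFFu; // pmullw lane
  unsigned Hi = Full >> 16;     // pmulhuw lane
  return (Hi << 16) | Lo;       // interleaved repack
}
static_assert(modelMulU16(65535, 65535) == 65535u * 65535u, "");
static_assert(modelMulU16(300, 7) == 2100u, "");
} // namespace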
49536
49537 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49538                                  EVT VT, const SDLoc &DL) {
49539
49540 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49541 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49542 DAG.getConstant(Mult, DL, VT));
49543 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49544 DAG.getConstant(Shift, DL, MVT::i8));
49545 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49546 N->getOperand(0));
49547 return Result;
49548 };
49549
49550 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49551 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49552 DAG.getConstant(Mul1, DL, VT));
49553 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49554 DAG.getConstant(Mul2, DL, VT));
49555 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49556 N->getOperand(0));
49557 return Result;
49558 };
49559
49560 switch (MulAmt) {
49561 default:
49562 break;
49563 case 11:
49564 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49565 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49566 case 21:
49567 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49568 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49569 case 41:
49570 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49571 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49572 case 22:
49573 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49574 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49575 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49576 case 19:
49577 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49578 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49579 case 37:
49580 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49581 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49582 case 73:
49583 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49584 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49585 case 13:
49586 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49587 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49588 case 23:
49589 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49590 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49591 case 26:
49592 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49593 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49594 case 28:
49595 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49596 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49597 case 29:
49598 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49599 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49600 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49601 }
49602
49603   // Another trick: if the amount is a power of 2 plus 2/4/8, we can use a shift
49604   // followed by a single LEA.
49605   // First check that this is a sum of two powers of 2, because that's easy. Then
49606   // count how many trailing zeros there are up to the first set bit.
49607 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49608 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49609 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49610 if (ScaleShift >= 1 && ScaleShift < 4) {
49611 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49612 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49613 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49614 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49615 DAG.getConstant(ScaleShift, DL, MVT::i8));
49616 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49617 }
49618 }
49619
49620 return SDValue();
49621}
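// Editorial sketch (not part of the original file): compile-time checks of a
// few of the decompositions above, written on plain unsigned arithmetic. The
// mul-by-5/9 factors map onto LEA scaled addressing and the shift onto SHL.
namespace {
constexpr unsigned long long mulViaShlAdd(unsigned long long X, unsigned M,
                                          unsigned S) {
  return ((X * M) << S) + X; // combineMulShlAddOrSub(M, S, /*isAdd=*/true)
}
static_assert(mulViaShlAdd(7, 5, 1) == 7 * 11, "mul x, 11");
static_assert(mulViaShlAdd(7, 5, 2) == 7 * 21, "mul x, 21");
static_assert(mulViaShlAdd(7, 9, 3) == 7 * 73, "mul x, 73");
static_assert(((7ull * 9) * 3) + 7 == 7 * 28, "mul x, 28 via two LEA multiplies");
// The power-of-2 plus 2/4/8 trick: e.g. 36 == 32 + 4, one SHL plus one
// LEA-scaled add.
static_assert(((7ull << 5) + (7ull << 2)) == 7 * 36, "mul x, 36");
} // namespace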
49622
49623 // If the upper 17 bits of either operand are zero and the upper bits of the
49624 // other operand are all zero/sign bits, then we can use PMADDWD, which is
49625 // always at least as quick as PMULLD, except on KNL.
49626 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49627                                    SelectionDAG &DAG,
49628 const X86Subtarget &Subtarget) {
49629 if (!Subtarget.hasSSE2())
49630 return SDValue();
49631
49632 if (Subtarget.isPMADDWDSlow())
49633 return SDValue();
49634
49635 EVT VT = N->getValueType(0);
49636
49637 // Only support vXi32 vectors.
49638 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49639 return SDValue();
49640
49641 // Make sure the type is legal or can split/widen to a legal type.
49642 // With AVX512 but without BWI, we would need to split v32i16.
49643 unsigned NumElts = VT.getVectorNumElements();
49644 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49645 return SDValue();
49646
49647 // With AVX512 but without BWI, we would need to split v32i16.
49648 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49649 return SDValue();
49650
49651 SDValue N0 = N->getOperand(0);
49652 SDValue N1 = N->getOperand(1);
49653
49654   // If we are zero/sign extending two steps without SSE4.1, it's better to
49655 // reduce the vmul width instead.
49656 if (!Subtarget.hasSSE41() &&
49657 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49658 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49659 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49660 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49661 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49662 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49663 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49664 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49665 return SDValue();
49666
49667   // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49668 // the vmul width instead.
49669 if (!Subtarget.hasSSE41() &&
49670 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49671 N0.getOperand(0).getValueSizeInBits() > 128) &&
49672 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49673 N1.getOperand(0).getValueSizeInBits() > 128))
49674 return SDValue();
49675
49676 // Sign bits must extend down to the lowest i16.
49677 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49678 DAG.ComputeMaxSignificantBits(N0) > 16)
49679 return SDValue();
49680
49681 // At least one of the elements must be zero in the upper 17 bits, or can be
49682 // safely made zero without altering the final result.
49683 auto GetZeroableOp = [&](SDValue Op) {
49684 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49685 if (DAG.MaskedValueIsZero(Op, Mask17))
49686 return Op;
49687 // Mask off upper 16-bits of sign-extended constants.
49689 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49690 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49691 SDValue Src = Op.getOperand(0);
49692 // Convert sext(vXi16) to zext(vXi16).
49693 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49694 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49695 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49696 // which will expand the extension.
49697 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49698 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49699 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49700 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49701 }
49702 }
50703     // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49704 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49705 N->isOnlyUserOf(Op.getNode())) {
49706 SDValue Src = Op.getOperand(0);
49707 if (Src.getScalarValueSizeInBits() == 16)
49708 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49709 }
49710 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49711 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49712 N->isOnlyUserOf(Op.getNode())) {
49713 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49714 Op.getOperand(1));
49715 }
49716 return SDValue();
49717 };
49718 SDValue ZeroN0 = GetZeroableOp(N0);
49719 SDValue ZeroN1 = GetZeroableOp(N1);
49720 if (!ZeroN0 && !ZeroN1)
49721 return SDValue();
49722 N0 = ZeroN0 ? ZeroN0 : N0;
49723 N1 = ZeroN1 ? ZeroN1 : N1;
49724
49725 // Use SplitOpsAndApply to handle AVX splitting.
49726 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49727                            ArrayRef<SDValue> Ops) {
49728     MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49729 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49730 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49731 DAG.getBitcast(OpVT, Ops[0]),
49732 DAG.getBitcast(OpVT, Ops[1]));
49733 };
49734 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49735}
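// Editorial sketch (not part of the original file): a scalar model of why the
// upper-17-bit-zero requirement makes VPMADDWD act as a plain i32 multiply.
// pmaddwd splits each i32 lane into two signed i16 halves and computes
// lo*lo + hi*hi; if one operand's upper 17 bits are zero, its high half is 0
// and its low half is a non-negative i16, and if the other operand
// sign-extends from i16, the pair sum collapses to the ordinary product.
// (Assumes the usual arithmetic right shift for negative ints.)
namespace {
constexpr int modelPMADDWDLane(int A, int B) {
  short ALo = static_cast<short>(A), AHi = static_cast<short>(A >> 16);
  short BLo = static_cast<short>(B), BHi = static_cast<short>(B >> 16);
  return ALo * BLo + AHi * BHi;
}
static_assert(modelPMADDWDLane(-12345, 0x3FFF) == -12345 * 0x3FFF, "");
static_assert(modelPMADDWDLane(0x7FFF, 123) == 0x7FFF * 123, "");
} // namespace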
49736
49737 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49738                                   const X86Subtarget &Subtarget) {
49739 if (!Subtarget.hasSSE2())
49740 return SDValue();
49741
49742 EVT VT = N->getValueType(0);
49743
49744 // Only support vXi64 vectors.
49745 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49746 VT.getVectorNumElements() < 2 ||
49748 return SDValue();
49749
49750 SDValue N0 = N->getOperand(0);
49751 SDValue N1 = N->getOperand(1);
49752
49753   // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49754 // 32-bits. We can lower with this if the sign bits stretch that far.
49755 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49756 DAG.ComputeNumSignBits(N1) > 32) {
49757 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49758                             ArrayRef<SDValue> Ops) {
49759       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49760 };
49761 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49762 /*CheckBWI*/ false);
49763 }
49764
49765 // If the upper bits are zero we can use a single pmuludq.
49766 APInt Mask = APInt::getHighBitsSet(64, 32);
49767 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49768 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49769                              ArrayRef<SDValue> Ops) {
49770       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49771 };
49772 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49773 /*CheckBWI*/ false);
49774 }
49775
49776 return SDValue();
49777}
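// Editorial sketch (not part of the original file): scalar models of the two
// cases above. PMULDQ multiplies the sign-extended low 32 bits of each 64-bit
// lane and PMULUDQ the zero-extended low 32 bits, so either is an exact i64
// multiply once the corresponding precondition on the upper bits holds.
namespace {
constexpr long long modelPMULDQ(long long A, long long B) {
  return static_cast<long long>(static_cast<int>(A)) * static_cast<int>(B);
}
constexpr unsigned long long modelPMULUDQ(unsigned long long A,
                                          unsigned long long B) {
  return (A & 0xFFFFFFFFull) * (B & 0xFFFFFFFFull);
}
static_assert(modelPMULDQ(-5, 100000) == -5LL * 100000,
              "valid when the sign bits reach past bit 32");
static_assert(modelPMULUDQ(0xFFFFFFFFull, 2) == 0x1FFFFFFFEull,
              "valid when the upper 32 bits are zero");
} // namespace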
49778
49779 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49780                           TargetLowering::DAGCombinerInfo &DCI,
49781                           const X86Subtarget &Subtarget) {
49782 EVT VT = N->getValueType(0);
49783 SDLoc DL(N);
49784
49785 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49786 return V;
49787
49788 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49789 return V;
49790
49791 if (DCI.isBeforeLegalize() && VT.isVector())
49792 return reduceVMULWidth(N, DL, DAG, Subtarget);
49793
49794 if (VT != MVT::i64 && VT != MVT::i32 &&
49795 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49796 return SDValue();
49797
49798 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49799 if (!Known1.isConstant())
49800 return SDValue();
49801
49802 const APInt &C = Known1.getConstant();
49803 if (C.isZero())
49804 return DAG.getConstant(0, DL, VT);
49805
49806 if (C.isAllOnes())
49807 return DAG.getNegative(N->getOperand(0), DL, VT);
49808
49809 if (isPowerOf2_64(C.getZExtValue()))
49810 return SDValue();
49811
49812 // Optimize a single multiply with constant into two operations in order to
49813 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49814   if (!MulConstantOptimization)
49815     return SDValue();
49816
49817 // An imul is usually smaller than the alternative sequence.
49818   if (DAG.getMachineFunction().getFunction().hasOptSize())
49819     return SDValue();
49820
49821 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49822 return SDValue();
49823
49824 int64_t SignMulAmt = C.getSExtValue();
49825 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49826 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49827
49828 SDValue NewMul = SDValue();
49829 if (VT == MVT::i64 || VT == MVT::i32) {
49830 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49831 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49832 DAG.getConstant(AbsMulAmt, DL, VT));
49833 if (SignMulAmt < 0)
49834 NewMul = DAG.getNegative(NewMul, DL, VT);
49835
49836 return NewMul;
49837 }
49838
49839 uint64_t MulAmt1 = 0;
49840 uint64_t MulAmt2 = 0;
49841 if ((AbsMulAmt % 9) == 0) {
49842 MulAmt1 = 9;
49843 MulAmt2 = AbsMulAmt / 9;
49844 } else if ((AbsMulAmt % 5) == 0) {
49845 MulAmt1 = 5;
49846 MulAmt2 = AbsMulAmt / 5;
49847 } else if ((AbsMulAmt % 3) == 0) {
49848 MulAmt1 = 3;
49849 MulAmt2 = AbsMulAmt / 3;
49850 }
49851
49852 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49853 if (MulAmt2 &&
49854 (isPowerOf2_64(MulAmt2) ||
49855 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49856
49857 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49858 N->user_begin()->getOpcode() == ISD::ADD))
49859         // If the second multiplier is pow2, issue it first. We want the multiply
49860 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49861 // use is an add. Only do this for positive multiply amounts since the
49862 // negate would prevent it from being used as an address mode anyway.
49863 std::swap(MulAmt1, MulAmt2);
49864
49865 if (isPowerOf2_64(MulAmt1))
49866 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49867 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49868 else
49869 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49870 DAG.getConstant(MulAmt1, DL, VT));
49871
49872 if (isPowerOf2_64(MulAmt2))
49873 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49874 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49875 else
49876 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49877 DAG.getConstant(MulAmt2, DL, VT));
49878
49879 // Negate the result.
49880 if (SignMulAmt < 0)
49881 NewMul = DAG.getNegative(NewMul, DL, VT);
49882 } else if (!Subtarget.slowLEA())
49883 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49884 }
49885 if (!NewMul) {
49886 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49887 if (isPowerOf2_64(AbsMulAmt - 1)) {
49888 // (mul x, 2^N + 1) => (add (shl x, N), x)
49889 NewMul = DAG.getNode(
49890 ISD::ADD, DL, VT, N->getOperand(0),
49891 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49892 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49893 if (SignMulAmt < 0)
49894 NewMul = DAG.getNegative(NewMul, DL, VT);
49895 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49896 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49897 NewMul =
49898 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49899 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49900 // To negate, reverse the operands of the subtract.
49901 if (SignMulAmt < 0)
49902 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49903 else
49904 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49905 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49906 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49907 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49908 NewMul =
49909 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49910 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49911 NewMul = DAG.getNode(
49912 ISD::ADD, DL, VT, NewMul,
49913 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49914 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49915 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49916 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49917 NewMul =
49918 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49919 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49920 NewMul = DAG.getNode(
49921 ISD::SUB, DL, VT, NewMul,
49922 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49923 } else if (SignMulAmt >= 0 && VT.isVector() &&
49924 Subtarget.fastImmVectorShift()) {
49925 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49926 uint64_t ShiftAmt1;
49927 std::optional<unsigned> Opc;
49928 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49929 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49930 Opc = ISD::ADD;
49931 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49932 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49933 Opc = ISD::SUB;
49934 }
49935
49936 if (Opc) {
49937 SDValue Shift1 =
49938 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49939 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49940 SDValue Shift2 =
49941 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49942 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49943 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49944 }
49945 }
49946 }
49947
49948 return NewMul;
49949}
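// Editorial sketch (not part of the original file): compile-time checks of the
// shift/add decompositions used when the multiplier is not handled above.
static_assert(((7ull << 4) + 7ull) == 7 * 17, "mul x, 2^N+1 -> (shl x, N) + x");
static_assert(((7ull << 4) - 7ull) == 7 * 15, "mul x, 2^N-1 -> (shl x, N) - x");
static_assert(((7ull << 4) + (7ull + 7ull)) == 7 * 18,
              "mul x, 2^N+2 -> (shl x, N) + (x + x)");
static_assert(((7ull << 4) - (7ull + 7ull)) == 7 * 14,
              "mul x, 2^N-2 -> (shl x, N) - (x + x)");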
49950
49951// Try to form a MULHU or MULHS node by looking for
49952// (srl (mul ext, ext), 16)
49953// TODO: This is X86 specific because we want to be able to handle wide types
49954// before type legalization. But we can only do it if the vector will be
49955// legalized via widening/splitting. Type legalization can't handle promotion
49956// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49957// combiner.
49958 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49959                                    const SDLoc &DL,
49960 const X86Subtarget &Subtarget) {
49961 using namespace SDPatternMatch;
49962 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49963 "SRL or SRA node is required here!");
49964
49965 if (!Subtarget.hasSSE2())
49966 return SDValue();
49967
49968 // Input type should be at least vXi32.
49969 EVT VT = N->getValueType(0);
49970 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49971 return SDValue();
49972
49973 // The operation must be a multiply shifted right by 16.
49974 SDValue LHS, RHS;
49975 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49976 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49977 return SDValue();
49978
49979 unsigned ExtOpc = LHS.getOpcode();
49980 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49981 RHS.getOpcode() != ExtOpc)
49982 return SDValue();
49983
49984 // Peek through the extends.
49985 LHS = LHS.getOperand(0);
49986 RHS = RHS.getOperand(0);
49987
49988 // Ensure the input types match.
49989 EVT MulVT = LHS.getValueType();
49990 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49991 return SDValue();
49992
49993 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49994 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49995
49996 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49997 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49998}
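// Editorial sketch (not part of the original file): a scalar model of the
// MULHU/MULHS forms matched above; the helper names are illustrative only.
// The signed variant assumes the usual arithmetic right shift for negative
// ints.
namespace {
constexpr unsigned modelMULHU16(unsigned short A, unsigned short B) {
  return (static_cast<unsigned>(A) * B) >> 16; // srl(mul(zext A, zext B), 16)
}
constexpr int modelMULHS16(short A, short B) {
  return (static_cast<int>(A) * B) >> 16;      // sra(mul(sext A, sext B), 16)
}
static_assert(modelMULHU16(0xFFFF, 0xFFFF) == 0xFFFE, "");
static_assert(modelMULHS16(-32768, 2) == -1, "");
} // namespace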
49999
50000 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
50001                                 const X86Subtarget &Subtarget) {
50002 using namespace llvm::SDPatternMatch;
50003 SDValue N0 = N->getOperand(0);
50004 SDValue N1 = N->getOperand(1);
50005   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50006   EVT VT = N0.getValueType();
50007 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50008 SDLoc DL(N);
50009
50010 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50011 // with out-of-bounds clamping.
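  // (Editorial note: X86ISD::VSHLV maps to the AVX2 VPSLLV* instructions,
  // which already produce zero for any lane whose shift amount is >= the
  // element width, so the vselect that clamps out-of-range amounts to zero is
  // redundant and can be dropped.)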
50012 if (N0.getOpcode() == ISD::VSELECT &&
50013 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50014 SDValue Cond = N0.getOperand(0);
50015 SDValue N00 = N0.getOperand(1);
50016 SDValue N01 = N0.getOperand(2);
50017 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50019 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50021 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50022 }
50023 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50025 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50027 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50028 }
50029 }
50030
50031 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50032 // since the result of setcc_c is all zero's or all ones.
50033 if (VT.isInteger() && !VT.isVector() &&
50034 N1C && N0.getOpcode() == ISD::AND &&
50035 N0.getOperand(1).getOpcode() == ISD::Constant) {
50036 SDValue N00 = N0.getOperand(0);
50037 APInt Mask = N0.getConstantOperandAPInt(1);
50038 Mask <<= N1C->getAPIntValue();
50039 bool MaskOK = false;
50040 // We can handle cases concerning bit-widening nodes containing setcc_c if
50041 // we carefully interrogate the mask to make sure we are semantics
50042 // preserving.
50043 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50044 // of the underlying setcc_c operation if the setcc_c was zero extended.
50045 // Consider the following example:
50046 // zext(setcc_c) -> i32 0x0000FFFF
50047 // c1 -> i32 0x0000FFFF
50048 // c2 -> i32 0x00000001
50049 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50050 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50051 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50052 MaskOK = true;
50053 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50054              N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50055     MaskOK = true;
50056 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50057 N00.getOpcode() == ISD::ANY_EXTEND) &&
50058              N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50059     MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50060 }
50061 if (MaskOK && Mask != 0)
50062 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50063 }
50064
50065 return SDValue();
50066}
50067
50068 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50069                                            const X86Subtarget &Subtarget) {
50070 using namespace llvm::SDPatternMatch;
50071 SDValue N0 = N->getOperand(0);
50072 SDValue N1 = N->getOperand(1);
50073 EVT VT = N0.getValueType();
50074 unsigned Size = VT.getSizeInBits();
50075 SDLoc DL(N);
50076
50077 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50078 return V;
50079
50080 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50081 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50082 SDValue ShrAmtVal;
50083 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50085 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50086 }
50087
50088 // fold (SRA (SHL X, ShlConst), SraConst)
50089 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50090 // or (sext_in_reg X)
50091 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50092 // depending on relation between SraConst and ShlConst.
50093 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50094 // us to do the sext_in_reg from corresponding bit.
50095
50096 // sexts in X86 are MOVs. The MOVs have the same code size
50097 // as above SHIFTs (only SHIFT on 1 has lower code size).
50098 // However the MOVs have 2 advantages to a SHIFT:
50099 // 1. MOVs can write to a register that differs from source
50100 // 2. MOVs accept memory operands
50101
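  // For example (editorial): with Size == 32 and ShlConst == 24 we can use an
  // i8 sext_in_reg, so (sra (shl X, 24), 24) becomes (sext_in_reg X, i8),
  // (sra (shl X, 24), 22) becomes (shl (sext_in_reg X, i8), 2), and
  // (sra (shl X, 24), 26) becomes (sra (sext_in_reg X, i8), 2).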
50102 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50103 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50104       N0.getOperand(1).getOpcode() != ISD::Constant)
50105     return SDValue();
50106
50107 SDValue N00 = N0.getOperand(0);
50108 SDValue N01 = N0.getOperand(1);
50109 APInt ShlConst = N01->getAsAPIntVal();
50110 APInt SraConst = N1->getAsAPIntVal();
50111 EVT CVT = N1.getValueType();
50112
50113 if (CVT != N01.getValueType())
50114 return SDValue();
50115 if (SraConst.isNegative())
50116 return SDValue();
50117
50118 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50119 unsigned ShiftSize = SVT.getSizeInBits();
50120 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50121 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50122 continue;
50123 SDValue NN =
50124 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50125 if (SraConst.eq(ShlConst))
50126 return NN;
50127 if (SraConst.ult(ShlConst))
50128 return DAG.getNode(ISD::SHL, DL, VT, NN,
50129 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50130 return DAG.getNode(ISD::SRA, DL, VT, NN,
50131 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50132 }
50133 return SDValue();
50134}
50135
50136 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50137                                         TargetLowering::DAGCombinerInfo &DCI,
50138                                         const X86Subtarget &Subtarget) {
50139 using namespace llvm::SDPatternMatch;
50140 SDValue N0 = N->getOperand(0);
50141 SDValue N1 = N->getOperand(1);
50142 EVT VT = N0.getValueType();
50143 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50144 SDLoc DL(N);
50145
50146 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50147 return V;
50148
50149 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50150 // with out-of-bounds clamping.
50151 if (N0.getOpcode() == ISD::VSELECT &&
50152 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50153 SDValue Cond = N0.getOperand(0);
50154 SDValue N00 = N0.getOperand(1);
50155 SDValue N01 = N0.getOperand(2);
50156 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50158 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50160 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50161 }
50162 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50164 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50166 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50167 }
50168 }
50169
50170 // Only do this on the last DAG combine as it can interfere with other
50171 // combines.
50172 if (!DCI.isAfterLegalizeDAG())
50173 return SDValue();
50174
50175 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50176 // TODO: This is a generic DAG combine that became an x86-only combine to
50177 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50178 // and-not ('andn').
50179 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50180 return SDValue();
50181
50182 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50183 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50184 if (!ShiftC || !AndC)
50185 return SDValue();
50186
50187 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50188 // transform should reduce code size. It may also enable secondary transforms
50189 // from improved known-bits analysis or instruction selection.
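  // For example (editorial): (srl (and X, 0xFF00), 8) becomes
  // (and (srl X, 8), 0xFF). The original 16-significant-bit mask needs a full
  // 32-bit immediate, while the shifted mask fits in 8 bits, enabling shorter
  // byte-sized encodings and better known-bits information.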
50190 APInt MaskVal = AndC->getAPIntValue();
50191
50192 // If this can be matched by a zero extend, don't optimize.
50193 if (MaskVal.isMask()) {
50194 unsigned TO = MaskVal.countr_one();
50195 if (TO >= 8 && isPowerOf2_32(TO))
50196 return SDValue();
50197 }
50198
50199 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50200 unsigned OldMaskSize = MaskVal.getSignificantBits();
50201 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50202 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50203 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50204 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50205 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50206 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50207 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50208 }
50209 return SDValue();
50210}
50211
50212 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50213                                          const X86Subtarget &Subtarget) {
50214 unsigned Opcode = N->getOpcode();
50215 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50216
50217 SDLoc DL(N);
50218 EVT VT = N->getValueType(0);
50219 SDValue N0 = N->getOperand(0);
50220 SDValue N1 = N->getOperand(1);
50221 EVT SrcVT = N0.getValueType();
50222
50223 SDValue BC0 =
50224 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50225 SDValue BC1 =
50226 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50227
50228 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50229 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50230 // truncation trees that help us avoid lane crossing shuffles.
50231 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50232 // TODO: We don't handle vXf64 shuffles yet.
50233 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50234 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50236 SmallVector<int> ShuffleMask, ScaledMask;
50237 SDValue Vec = peekThroughBitcasts(BCSrc);
50238 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50240 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50241 // shuffle to a v4X64 width - we can probably relax this in the future.
50242 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50243 ShuffleOps[0].getValueType().is256BitVector() &&
50244 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50245 SDValue Lo, Hi;
50246 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50247 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50248 Lo = DAG.getBitcast(SrcVT, Lo);
50249 Hi = DAG.getBitcast(SrcVT, Hi);
50250 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50251 Res = DAG.getBitcast(ShufVT, Res);
50252 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50253 return DAG.getBitcast(VT, Res);
50254 }
50255 }
50256 }
50257 }
50258
50259 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50260 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50261 // If either/both ops are a shuffle that can scale to v2x64,
50262 // then see if we can perform this as a v4x32 post shuffle.
50263 SmallVector<SDValue> Ops0, Ops1;
50264 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50265 bool IsShuf0 =
50266 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50267 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50268 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50269 bool IsShuf1 =
50270 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50271 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50272 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50273 if (IsShuf0 || IsShuf1) {
50274 if (!IsShuf0) {
50275 Ops0.assign({BC0});
50276 ScaledMask0.assign({0, 1});
50277 }
50278 if (!IsShuf1) {
50279 Ops1.assign({BC1});
50280 ScaledMask1.assign({0, 1});
50281 }
50282
50283 SDValue LHS, RHS;
50284 int PostShuffle[4] = {-1, -1, -1, -1};
50285 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50286 if (M < 0)
50287 return true;
50288 Idx = M % 2;
50289 SDValue Src = Ops[M / 2];
50290 if (!LHS || LHS == Src) {
50291 LHS = Src;
50292 return true;
50293 }
50294 if (!RHS || RHS == Src) {
50295 Idx += 2;
50296 RHS = Src;
50297 return true;
50298 }
50299 return false;
50300 };
50301 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50302 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50303 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50304 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50305 LHS = DAG.getBitcast(SrcVT, LHS);
50306 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50307 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50308 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50309 Res = DAG.getBitcast(ShufVT, Res);
50310 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50311 return DAG.getBitcast(VT, Res);
50312 }
50313 }
50314 }
50315
50316 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50317 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50318 SmallVector<int> Mask0, Mask1;
50319 SmallVector<SDValue> Ops0, Ops1;
50320 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50321 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50322 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50323 !Ops0.empty() && !Ops1.empty() &&
50324 all_of(Ops0,
50325 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50326 all_of(Ops1,
50327 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50328 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50329 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50330 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50331 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50332 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50333 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50334 if ((Op00 == Op11) && (Op01 == Op10)) {
50335 std::swap(Op10, Op11);
50337 }
50338 if ((Op00 == Op10) && (Op01 == Op11)) {
50339 const int Map[4] = {0, 2, 1, 3};
50340 SmallVector<int, 4> ShuffleMask(
50341 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50342 Map[ScaledMask1[1]]});
50343 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50344 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50345 DAG.getBitcast(SrcVT, Op01));
50346 Res = DAG.getBitcast(ShufVT, Res);
50347 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50348 return DAG.getBitcast(VT, Res);
50349 }
50350 }
50351 }
50352
50353 return SDValue();
50354}
50355
50356 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50357                                  TargetLowering::DAGCombinerInfo &DCI,
50358                                  const X86Subtarget &Subtarget) {
50359 unsigned Opcode = N->getOpcode();
50360 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50361 "Unexpected pack opcode");
50362
50363 EVT VT = N->getValueType(0);
50364 SDValue N0 = N->getOperand(0);
50365 SDValue N1 = N->getOperand(1);
50366 unsigned NumDstElts = VT.getVectorNumElements();
50367 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50368 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50369 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50370 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50371 "Unexpected PACKSS/PACKUS input type");
50372
50373 bool IsSigned = (X86ISD::PACKSS == Opcode);
50374
50375 // Constant Folding.
50376 APInt UndefElts0, UndefElts1;
50377 SmallVector<APInt, 32> EltBits0, EltBits1;
50378 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50379 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50380 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50381 /*AllowWholeUndefs*/ true,
50382 /*AllowPartialUndefs*/ true) &&
50383 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50384 /*AllowWholeUndefs*/ true,
50385 /*AllowPartialUndefs*/ true)) {
50386 unsigned NumLanes = VT.getSizeInBits() / 128;
50387 unsigned NumSrcElts = NumDstElts / 2;
50388 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50389 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50390
50391 APInt Undefs(NumDstElts, 0);
50392 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50393 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50394 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50395 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50396 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50397 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50398
50399 if (UndefElts[SrcIdx]) {
50400 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50401 continue;
50402 }
50403
50404 APInt &Val = EltBits[SrcIdx];
50405 if (IsSigned) {
50406 // PACKSS: Truncate signed value with signed saturation.
50407 // Source values less than dst minint are saturated to minint.
50408 // Source values greater than dst maxint are saturated to maxint.
50409 Val = Val.truncSSat(DstBitsPerElt);
50410 } else {
50411 // PACKUS: Truncate signed value with unsigned saturation.
50412 // Source values less than zero are saturated to zero.
50413 // Source values greater than dst maxuint are saturated to maxuint.
50414 // NOTE: This is different from APInt::truncUSat.
50415 if (Val.isIntN(DstBitsPerElt))
50416 Val = Val.trunc(DstBitsPerElt);
50417 else if (Val.isNegative())
50418 Val = APInt::getZero(DstBitsPerElt);
50419 else
50420 Val = APInt::getAllOnes(DstBitsPerElt);
50421 }
50422 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50423 }
50424 }
50425
50426 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50427 }
50428
50429 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50430 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50431 return V;
50432
50433 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50434 // Currently limit this to allsignbits cases only.
50435 if (IsSigned &&
50436 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50437 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50438 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50439 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50440 if (Not0 && Not1) {
50441 SDLoc DL(N);
50442 MVT SrcVT = N0.getSimpleValueType();
50443 SDValue Pack =
50444 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50445 DAG.getBitcast(SrcVT, Not1));
50446 return DAG.getNOT(DL, Pack, VT);
50447 }
50448 }
50449
50450 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50451 // truncate to create a larger truncate.
50452 if (Subtarget.hasAVX512() &&
50453 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50454 N0.getOperand(0).getValueType() == MVT::v8i32) {
50455 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50456 (!IsSigned &&
50457 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50458 if (Subtarget.hasVLX())
50459 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50460
50461 // Widen input to v16i32 so we can truncate that.
50462 SDLoc dl(N);
50463 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50464 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50465 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50466 }
50467 }
50468
50469 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50470 if (VT.is128BitVector()) {
50471 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50472 SDValue Src0, Src1;
50473 if (N0.getOpcode() == ExtOpc &&
50475 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50476 Src0 = N0.getOperand(0);
50477 }
50478 if (N1.getOpcode() == ExtOpc &&
50480 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50481 Src1 = N1.getOperand(0);
50482 }
50483 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50484 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50485 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50486 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50487 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50488 }
50489
50490 // Try again with pack(*_extend_vector_inreg, undef).
50491 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50492                                     : ISD::ZERO_EXTEND_VECTOR_INREG;
50493     if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50494 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50495 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50496 DAG);
50497 }
50498
50499 // Attempt to combine as shuffle.
50500 SDValue Op(N, 0);
50501 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50502 return Res;
50503
50504 return SDValue();
50505}
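// Editorial sketch (not part of the original file): scalar models of the
// PACKSS/PACKUS saturation rules applied by the constant folding above, shown
// for the i16 -> i8 case.
namespace {
constexpr signed char modelPACKSSWB(short V) {
  return V < -128 ? -128 : (V > 127 ? 127 : static_cast<signed char>(V));
}
constexpr unsigned char modelPACKUSWB(short V) {
  return V < 0 ? 0 : (V > 255 ? 255 : static_cast<unsigned char>(V));
}
static_assert(modelPACKSSWB(1000) == 127 && modelPACKSSWB(-1000) == -128, "");
static_assert(modelPACKUSWB(300) == 255 && modelPACKUSWB(-7) == 0, "");
static_assert(modelPACKUSWB(42) == 42 && modelPACKSSWB(-42) == -42, "");
} // namespace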
50506
50507 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50508                                     TargetLowering::DAGCombinerInfo &DCI,
50509                                     const X86Subtarget &Subtarget) {
50510 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50511 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50512 "Unexpected horizontal add/sub opcode");
50513
50514 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50515 MVT VT = N->getSimpleValueType(0);
50516 SDValue LHS = N->getOperand(0);
50517 SDValue RHS = N->getOperand(1);
50518
50519 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50520 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50521 LHS.getOpcode() == RHS.getOpcode() &&
50522 LHS.getValueType() == RHS.getValueType() &&
50523 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50524 SDValue LHS0 = LHS.getOperand(0);
50525 SDValue LHS1 = LHS.getOperand(1);
50526 SDValue RHS0 = RHS.getOperand(0);
50527 SDValue RHS1 = RHS.getOperand(1);
50528 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50529 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50530 SDLoc DL(N);
50531 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50532 LHS0.isUndef() ? LHS1 : LHS0,
50533 RHS0.isUndef() ? RHS1 : RHS0);
50534 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50535 Res = DAG.getBitcast(ShufVT, Res);
50536 SDValue NewLHS =
50537 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50538 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50539 SDValue NewRHS =
50540 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50541 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50542 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50543 DAG.getBitcast(VT, NewRHS));
50544 }
50545 }
50546 }
50547
50548 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50549 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50550 return V;
50551
50552 return SDValue();
50553}
50554
50555 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50556                                      TargetLowering::DAGCombinerInfo &DCI,
50557                                      const X86Subtarget &Subtarget) {
50558 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50559 X86ISD::VSRL == N->getOpcode()) &&
50560 "Unexpected shift opcode");
50561 EVT VT = N->getValueType(0);
50562 SDValue N0 = N->getOperand(0);
50563 SDValue N1 = N->getOperand(1);
50564
50565 // Shift zero -> zero.
50566   if (ISD::isBuildVectorAllZeros(N0.getNode()))
50567     return DAG.getConstant(0, SDLoc(N), VT);
50568
50569 // Detect constant shift amounts.
50570 APInt UndefElts;
50571 SmallVector<APInt, 32> EltBits;
50572 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50573 /*AllowWholeUndefs*/ true,
50574 /*AllowPartialUndefs*/ false)) {
50575 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50576 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50577 EltBits[0].getZExtValue(), DAG);
50578 }
50579
50580 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50581 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50582 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50583 return SDValue(N, 0);
50584
50585 return SDValue();
50586}
50587
50588 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50589                                      TargetLowering::DAGCombinerInfo &DCI,
50590                                      const X86Subtarget &Subtarget) {
50591 unsigned Opcode = N->getOpcode();
50592 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50593 X86ISD::VSRLI == Opcode) &&
50594 "Unexpected shift opcode");
50595 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50596 EVT VT = N->getValueType(0);
50597 SDValue N0 = N->getOperand(0);
50598 SDValue N1 = N->getOperand(1);
50599 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50600 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50601 "Unexpected value type");
50602 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50603
50604 // (shift undef, X) -> 0
50605 if (N0.isUndef())
50606 return DAG.getConstant(0, SDLoc(N), VT);
50607
50608 // Out of range logical bit shifts are guaranteed to be zero.
50609 // Out of range arithmetic bit shifts splat the sign bit.
50610 unsigned ShiftVal = N->getConstantOperandVal(1);
50611 if (ShiftVal >= NumBitsPerElt) {
50612 if (LogicalShift)
50613 return DAG.getConstant(0, SDLoc(N), VT);
50614 ShiftVal = NumBitsPerElt - 1;
50615 }
50616
50617 // (shift X, 0) -> X
50618 if (!ShiftVal)
50619 return N0;
50620
50621 // (shift 0, C) -> 0
50622   if (ISD::isBuildVectorAllZeros(N0.getNode()))
50623     // N0 is all zeros or undef. We guarantee that the bits shifted into the
50624 // result are all zeros, not undef.
50625 return DAG.getConstant(0, SDLoc(N), VT);
50626
50627 // (VSRAI -1, C) -> -1
50628 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50629 // N0 is all ones or undef. We guarantee that the bits shifted into the
50630 // result are all ones, not undef.
50631 return DAG.getAllOnesConstant(SDLoc(N), VT);
50632
50633 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50634 unsigned NewShiftVal = Amt0 + Amt1;
50635 if (NewShiftVal >= NumBitsPerElt) {
50636 // Out of range logical bit shifts are guaranteed to be zero.
50637 // Out of range arithmetic bit shifts splat the sign bit.
50638 if (LogicalShift)
50639 return DAG.getConstant(0, SDLoc(N), VT);
50640 NewShiftVal = NumBitsPerElt - 1;
50641 }
50642 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50643 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50644 };
50645
50646 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50647 if (Opcode == N0.getOpcode())
50648 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50649
50650 // (shl (add X, X), C) -> (shl X, (C + 1))
50651 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50652 N0.getOperand(0) == N0.getOperand(1))
50653 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50654
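  // For example (illustrative), for v4i32 shifts:
  //   (VSRLI (VSRLI X, 3), 5)   -> (VSRLI X, 8)
  //   (VSRLI (VSRLI X, 20), 20) -> 0             (20 + 20 >= 32, logical)
  //   (VSRAI (VSRAI X, 20), 20) -> (VSRAI X, 31) (clamped to NumBitsPerElt - 1)
  //   (VSHLI (ADD X, X), 2)     -> (VSHLI X, 3)  (X + X == X << 1)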
50655 // We can decode 'whole byte' logical bit shifts as shuffles.
50656 if (LogicalShift && (ShiftVal % 8) == 0) {
50657 SDValue Op(N, 0);
50658 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50659 return Res;
50660 }
50661
50662 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50663 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50664 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50665 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
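  // (Illustrative: psllq(X,63) moves bit 0 of each i64 element into the sign
  // bit of its upper i32 half, pshufd {1,1,3,3} copies that upper half into
  // both i32 lanes of the element, and psrad(...,31) splats the sign bit -
  // i.e. each i64 element becomes 0 or -1 according to its original bit 0.
  // The replacement reaches the same all-zeros/all-ones result using only
  // 32-bit shifts plus a pshufd splat of the even (low) lanes.)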
50666 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50667 N0.getOpcode() == X86ISD::PSHUFD &&
50668 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50669 N0->hasOneUse()) {
50670     SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50671     if (BC.getOpcode() == X86ISD::VSHLI &&
50672 BC.getScalarValueSizeInBits() == 64 &&
50673 BC.getConstantOperandVal(1) == 63) {
50674 SDLoc DL(N);
50675 SDValue Src = BC.getOperand(0);
50676 Src = DAG.getBitcast(VT, Src);
50677 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50678 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50679 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50680 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50681 return Src;
50682 }
50683 }
50684
50685 auto TryConstantFold = [&](SDValue V) {
50686 APInt UndefElts;
50687 SmallVector<APInt, 32> EltBits;
50688 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50689 /*AllowWholeUndefs*/ true,
50690 /*AllowPartialUndefs*/ true))
50691 return SDValue();
50692 assert(EltBits.size() == VT.getVectorNumElements() &&
50693 "Unexpected shift value type");
50694 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50695     // created an undef input due to no input bits being demanded, but the user
50696 // still expects 0 in other bits.
50697 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50698 APInt &Elt = EltBits[i];
50699 if (UndefElts[i])
50700 Elt = 0;
50701 else if (X86ISD::VSHLI == Opcode)
50702 Elt <<= ShiftVal;
50703 else if (X86ISD::VSRAI == Opcode)
50704 Elt.ashrInPlace(ShiftVal);
50705 else
50706 Elt.lshrInPlace(ShiftVal);
50707 }
50708 // Reset undef elements since they were zeroed above.
50709 UndefElts = 0;
50710 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50711 };
50712
50713 // Constant Folding.
50714 if (N->isOnlyUserOf(N0.getNode())) {
50715 if (SDValue C = TryConstantFold(N0))
50716 return C;
50717
50718 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50719 // Don't break NOT patterns.
50720     SDValue BC = peekThroughOneUseBitcasts(N0);
50721     if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50722 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50723         !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50724       if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50725 SDLoc DL(N);
50726 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50727 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50728 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50729 }
50730 }
50731 }
50732
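  // For example (illustrative), for v4i32:
  //   (VSHLI <1, 2, undef, 4>, 8) -> <0x100, 0x200, 0, 0x400>
  //   (undef lanes fold to 0 so the bits shifted into the result stay defined)
  //   (VSRLI (AND X, <0xF0F0,...>), 4) -> (AND (VSRLI X, 4), <0x0F0F,...>)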
50733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50734 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50735 DCI))
50736 return SDValue(N, 0);
50737
50738 return SDValue();
50739}
50740
50741 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50742                                    TargetLowering::DAGCombinerInfo &DCI,
50743                                    const X86Subtarget &Subtarget) {
50744 EVT VT = N->getValueType(0);
50745 unsigned Opcode = N->getOpcode();
50746 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50747 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50748 Opcode == ISD::INSERT_VECTOR_ELT) &&
50749 "Unexpected vector insertion");
50750
50751 SDValue Vec = N->getOperand(0);
50752 SDValue Scl = N->getOperand(1);
50753 SDValue Idx = N->getOperand(2);
50754
50755 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50756 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50757 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50758
50759 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50760 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50762 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50763 APInt::getAllOnes(NumBitsPerElt), DCI))
50764 return SDValue(N, 0);
50765 }
50766
50767 // Attempt to combine insertion patterns to a shuffle.
50768 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50769 SDValue Op(N, 0);
50770 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50771 return Res;
50772 }
50773
50774 return SDValue();
50775}
50776
50777/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50778/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50779/// OR -> CMPNEQSS.
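/// For example (illustrative), an 'oeq' f64 compare is lowered to one
/// flags-producing FP compare whose result is read as COND_E and COND_NP and
/// then ANDed; this combine rewrites it as CMPEQSD, which produces an
/// all-ones/all-zeros value from which bit 0 is extracted as the i1 result.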
50780 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50781                                    TargetLowering::DAGCombinerInfo &DCI,
50782                                    const X86Subtarget &Subtarget) {
50783 unsigned opcode;
50784
50785 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50786 // we're requiring SSE2 for both.
50787 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50788 SDValue N0 = N->getOperand(0);
50789 SDValue N1 = N->getOperand(1);
50790 SDValue CMP0 = N0.getOperand(1);
50791 SDValue CMP1 = N1.getOperand(1);
50792 SDLoc DL(N);
50793
50794 // The SETCCs should both refer to the same CMP.
50795 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50796 return SDValue();
50797
50798 SDValue CMP00 = CMP0->getOperand(0);
50799 SDValue CMP01 = CMP0->getOperand(1);
50800 EVT VT = CMP00.getValueType();
50801
50802 if (VT == MVT::f32 || VT == MVT::f64 ||
50803 (VT == MVT::f16 && Subtarget.hasFP16())) {
50804 bool ExpectingFlags = false;
50805 // Check for any users that want flags:
50806 for (const SDNode *U : N->users()) {
50807 if (ExpectingFlags)
50808 break;
50809
50810 switch (U->getOpcode()) {
50811 default:
50812 case ISD::BR_CC:
50813 case ISD::BRCOND:
50814 case ISD::SELECT:
50815 ExpectingFlags = true;
50816 break;
50817 case ISD::CopyToReg:
50818 case ISD::SIGN_EXTEND:
50819 case ISD::ZERO_EXTEND:
50820 case ISD::ANY_EXTEND:
50821 break;
50822 }
50823 }
50824
50825 if (!ExpectingFlags) {
50826 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50827 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50828
50829 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50830 X86::CondCode tmp = cc0;
50831 cc0 = cc1;
50832 cc1 = tmp;
50833 }
50834
50835 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50836 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50837 // FIXME: need symbolic constants for these magic numbers.
50838 // See X86ATTInstPrinter.cpp:printSSECC().
50839 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50840 if (Subtarget.hasAVX512()) {
50841 SDValue FSetCC =
50842 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50843 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50844 // Need to fill with zeros to ensure the bitcast will produce zeroes
50845 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50846 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50847 DAG.getConstant(0, DL, MVT::v16i1),
50848 FSetCC, DAG.getVectorIdxConstant(0, DL));
50849 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50850 N->getSimpleValueType(0));
50851 }
50852 SDValue OnesOrZeroesF =
50853 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50854 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50855
50856 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50857 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50858
50859 if (is64BitFP && !Subtarget.is64Bit()) {
50860 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50861 // 64-bit integer, since that's not a legal type. Since
50862 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50863 // bits, but can do this little dance to extract the lowest 32 bits
50864 // and work with those going forward.
50865 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50866 MVT::v2f64, OnesOrZeroesF);
50867 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50868 OnesOrZeroesF =
50869 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50870 DAG.getVectorIdxConstant(0, DL));
50871 IntVT = MVT::i32;
50872 }
50873
50874 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50875 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50876 DAG.getConstant(1, DL, IntVT));
50877 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50878 ANDed);
50879 return OneBitOfTruth;
50880 }
50881 }
50882 }
50883 }
50884 return SDValue();
50885}
50886
50887/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50888 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50889                                       SelectionDAG &DAG) {
50890 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50891
50892 MVT VT = N->getSimpleValueType(0);
50893 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50894 return SDValue();
50895
50896 SDValue X, Y;
50897 SDValue N0 = N->getOperand(0);
50898 SDValue N1 = N->getOperand(1);
50899
50900 if (SDValue Not = IsNOT(N0, DAG)) {
50901 X = Not;
50902 Y = N1;
50903 } else if (SDValue Not = IsNOT(N1, DAG)) {
50904 X = Not;
50905 Y = N0;
50906 } else
50907 return SDValue();
50908
50909 X = DAG.getBitcast(VT, X);
50910 Y = DAG.getBitcast(VT, Y);
50911 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50912}
50913
50914/// Try to fold:
50915/// and (vector_shuffle<Z,...,Z>
50916/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50917/// ->
50918/// andnp (vector_shuffle<Z,...,Z>
50919/// (insert_vector_elt undef, X, Z), undef), Y
50920 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50921                                     const X86Subtarget &Subtarget) {
50922 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50923
50924 EVT VT = N->getValueType(0);
50925   // Do not split 256 and 512 bit vectors with SSE2 as they overwrite the
50926   // original value and require extra moves.
50927 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50928 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50929 return SDValue();
50930
50931 auto GetNot = [&DAG](SDValue V) {
50932     auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50933     // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50934 // end-users are ISD::AND including cases
50935 // (and(extract_vector_element(SVN), Y)).
50936 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50937 !SVN->getOperand(1).isUndef()) {
50938 return SDValue();
50939 }
50940 SDValue IVEN = SVN->getOperand(0);
50941 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50942 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50943 return SDValue();
50944 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50945 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50946 return SDValue();
50947 SDValue Src = IVEN.getOperand(1);
50948 if (SDValue Not = IsNOT(Src, DAG)) {
50949 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50950 SDValue NotIVEN =
50951           DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50952                       IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50953 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50954 SVN->getOperand(1), SVN->getMask());
50955 }
50956 return SDValue();
50957 };
50958
50959 SDValue X, Y;
50960 SDValue N0 = N->getOperand(0);
50961 SDValue N1 = N->getOperand(1);
50962 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50963
50964 if (SDValue Not = GetNot(N0)) {
50965 X = Not;
50966 Y = N1;
50967 } else if (SDValue Not = GetNot(N1)) {
50968 X = Not;
50969 Y = N0;
50970 } else
50971 return SDValue();
50972
50973 X = DAG.getBitcast(VT, X);
50974 Y = DAG.getBitcast(VT, Y);
50975 SDLoc DL(N);
50976
50977 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50978 // AVX2.
50979 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50980       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50981     SDValue LoX, HiX;
50982 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50983 SDValue LoY, HiY;
50984 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50985 EVT SplitVT = LoX.getValueType();
50986 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50987 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50988 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50989 }
50990
50991 if (TLI.isTypeLegal(VT))
50992 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50993
50994 return SDValue();
50995}
50996
50997// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50998// logical operations, like in the example below.
50999// or (and (truncate x, truncate y)),
51000// (xor (truncate z, build_vector (constants)))
51001// Given a target type \p VT, we generate
51002// or (and x, y), (xor z, zext(build_vector (constants)))
51003// given x, y and z are of type \p VT. We can do so, if operands are either
51004// truncates from VT types, the second operand is a vector of constants, can
51005// be recursively promoted or is an existing extension we can extend further.
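// For example (illustrative), with VT = v8i32 and a narrow v8i16 node
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector (constants))
// is rebuilt on the original v8i32 operands as
//   or (and x, y), (xor z, zext(build_vector (constants)))
// so the logic happens in the wide type and the caller can fold the
// surrounding extend instead of mixing XMM- and YMM-sized values.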
51006 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51007                                      SelectionDAG &DAG,
51008 const X86Subtarget &Subtarget,
51009 unsigned Depth) {
51010 // Limit recursion to avoid excessive compile times.
51011   if (Depth >= SelectionDAG::MaxRecursionDepth)
51012     return SDValue();
51013
51014 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51015 return SDValue();
51016
51017 SDValue N0 = N.getOperand(0);
51018 SDValue N1 = N.getOperand(1);
51019
51020 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51021 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51022 return SDValue();
51023
51024 if (SDValue NN0 =
51025 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51026 N0 = NN0;
51027 else {
51028 // The left side has to be a 'trunc'.
51029 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51030 N0.getOperand(0).getValueType() == VT;
51031 if (LHSTrunc)
51032 N0 = N0.getOperand(0);
51033 else
51034 return SDValue();
51035 }
51036
51037 if (SDValue NN1 =
51038 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51039 N1 = NN1;
51040 else {
51041 // The right side has to be a 'trunc', a (foldable) constant or an
51042 // existing extension we can extend further.
51043 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51044 N1.getOperand(0).getValueType() == VT;
51045 if (RHSTrunc)
51046 N1 = N1.getOperand(0);
51047 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51048 Subtarget.hasInt256() && N1.hasOneUse())
51049 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51050 else if (SDValue Cst =
51051                  DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51052       N1 = Cst;
51053 else
51054 return SDValue();
51055 }
51056
51057 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51058}
51059
51060// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51061// register. In most cases we actually compare or select YMM-sized registers
51062// and mixing the two types creates horrible code. This method optimizes
51063// some of the transition sequences.
51064// Even with AVX-512 this is still useful for removing casts around logical
51065// operations on vXi1 mask types.
51066 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51067                                      SelectionDAG &DAG,
51068 const X86Subtarget &Subtarget) {
51069 EVT VT = N.getValueType();
51070 assert(VT.isVector() && "Expected vector type");
51071 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51072 N.getOpcode() == ISD::ZERO_EXTEND ||
51073 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51074
51075 SDValue Narrow = N.getOperand(0);
51076 EVT NarrowVT = Narrow.getValueType();
51077
51078 // Generate the wide operation.
51079 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51080 if (!Op)
51081 return SDValue();
51082 switch (N.getOpcode()) {
51083 default: llvm_unreachable("Unexpected opcode");
51084 case ISD::ANY_EXTEND:
51085 return Op;
51086 case ISD::ZERO_EXTEND:
51087 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51088 case ISD::SIGN_EXTEND:
51089 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51090 Op, DAG.getValueType(NarrowVT));
51091 }
51092}
51093
51094static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51095 unsigned FPOpcode;
51096 switch (Opcode) {
51097 // clang-format off
51098 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51099 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51100 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51101 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51102 // clang-format on
51103 }
51104 return FPOpcode;
51105}
51106
51107/// If both input operands of a logic op are being cast from floating-point
51108/// types or FP compares, try to convert this into a floating-point logic node
51109/// to avoid unnecessary moves from SSE to integer registers.
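/// For example (illustrative):
///   xor (bitcast f32 %a to i32), (bitcast f32 %b to i32)
/// becomes
///   bitcast (FXOR %a, %b) to i32
/// keeping the value in an SSE register instead of bouncing through a GPR.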
51110static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51111 SDValue N0, SDValue N1,
51112 SelectionDAG &DAG,
51113                                         TargetLowering::DAGCombinerInfo &DCI,
51114                                         const X86Subtarget &Subtarget) {
51115 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51116 "Unexpected bit opcode");
51117
51118 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51119 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51120 return SDValue();
51121
51122 SDValue N00 = N0.getOperand(0);
51123 SDValue N10 = N1.getOperand(0);
51124 EVT N00Type = N00.getValueType();
51125 EVT N10Type = N10.getValueType();
51126
51127 // Ensure that both types are the same and are legal scalar fp types.
51128 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51129 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51130 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51131 return SDValue();
51132
51133 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51134 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51135 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51136 return DAG.getBitcast(VT, FPLogic);
51137 }
51138
51139 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51140 !N1.hasOneUse())
51141 return SDValue();
51142
51143 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51144 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51145
51146 // The vector ISA for FP predicates is incomplete before AVX, so converting
51147 // COMIS* to CMPS* may not be a win before AVX.
51148 if (!Subtarget.hasAVX() &&
51149 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51150 return SDValue();
51151
51152 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51153 // and vector logic:
51154 // logic (setcc N00, N01), (setcc N10, N11) -->
51155 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51156 unsigned NumElts = 128 / N00Type.getSizeInBits();
51157 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51158 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51159 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51160 SDValue N01 = N0.getOperand(1);
51161 SDValue N11 = N1.getOperand(1);
51162 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51163 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51164 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51165 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51166 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51167 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51168 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51169 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51170}
51171
51172// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51173// to reduce XMM->GPR traffic.
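// For example (illustrative):
//   and (movmsk v4f32 X), (movmsk v4f32 Y)  ->  movmsk (fand X, Y)
// so only one XMM->GPR transfer remains instead of two transfers plus a
// scalar AND.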
51174static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51175 SDValue N1, SelectionDAG &DAG) {
51176 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51177 "Unexpected bit opcode");
51178
51179 // Both operands must be single use MOVMSK.
51180 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51181 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51182 return SDValue();
51183
51184 SDValue Vec0 = N0.getOperand(0);
51185 SDValue Vec1 = N1.getOperand(0);
51186 EVT VecVT0 = Vec0.getValueType();
51187 EVT VecVT1 = Vec1.getValueType();
51188
51189 // Both MOVMSK operands must be from vectors of the same size and same element
51190   // size, but it's OK for an fp/int diff.
51191 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51192 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51193 return SDValue();
51194
51195 unsigned VecOpc =
51196       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51197   SDValue Result =
51198 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51199 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51200}
51201
51202// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51203// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51204// handles in InstCombine.
51205static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51206 SDValue N0, SDValue N1,
51207 SelectionDAG &DAG) {
51208 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51209 "Unexpected bit opcode");
51210
51211 // Both operands must be single use.
51212 if (!N0.hasOneUse() || !N1.hasOneUse())
51213 return SDValue();
51214
51215 // Search for matching shifts.
51216   SDValue BC0 = peekThroughOneUseBitcasts(N0);
51217   SDValue BC1 = peekThroughOneUseBitcasts(N1);
51218
51219 unsigned BCOpc = BC0.getOpcode();
51220 EVT BCVT = BC0.getValueType();
51221 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51222 return SDValue();
51223
51224 switch (BCOpc) {
51225 case X86ISD::VSHLI:
51226 case X86ISD::VSRLI:
51227 case X86ISD::VSRAI: {
51228 if (BC0.getOperand(1) != BC1.getOperand(1))
51229 return SDValue();
51230 SDValue BitOp =
51231 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51232 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51233 return DAG.getBitcast(VT, Shift);
51234 }
51235 }
51236
51237 return SDValue();
51238}
51239
51240// Attempt to fold:
51241// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51242 // TODO: Add PACKUS handling.
51243static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51244 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51245 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51246 "Unexpected bit opcode");
51247
51248 // Both operands must be single use.
51249 if (!N0.hasOneUse() || !N1.hasOneUse())
51250 return SDValue();
51251
51252 // Search for matching packs.
51253   N0 = peekThroughOneUseBitcasts(N0);
51254   N1 = peekThroughOneUseBitcasts(N1);
51255
51256 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51257 return SDValue();
51258
51259 MVT DstVT = N0.getSimpleValueType();
51260 if (DstVT != N1.getSimpleValueType())
51261 return SDValue();
51262
51263 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51264 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51265
51266 // Limit to allsignbits packing.
51267 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51268 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51269 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51270 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51271 return SDValue();
51272
51273 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51274 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51275 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51276}
51277
51278 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
51279 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51280/// with a shift-right to eliminate loading the vector constant mask value.
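/// For example (illustrative), if X is all-ones/all-zeros per element (e.g. a
/// PCMPGT result), then for v4i32:
///   and X, <0xFF, 0xFF, 0xFF, 0xFF>  ->  (VSRLI X, 24)
/// since shifting the 0/-1 value right by 32-8 bits reproduces the low 8-bit
/// mask without materializing the constant.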
51281 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51282                                      SelectionDAG &DAG,
51283 const X86Subtarget &Subtarget) {
51284 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51285 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51286 EVT VT = Op0.getValueType();
51287 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51288 return SDValue();
51289
51290 // Try to convert an "is positive" signbit masking operation into arithmetic
51291 // shift and "andn". This saves a materialization of a -1 vector constant.
51292 // The "is negative" variant should be handled more generally because it only
51293 // requires "and" rather than "andn":
51294 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51295 //
51296 // This is limited to the original type to avoid producing even more bitcasts.
51297 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51298 // will be profitable.
51299 if (N->getValueType(0) == VT &&
51300 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51301 SDValue X, Y;
51302 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51303 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51304 X = Op1.getOperand(0);
51305 Y = Op0;
51306 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51307 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51308 X = Op0.getOperand(0);
51309 Y = Op1;
51310 }
51311 if (X && Y) {
51312 SDValue Sra =
51313           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51314                                      VT.getScalarSizeInBits() - 1, DAG);
51315 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51316 }
51317 }
51318
51319 APInt SplatVal;
51320 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51321 return SDValue();
51322
51323 // Don't prevent creation of ANDN.
51324 if (isBitwiseNot(Op0))
51325 return SDValue();
51326
51327 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51328 return SDValue();
51329
51330 unsigned EltBitWidth = VT.getScalarSizeInBits();
51331 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51332 return SDValue();
51333
51334 unsigned ShiftVal = SplatVal.countr_one();
51335 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51336 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51337 return DAG.getBitcast(N->getValueType(0), Shift);
51338}
51339
51340// Get the index node from the lowered DAG of a GEP IR instruction with one
51341// indexing dimension.
51342 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51343   if (Ld->isIndexed())
51344 return SDValue();
51345
51346 SDValue Base = Ld->getBasePtr();
51347 if (Base.getOpcode() != ISD::ADD)
51348 return SDValue();
51349
51350 SDValue ShiftedIndex = Base.getOperand(0);
51351 if (ShiftedIndex.getOpcode() != ISD::SHL)
51352 return SDValue();
51353
51354 return ShiftedIndex.getOperand(0);
51355}
51356
51357static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51358 return Subtarget.hasBMI2() &&
51359 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51360}
51361
51362/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51363/// This undoes the inverse fold performed in InstCombine
51364 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51365                                             SelectionDAG &DAG) {
51366 using namespace llvm::SDPatternMatch;
51367 MVT VT = N->getSimpleValueType(0);
51368 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51369 return SDValue();
51370
51371 SDValue X, Y, Z;
51372 if (sd_match(N, m_And(m_Value(X),
51373 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51374 // Don't fold if Y or Z are constants to prevent infinite loops.
51375     if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51376         !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51377       return DAG.getNode(
51378 ISD::AND, DL, VT, X,
51379 DAG.getNOT(
51380 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51381 }
51382
51383 return SDValue();
51384}
51385
51386 // This function recognizes cases where the X86 bzhi instruction can replace
51387 // an 'and-load' sequence.
51388 // In the case of loading an integer value from an array of constants defined
51389 // as follows:
51390//
51391// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51392//
51393// then applying a bitwise and on the result with another input.
51394// It's equivalent to performing bzhi (zero high bits) on the input, with the
51395// same index of the load.
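// For example (illustrative), for a 32-bit element type and 0 < idx < 32:
//   x & array[idx] == x & ((1u << idx) - 1) == x & (0xFFFFFFFFu >> (32 - idx))
// which is the zero-high-bits-from-index operation BZHI performs, and is the
// srl form emitted below.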
51396 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51397                                     const X86Subtarget &Subtarget) {
51398 MVT VT = Node->getSimpleValueType(0);
51399 SDLoc dl(Node);
51400
51401 // Check if subtarget has BZHI instruction for the node's type
51402 if (!hasBZHI(Subtarget, VT))
51403 return SDValue();
51404
51405 // Try matching the pattern for both operands.
51406 for (unsigned i = 0; i < 2; i++) {
51407 // continue if the operand is not a load instruction
51408 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51409 if (!Ld)
51410 continue;
51411 const Value *MemOp = Ld->getMemOperand()->getValue();
51412 if (!MemOp)
51413 continue;
51414 // Get the Node which indexes into the array.
51415     SDValue Index = getIndexFromUnindexedLoad(Ld);
51416     if (!Index)
51417 continue;
51418
51419 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51420 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51421 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51422 Constant *Init = GV->getInitializer();
51423 Type *Ty = Init->getType();
51424           if (!Ty->isArrayTy() ||
51425               !Ty->getArrayElementType()->isIntegerTy() ||
51426 Ty->getArrayElementType()->getScalarSizeInBits() !=
51427 VT.getSizeInBits() ||
51428 Ty->getArrayNumElements() >
51429 Ty->getArrayElementType()->getScalarSizeInBits())
51430 continue;
51431
51432 // Check if the array's constant elements are suitable to our case.
51433 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51434 bool ConstantsMatch = true;
51435 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51436 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51437 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51438 ConstantsMatch = false;
51439 break;
51440 }
51441 }
51442 if (!ConstantsMatch)
51443 continue;
51444
51445 // Do the transformation (For 32-bit type):
51446 // -> (and (load arr[idx]), inp)
51447 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51448 // that will be replaced with one bzhi instruction.
51449 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51450 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51451
51452 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51453 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51454 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51455
51456 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51457 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51458 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51459 }
51460 }
51461 }
51462 }
51463 return SDValue();
51464}
51465
51466// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51467// Where C is a mask containing the same number of bits as the setcc and
51468// where the setcc will freely 0 upper bits of k-register. We can replace the
51469// undef in the concat with 0s and remove the AND. This mainly helps with
51470// v2i1/v4i1 setcc being casted to scalar.
51471 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51472                                              const X86Subtarget &Subtarget) {
51473 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51474
51475 EVT VT = N->getValueType(0);
51476
51477 // Make sure this is an AND with constant. We will check the value of the
51478 // constant later.
51479 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51480 if (!C1)
51481 return SDValue();
51482
51483 // This is implied by the ConstantSDNode.
51484 assert(!VT.isVector() && "Expected scalar VT!");
51485
51486 SDValue Src = N->getOperand(0);
51487 if (!Src.hasOneUse())
51488 return SDValue();
51489
51490 // (Optionally) peek through any_extend().
51491 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51492 if (!Src.getOperand(0).hasOneUse())
51493 return SDValue();
51494 Src = Src.getOperand(0);
51495 }
51496
51497 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51498 return SDValue();
51499
51500 Src = Src.getOperand(0);
51501 EVT SrcVT = Src.getValueType();
51502
51503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51504 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51505 !TLI.isTypeLegal(SrcVT))
51506 return SDValue();
51507
51508 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51509 return SDValue();
51510
51511 // We only care about the first subvector of the concat, we expect the
51512 // other subvectors to be ignored due to the AND if we make the change.
51513 SDValue SubVec = Src.getOperand(0);
51514 EVT SubVecVT = SubVec.getValueType();
51515
51516 // The RHS of the AND should be a mask with as many bits as SubVec.
51517 if (!TLI.isTypeLegal(SubVecVT) ||
51518 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51519 return SDValue();
51520
51521 // First subvector should be a setcc with a legal result type or a
51522 // AND containing at least one setcc with a legal result type.
51523 auto IsLegalSetCC = [&](SDValue V) {
51524 if (V.getOpcode() != ISD::SETCC)
51525 return false;
51526 EVT SetccVT = V.getOperand(0).getValueType();
51527 if (!TLI.isTypeLegal(SetccVT) ||
51528 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51529 return false;
51530 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51531 return false;
51532 return true;
51533 };
51534 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51535 (IsLegalSetCC(SubVec.getOperand(0)) ||
51536 IsLegalSetCC(SubVec.getOperand(1))))))
51537 return SDValue();
51538
51539 // We passed all the checks. Rebuild the concat_vectors with zeroes
51540 // and cast it back to VT.
51541 SDLoc dl(N);
51542 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51543 DAG.getConstant(0, dl, SubVecVT));
51544 Ops[0] = SubVec;
51545 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51546 Ops);
51547 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51548 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51549}
51550
51551 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51552                                 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51553 // We don't want to go crazy with the recursion here. This isn't a super
51554 // important optimization.
51555 static constexpr unsigned kMaxDepth = 2;
51556
51557 // Only do this re-ordering if op has one use.
51558 if (!Op.hasOneUse())
51559 return SDValue();
51560
51561 SDLoc DL(Op);
51562   // If we hit another associative op, recurse further.
51563 if (Op.getOpcode() == Opc) {
51564 // Done recursing.
51565 if (Depth++ >= kMaxDepth)
51566 return SDValue();
51567
51568 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51569 if (SDValue R =
51570 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51571 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51572 Op.getOperand(1 - OpIdx));
51573
51574 } else if (Op.getOpcode() == ISD::SUB) {
51575 if (Opc == ISD::AND) {
51576 // BLSI: (and x, (sub 0, x))
51577 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51578 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51579 }
51580 // Opc must be ISD::AND or ISD::XOR
51581 // BLSR: (and x, (sub x, 1))
51582 // BLSMSK: (xor x, (sub x, 1))
51583 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51584 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51585
51586 } else if (Op.getOpcode() == ISD::ADD) {
51587 // Opc must be ISD::AND or ISD::XOR
51588 // BLSR: (and x, (add x, -1))
51589 // BLSMSK: (xor x, (add x, -1))
51590 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51591 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51592 }
51593 return SDValue();
51594}
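// For example (illustrative), with x = 0b10100:
//   BLSR:   x & (x - 1) = 0b10100 & 0b10011    = 0b10000 (clear lowest set bit)
//   BLSMSK: x ^ (x - 1) = 0b10100 ^ 0b10011    = 0b00111 (mask up to lowest set bit)
//   BLSI:   x & (0 - x) = 0b10100 & 0b...01100 = 0b00100 (isolate lowest set bit)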
51595
51596 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51597                                  const X86Subtarget &Subtarget) {
51598 EVT VT = N->getValueType(0);
51599 // Make sure this node is a candidate for BMI instructions.
51600 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51601 (VT != MVT::i32 && VT != MVT::i64))
51602 return SDValue();
51603
51604 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51605
51606 // Try and match LHS and RHS.
51607 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51608 if (SDValue OpMatch =
51609 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51610 N->getOperand(1 - OpIdx), 0))
51611 return OpMatch;
51612 return SDValue();
51613}
51614
51615/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
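/// For any X != 0, XOR(X, NEG(X)) sets exactly the bits strictly above the
/// lowest set bit of X, which is the complement of BLSMSK(X) = X ^ (X - 1);
/// e.g. (illustrative) X = 0b01100: X ^ -X = 0b...11000 and BLSMSK(X) =
/// 0b00111. For X == 0 both sides of the fold are 0, so AND(Y, XOR(X, NEG(X)))
/// can always be rewritten as ANDN(Y, BLSMSK(X)).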
51616 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51617                                        SelectionDAG &DAG,
51618 const X86Subtarget &Subtarget) {
51619 using namespace llvm::SDPatternMatch;
51620
51621 EVT VT = And->getValueType(0);
51622 // Make sure this node is a candidate for BMI instructions.
51623 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51624 return SDValue();
51625
51626 SDValue X;
51627 SDValue Y;
51628   if (!sd_match(And,
51629                 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51630                       m_Value(Y))))
51631 return SDValue();
51632
51633 SDValue BLSMSK =
51634 DAG.getNode(ISD::XOR, DL, VT, X,
51635 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51636 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51637 return AndN;
51638}
51639
51640 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51641                                         SelectionDAG &DAG,
51642                                         TargetLowering::DAGCombinerInfo &DCI,
51643                                         const X86Subtarget &ST) {
51644 // cmp(setcc(cc, X), 0)
51645 // brcond ne
51646 // ->
51647 // X
51648 // brcond cc
51649
51650 // sub(setcc(cc, X), 1)
51651 // brcond ne
51652 // ->
51653 // X
51654 // brcond ~cc
51655 //
51656 // if only flag has users
51657
51658 SDValue SetCC = N->getOperand(0);
51659
51660 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51661 return SDValue();
51662
51663 // Check the only user of flag is `brcond ne`.
51664 SDNode *BrCond = *Flag->user_begin();
51665 if (BrCond->getOpcode() != X86ISD::BRCOND)
51666 return SDValue();
51667 unsigned CondNo = 2;
51668 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51669       X86::COND_NE)
51670     return SDValue();
51671
51672 SDValue X = SetCC.getOperand(1);
51673   // sub has two results while X only has one. DAG combine assumes the value
51674 // type matches.
51675 if (N->getOpcode() == X86ISD::SUB)
51676 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51677
51678 SDValue CCN = SetCC.getOperand(0);
51679 X86::CondCode CC =
51680 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51681   X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51682   // Update CC for the consumer of the flag.
51683 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51684 // checking if the second condition evaluates to true. When comparing the
51685   // result with 1, we are checking if the second condition evaluates to false.
51686   SmallVector<SDValue> Ops(BrCond->op_values());
51687 if (isNullConstant(N->getOperand(1)))
51688 Ops[CondNo] = CCN;
51689 else if (isOneConstant(N->getOperand(1)))
51690 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51691 else
51692 llvm_unreachable("expect constant 0 or 1");
51693
51694 SDValue NewBrCond =
51695 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51696 // Avoid self-assign error b/c CC1 can be `e/ne`.
51697 if (BrCond != NewBrCond.getNode())
51698 DCI.CombineTo(BrCond, NewBrCond);
51699 return X;
51700}
51701
51702 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51703                                         TargetLowering::DAGCombinerInfo &DCI,
51704                                         const X86Subtarget &ST) {
51705 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51706 // ->
51707 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51708
51709 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51710 // ->
51711 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51712 //
51713 // where cflags is determined by cc1.
51714
51715 if (!ST.hasCCMP())
51716 return SDValue();
51717
51718 SDValue SetCC0 = N->getOperand(0);
51719 SDValue SetCC1 = N->getOperand(1);
51720 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51721 SetCC1.getOpcode() != X86ISD::SETCC)
51722 return SDValue();
51723
51724 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51725 SDValue Op = V.getOperand(1);
51726 unsigned Opc = Op.getOpcode();
51727 if (Opc == X86ISD::SUB)
51728 return X86ISD::CCMP;
51729 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51730 return X86ISD::CTEST;
51731 return 0U;
51732 };
51733
51734 unsigned NewOpc = 0;
51735
51736 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51737 // appear on the right.
51738 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51739 std::swap(SetCC0, SetCC1);
51740 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51741 return SDValue();
51742 }
51743
51744 X86::CondCode CC0 =
51745 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51746 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51747 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51748 return SDValue();
51749
51750 bool IsOR = N->getOpcode() == ISD::OR;
51751
51752 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51753 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51754 // operator is OR. Similar for CC1.
51755 SDValue SrcCC =
51756       IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51757                                    SDLoc(SetCC0.getOperand(0)), MVT::i8)
51758 : SetCC0.getOperand(0);
51759 SDValue CC1N = SetCC1.getOperand(0);
51760 X86::CondCode CC1 =
51761 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51762   X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51763   X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51764 SDLoc DL(N);
51765 SDValue CFlags = DAG.getTargetConstant(
51766 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51767 SDValue Sub = SetCC1.getOperand(1);
51768
51769 // Replace any uses of the old flag produced by SUB/CMP with the new one
51770 // produced by CCMP/CTEST.
51771 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51772 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51773 {Sub.getOperand(0), Sub.getOperand(1),
51774 CFlags, SrcCC, SetCC0.getOperand(1)})
51775 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51776 {Sub.getOperand(0), Sub.getOperand(0),
51777 CFlags, SrcCC, SetCC0.getOperand(1)});
51778
51779 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51780}
51781
51782 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51783                           TargetLowering::DAGCombinerInfo &DCI,
51784                           const X86Subtarget &Subtarget) {
51785 using namespace SDPatternMatch;
51786
51787 SDValue N0 = N->getOperand(0);
51788 SDValue N1 = N->getOperand(1);
51789 EVT VT = N->getValueType(0);
51790 SDLoc dl(N);
51791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51792
51793 // If this is SSE1 only convert to FAND to avoid scalarization.
51794 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51795 return DAG.getBitcast(MVT::v4i32,
51796 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51797 DAG.getBitcast(MVT::v4f32, N0),
51798 DAG.getBitcast(MVT::v4f32, N1)));
51799 }
51800
51801 // Use a 32-bit and+zext if upper bits known zero.
51802 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51803 APInt HiMask = APInt::getHighBitsSet(64, 32);
51804 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51805 DAG.MaskedValueIsZero(N0, HiMask)) {
51806 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51807 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51808 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51809 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51810 }
51811 }
51812
51813 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51814 // TODO: Support multiple SrcOps.
51815 if (VT == MVT::i1) {
51816     SmallVector<SDValue, 2> SrcOps;
51817     SmallVector<APInt, 2> SrcPartials;
51818 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51819 SrcOps.size() == 1) {
51820 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51821 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51822 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51823 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51824 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51825 if (Mask) {
51826 assert(SrcPartials[0].getBitWidth() == NumElts &&
51827 "Unexpected partial reduction mask");
51828 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51829 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51830 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51831 }
51832 }
51833 }
51834
51835 // InstCombine converts:
51836 // `(-x << C0) & C1`
51837 // to
51838 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51839 // This saves an IR instruction but on x86 the neg/shift version is preferable
51840 // so undo the transform.
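  // For example (illustrative), with C0 = 2 and C1 = 0xF0:
  //   (-x << 2) & 0xF0  ->  (x * 0xFC) & 0xF0   (0x100 - (1 << 2) = 0xFC)
  // and this combine recovers the neg/shift form, since multiplying by 0xFC
  // and by -4 only differ in bits that the 0xF0 mask discards.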
51841
51842 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51843 // TODO: We don't actually need a splat for this, we just need the checks to
51844 // hold for each element.
51845 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51846 /*AllowTruncation*/ false);
51847 ConstantSDNode *N01C =
51848 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51849 /*AllowTruncation*/ false);
51850 if (N1C && N01C) {
51851 const APInt &MulC = N01C->getAPIntValue();
51852 const APInt &AndC = N1C->getAPIntValue();
51853 APInt MulCLowBit = MulC & (-MulC);
51854 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51855 (MulCLowBit + MulC).isPowerOf2()) {
51856 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51857 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51858 assert(MulCLowBitLog != -1 &&
51859 "Isolated lowbit is somehow not a power of 2!");
51860 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51861 DAG.getConstant(MulCLowBitLog, dl, VT));
51862 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51863 }
51864 }
51865 }
51866
51867 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51868 return SetCC;
51869
51870 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51871 return V;
51872
51873 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51874 return R;
51875
51876 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51877 return R;
51878
51879 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51880 return R;
51881
51882 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51883 DAG, DCI, Subtarget))
51884 return FPLogic;
51885
51886 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51887 return R;
51888
51889 if (DCI.isBeforeLegalizeOps())
51890 return SDValue();
51891
51892 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51893 return R;
51894
51895   if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51896 return R;
51897
51898 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51899 return ShiftRight;
51900
51901 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51902 return R;
51903
51904 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51905 return R;
51906
51907 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51908 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51909 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51910 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51911 unsigned Opc0 = N0.getOpcode();
51912 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51914 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51915 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51916 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51917 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51918 }
51919 }
51920
51921   // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51922   // to make use of predicated selects.
51923 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51924 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51925 SDValue X, Y;
51926 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51927 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51928 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51929 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51930         sd_match(N, m_And(m_Value(X),
51931                           m_OneUse(m_SExt(m_AllOf(
51932                               m_Value(Y), m_SpecificVT(CondVT),
51933 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51934 return DAG.getSelect(dl, VT, Y, X,
51935 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51936 }
51937 }
51938
51939 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51940 // avoids slow variable shift (moving shift amount to ECX etc.)
51941 if (isOneConstant(N1) && N0->hasOneUse()) {
51942 SDValue Src = N0;
51943 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51944 Src.getOpcode() == ISD::TRUNCATE) &&
51945 Src.getOperand(0)->hasOneUse())
51946 Src = Src.getOperand(0);
51947 bool ContainsNOT = false;
51948 X86::CondCode X86CC = X86::COND_B;
51949 // Peek through AND(NOT(SRL(X,Y)),1).
51950 if (isBitwiseNot(Src)) {
51951 Src = Src.getOperand(0);
51952 X86CC = X86::COND_AE;
51953 ContainsNOT = true;
51954 }
51955 if (Src.getOpcode() == ISD::SRL &&
51956 !isa<ConstantSDNode>(Src.getOperand(1))) {
51957 SDValue BitNo = Src.getOperand(1);
51958 Src = Src.getOperand(0);
51959 // Peek through AND(SRL(NOT(X),Y),1).
51960 if (isBitwiseNot(Src)) {
51961 Src = Src.getOperand(0);
51962 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51963 ContainsNOT = true;
51964 }
51965 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51966 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51967 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51968 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51969 }
51970 }
51971
51972 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51973 // Attempt to recursively combine a bitmask AND with shuffles.
51974 SDValue Op(N, 0);
51975 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51976 return Res;
51977
51978 // If either operand is a constant mask, then only the elements that aren't
51979 // zero are actually demanded by the other operand.
51980 auto GetDemandedMasks = [&](SDValue Op) {
51981 APInt UndefElts;
51982 SmallVector<APInt> EltBits;
51983 int NumElts = VT.getVectorNumElements();
51984 int EltSizeInBits = VT.getScalarSizeInBits();
51985 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51986 APInt DemandedElts = APInt::getAllOnes(NumElts);
51987 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51988 EltBits)) {
51989 DemandedBits.clearAllBits();
51990 DemandedElts.clearAllBits();
51991 for (int I = 0; I != NumElts; ++I) {
51992 if (UndefElts[I]) {
51993 // We can't assume an undef src element gives an undef dst - the
51994 // other src might be zero.
51995 DemandedBits.setAllBits();
51996 DemandedElts.setBit(I);
51997 } else if (!EltBits[I].isZero()) {
51998 DemandedBits |= EltBits[I];
51999 DemandedElts.setBit(I);
52000 }
52001 }
52002 }
52003 return std::make_pair(DemandedBits, DemandedElts);
52004 };
52005 APInt Bits0, Elts0;
52006 APInt Bits1, Elts1;
52007 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52008 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52009
52010 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52011 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52012 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52013 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52014 if (N->getOpcode() != ISD::DELETED_NODE)
52015 DCI.AddToWorklist(N);
52016 return SDValue(N, 0);
52017 }
52018
52019 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52020 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52021 if (NewN0 || NewN1)
52022 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52023 NewN1 ? NewN1 : N1);
52024 }
52025
52026 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52027 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52028       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52029       isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52030 SDValue BitMask = N1;
52031 SDValue SrcVec = N0.getOperand(0);
52032 EVT SrcVecVT = SrcVec.getValueType();
52033
52034 // Check that the constant bitmask masks whole bytes.
52035 APInt UndefElts;
52036 SmallVector<APInt, 64> EltBits;
52037 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52038 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52039 llvm::all_of(EltBits, [](const APInt &M) {
52040 return M.isZero() || M.isAllOnes();
52041 })) {
52042 unsigned NumElts = SrcVecVT.getVectorNumElements();
52043 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52044 unsigned Idx = N0.getConstantOperandVal(1);
52045
52046 // Create a root shuffle mask from the byte mask and the extracted index.
52047 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52048 for (unsigned i = 0; i != Scale; ++i) {
52049 if (UndefElts[i])
52050 continue;
52051 int VecIdx = Scale * Idx + i;
52052 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52053 }
52054
52055       if (SDValue Shuffle = combineX86ShufflesRecursively(
52056               {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52057 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52058 /*AllowVariableCrossLaneMask=*/true,
52059 /*AllowVariablePerLaneMask=*/true,
52060 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52061 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52062 N0.getOperand(1));
52063 }
52064 }
52065
52066 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52067 return R;
52068
52069 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52070 return R;
52071
52072 return SDValue();
52073}
52074
52075// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52076 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52077                                      SelectionDAG &DAG,
52078 const X86Subtarget &Subtarget) {
52079 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52080
52081 MVT VT = N->getSimpleValueType(0);
52082 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52083 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52084 return SDValue();
52085
52086 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52087 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52088 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52089 return SDValue();
52090
52091 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52092 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52093 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52094 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52095 return SDValue();
52096
52097 // Attempt to extract constant byte masks.
52098 APInt UndefElts0, UndefElts1;
52099 SmallVector<APInt, 32> EltBits0, EltBits1;
52100 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52101 /*AllowWholeUndefs*/ false,
52102 /*AllowPartialUndefs*/ false))
52103 return SDValue();
52104 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52105 /*AllowWholeUndefs*/ false,
52106 /*AllowPartialUndefs*/ false))
52107 return SDValue();
52108
52109 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52110 // TODO - add UNDEF elts support.
52111 if (UndefElts0[i] || UndefElts1[i])
52112 return SDValue();
52113 if (EltBits0[i] != ~EltBits1[i])
52114 return SDValue();
52115 }
52116
52117 if (useVPTERNLOG(Subtarget, VT)) {
52118 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52119 // VPTERNLOG is only available as vXi32/64-bit types.
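    // (Illustrative: imm bit (A<<2)|(B<<1)|C holds the result for that input
    // combination; A?B:C is 1 for inputs 001, 011, 110 and 111, i.e. bits
    // 1, 3, 6 and 7, giving 0b11001010 = 0xCA.)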
52120 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52121 MVT OpVT =
52122 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52123 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52124 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52125 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52126 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52127 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52128 DAG, Subtarget);
52129 return DAG.getBitcast(VT, Res);
52130 }
52131
52132 SDValue X = N->getOperand(0);
52133 SDValue Y =
52134 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52135 DAG.getBitcast(VT, N1.getOperand(0)));
52136 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52137}
52138
52139// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52140// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52141// Waiting for ANDNP combine allows other combines to happen that prevent
52142// matching.
52143static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52144 using namespace SDPatternMatch;
52145 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52146 m_And(m_Deferred(Mask), m_Value(Y))));
52147}
52148
52149// Try to fold:
52150// (or (and (m, y), (pandn m, x)))
52151// into:
52152// (vselect m, x, y)
52153// As a special case, try to fold:
52154// (or (and (m, (sub 0, x)), (pandn m, x)))
52155// into:
52156// (sub (xor X, M), M)
52157 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52158                                             SelectionDAG &DAG,
52159 const X86Subtarget &Subtarget) {
52160 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52161
52162 EVT VT = N->getValueType(0);
52163 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52164 (VT.is256BitVector() && Subtarget.hasInt256())))
52165 return SDValue();
52166
52167 SDValue X, Y, Mask;
52168 if (!matchLogicBlend(N, X, Y, Mask))
52169 return SDValue();
52170
52171 // Validate that X, Y, and Mask are bitcasts, and see through them.
52172   Mask = peekThroughBitcasts(Mask);
52173   X = peekThroughBitcasts(X);
52174   Y = peekThroughBitcasts(Y);
52175
52176 EVT MaskVT = Mask.getValueType();
52177 unsigned EltBits = MaskVT.getScalarSizeInBits();
52178
52179 // TODO: Attempt to handle floating point cases as well?
52180 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52181 return SDValue();
52182
52183 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52184 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52185 DAG, Subtarget))
52186 return Res;
52187
52188 // PBLENDVB is only available on SSE 4.1.
52189 if (!Subtarget.hasSSE41())
52190 return SDValue();
52191
52192 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52193 if (Subtarget.hasVLX())
52194 return SDValue();
52195
52196 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52197
52198 X = DAG.getBitcast(BlendVT, X);
52199 Y = DAG.getBitcast(BlendVT, Y);
52200 Mask = DAG.getBitcast(BlendVT, Mask);
52201 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52202 return DAG.getBitcast(VT, Mask);
52203}
52204
52205// Helper function for combineOrCmpEqZeroToCtlzSrl
52206// Transforms:
52207// seteq(cmp x, 0)
52208// into:
52209// srl(ctlz x), log2(bitsize(x))
52210// Input pattern is checked by caller.
52211 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52212 SDValue Cmp = Op.getOperand(1);
52213 EVT VT = Cmp.getOperand(0).getValueType();
52214 unsigned Log2b = Log2_32(VT.getSizeInBits());
52215 SDLoc dl(Op);
52216 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52217 // The result of the shift is true or false, and on X86, the 32-bit
52218 // encoding of shr and lzcnt is more desirable.
52219 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52220 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52221 DAG.getConstant(Log2b, dl, MVT::i8));
52222 return Scc;
52223}
52224
52225// Try to transform:
52226// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52227// into:
52228// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52229// Will also attempt to match more generic cases, eg:
52230// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52231// Only applies if the target supports the FastLZCNT feature.
52232 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52233 TargetLowering::DAGCombinerInfo &DCI,
52234 const X86Subtarget &Subtarget) {
52235 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52236 return SDValue();
52237
52238 auto isORCandidate = [](SDValue N) {
52239 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52240 };
52241
52242 // Check the zero extend is extending to 32-bit or more. The code generated by
52243 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52244 // instructions to clear the upper bits.
52245 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52246 !isORCandidate(N->getOperand(0)))
52247 return SDValue();
52248
52249 // Check the node matches: setcc(eq, cmp 0)
52250 auto isSetCCCandidate = [](SDValue N) {
52251 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52252 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52253 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52254 isNullConstant(N->getOperand(1).getOperand(1)) &&
52255 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52256 };
52257
52258 SDNode *OR = N->getOperand(0).getNode();
52259 SDValue LHS = OR->getOperand(0);
52260 SDValue RHS = OR->getOperand(1);
52261
52262 // Save nodes matching or(or, setcc(eq, cmp 0)).
52263 SmallVector<SDNode *, 2> ORNodes;
52264 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52265 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52266 ORNodes.push_back(OR);
52267 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52268 LHS = OR->getOperand(0);
52269 RHS = OR->getOperand(1);
52270 }
52271
52272 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52273 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52274 !isORCandidate(SDValue(OR, 0)))
52275 return SDValue();
52276
52277 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
52278 // to
52279 // or(srl(ctlz),srl(ctlz)).
52280 // The dag combiner can then fold it into:
52281 // srl(or(ctlz, ctlz)).
52282 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52283 SDValue Ret, NewRHS;
52284 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52285 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52286
52287 if (!Ret)
52288 return SDValue();
52289
52290 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52291 while (!ORNodes.empty()) {
52292 OR = ORNodes.pop_back_val();
52293 LHS = OR->getOperand(0);
52294 RHS = OR->getOperand(1);
52295 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52296 if (RHS->getOpcode() == ISD::OR)
52297 std::swap(LHS, RHS);
52298 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52299 if (!NewRHS)
52300 return SDValue();
52301 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52302 }
52303
52304 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52305}
52306
52307/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52308/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52309/// with CMP+{ADC, SBB}.
52310/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52311static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52312 SDValue X, SDValue Y,
52313 SelectionDAG &DAG,
52314 bool ZeroSecondOpOnly = false) {
52315 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52316 return SDValue();
52317
52318 // Look through a one-use zext.
52319 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52320 Y = Y.getOperand(0);
52321
52322 X86::CondCode CC;
52323 SDValue EFLAGS;
52324 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52325 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52326 EFLAGS = Y.getOperand(1);
52327 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52328 Y.hasOneUse()) {
52329 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52330 }
52331
52332 if (!EFLAGS)
52333 return SDValue();
52334
52335 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52336 // the general case below.
52337 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52338 if (ConstantX && !ZeroSecondOpOnly) {
52339 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52340 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52341 // This is a complicated way to get -1 or 0 from the carry flag:
52342 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52343 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52344 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52345 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52346 EFLAGS);
52347 }
52348
52349 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52350 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52351 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52352 EFLAGS.getValueType().isInteger() &&
52353 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52354 // Swap the operands of a SUB, and we have the same pattern as above.
52355 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52356 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52357 SDValue NewSub = DAG.getNode(
52358 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52359 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52360 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52361 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52362 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52363 NewEFLAGS);
52364 }
52365 }
52366 }
52367
52368 if (CC == X86::COND_B) {
52369 // X + SETB Z --> adc X, 0
52370 // X - SETB Z --> sbb X, 0
52371 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52372 DAG.getVTList(VT, MVT::i32), X,
52373 DAG.getConstant(0, DL, VT), EFLAGS);
52374 }
52375
52376 if (ZeroSecondOpOnly)
52377 return SDValue();
52378
52379 if (CC == X86::COND_A) {
52380 // Try to convert COND_A into COND_B in an attempt to facilitate
52381 // materializing "setb reg".
52382 //
52383 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52384 // cannot take an immediate as its first operand.
52385 //
52386 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52387 EFLAGS.getValueType().isInteger() &&
52388 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52389 SDValue NewSub =
52390 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52391 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52392 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52393 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52394 DAG.getVTList(VT, MVT::i32), X,
52395 DAG.getConstant(0, DL, VT), NewEFLAGS);
52396 }
52397 }
52398
52399 if (CC == X86::COND_AE) {
52400 // X + SETAE --> sbb X, -1
52401 // X - SETAE --> adc X, -1
52402 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52403 DAG.getVTList(VT, MVT::i32), X,
52404 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52405 }
52406
52407 if (CC == X86::COND_BE) {
52408 // X + SETBE --> sbb X, -1
52409 // X - SETBE --> adc X, -1
52410 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52411 // materializing "setae reg".
52412 //
52413 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52414 // cannot take an immediate as its first operand.
52415 //
52416 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52417 EFLAGS.getValueType().isInteger() &&
52418 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52419 SDValue NewSub =
52420 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52421 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52422 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52423 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52424 DAG.getVTList(VT, MVT::i32), X,
52425 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52426 }
52427 }
52428
52429 if (CC != X86::COND_E && CC != X86::COND_NE)
52430 return SDValue();
52431
52432 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52433 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52434 !EFLAGS.getOperand(0).getValueType().isInteger())
52435 return SDValue();
52436
52437 SDValue Z = EFLAGS.getOperand(0);
52438 EVT ZVT = Z.getValueType();
52439
52440 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52441 // the general case below.
52442 if (ConstantX) {
52443 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52444 // fake operands:
52445 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52446 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52447 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52448 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52449 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52450 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52451 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52452 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52453 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52454 SDValue(Neg.getNode(), 1));
52455 }
52456
52457 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52458 // with fake operands:
52459 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52460 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52461 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52462 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52463 SDValue One = DAG.getConstant(1, DL, ZVT);
52464 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52465 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52466 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52467 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52468 Cmp1.getValue(1));
52469 }
52470 }
52471
52472 // (cmp Z, 1) sets the carry flag if Z is 0.
52473 SDValue One = DAG.getConstant(1, DL, ZVT);
52474 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52475 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52476
52477 // Add the flags type for ADC/SBB nodes.
52478 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52479
52480 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52481 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52482 if (CC == X86::COND_NE)
52483 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52484 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52485
52486 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52487 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52488 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52489 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52490}
52491
52492/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52493/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52494/// with CMP+{ADC, SBB}.
52495 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52496 SelectionDAG &DAG) {
52497 bool IsSub = N->getOpcode() == ISD::SUB;
52498 SDValue X = N->getOperand(0);
52499 SDValue Y = N->getOperand(1);
52500 EVT VT = N->getValueType(0);
52501
52502 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52503 return ADCOrSBB;
52504
52505 // Commute and try again (negate the result for subtracts).
52506 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52507 if (IsSub)
52508 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52509 return ADCOrSBB;
52510 }
52511
52512 return SDValue();
52513}
52514
52515static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52516 SDValue N0, SDValue N1,
52517 SelectionDAG &DAG) {
52518 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52519
52520 // Delegate to combineAddOrSubToADCOrSBB if we have:
52521 //
52522 // (xor/or (zero_extend (setcc)) imm)
52523 //
52524 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52525 // equivalent to a SUB/ADD, respectively.
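// (zext(setcc) is 0 or 1, so only bit 0 can differ: xor with an odd imm
// computes imm - setcc, while or with an even imm computes imm + setcc.)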
52526 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52527 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52528 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52529 bool IsSub = Opc == ISD::XOR;
52530 bool N1COdd = N1C->getZExtValue() & 1;
52531 if (IsSub ? N1COdd : !N1COdd)
52532 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52533 return R;
52534 }
52535 }
52536
52537 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
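// (and(X,CstPow2) is either 0 or CstPow2 in each element, so "not equal to 0"
// is the same predicate as "equal to CstPow2".)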
52538 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52539 N0.getOperand(0).getOpcode() == ISD::AND &&
52540 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52541 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52542 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52543 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52544 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52545 N0.getOperand(0).getOperand(1));
52546 }
52547
52548 return SDValue();
52549}
52550
52551 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52552 TargetLowering::DAGCombinerInfo &DCI,
52553 const X86Subtarget &Subtarget) {
52554 SDValue N0 = N->getOperand(0);
52555 SDValue N1 = N->getOperand(1);
52556 EVT VT = N->getValueType(0);
52557 SDLoc dl(N);
52558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52559
52560 // If this is SSE1 only convert to FOR to avoid scalarization.
52561 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52562 return DAG.getBitcast(MVT::v4i32,
52563 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52564 DAG.getBitcast(MVT::v4f32, N0),
52565 DAG.getBitcast(MVT::v4f32, N1)));
52566 }
52567
52568 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52569 // TODO: Support multiple SrcOps.
52570 if (VT == MVT::i1) {
52571 SmallVector<SDValue, 2> SrcOps;
52572 SmallVector<APInt, 2> SrcPartials;
52573 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52574 SrcOps.size() == 1) {
52575 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52576 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52577 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52578 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52579 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52580 if (Mask) {
52581 assert(SrcPartials[0].getBitWidth() == NumElts &&
52582 "Unexpected partial reduction mask");
52583 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52584 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52585 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52586 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52587 }
52588 }
52589 }
52590
52591 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52592 return SetCC;
52593
52594 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52595 return R;
52596
52597 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52598 return R;
52599
52600 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52601 return R;
52602
52603 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52604 DAG, DCI, Subtarget))
52605 return FPLogic;
52606
52607 if (DCI.isBeforeLegalizeOps())
52608 return SDValue();
52609
52610 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52611 return R;
52612
52613 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52614 return R;
52615
52616 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52617 return R;
52618
52619 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52620 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
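// e.g. with C == 3: SetCC==1 gives (0 - 1) | 3 == -1 and (zext 0) * 4 - 1 == -1,
// while SetCC==0 gives 0 | 3 == 3 and (zext 1) * 4 - 1 == 3.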
52621 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52622 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52623 uint64_t Val = CN->getZExtValue();
52624 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52625 Val == 8) {
52626 SDValue NotCond;
52627 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52628 N0.getOperand(1).hasOneUse()) {
52629 X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52630 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52631 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52632 } else if (N0.getOpcode() == ISD::SUB &&
52633 isNullConstant(N0.getOperand(0))) {
52634 SDValue Cond = N0.getOperand(1);
52635 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52636 Cond = Cond.getOperand(0);
52637 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52638 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52639 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52640 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52641 }
52642 }
52643
52644 if (NotCond) {
52645 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52646 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52647 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52648 return R;
52649 }
52650 }
52651 }
52652 }
52653
52654 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52655 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52656 // iff the upper elements of the non-shifted arg are zero.
52657 // KUNPCK requires 16+ bool vector elements.
52658 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52659 unsigned NumElts = VT.getVectorNumElements();
52660 unsigned HalfElts = NumElts / 2;
52661 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52662 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52663 N1.getConstantOperandAPInt(1) == HalfElts &&
52664 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52665 return DAG.getNode(
52666 ISD::CONCAT_VECTORS, dl, VT,
52667 extractSubVector(N0, 0, DAG, dl, HalfElts),
52668 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52669 }
52670 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52671 N0.getConstantOperandAPInt(1) == HalfElts &&
52672 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52673 return DAG.getNode(
52674 ISD::CONCAT_VECTORS, dl, VT,
52675 extractSubVector(N1, 0, DAG, dl, HalfElts),
52676 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52677 }
52678 }
52679
52680 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52681 // Attempt to recursively combine an OR of shuffles.
52682 SDValue Op(N, 0);
52683 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52684 return Res;
52685
52686 // If either operand is a constant mask, then only the elements that aren't
52687 // allones are actually demanded by the other operand.
52688 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52689 APInt UndefElts;
52690 SmallVector<APInt> EltBits;
52691 int NumElts = VT.getVectorNumElements();
52692 int EltSizeInBits = VT.getScalarSizeInBits();
52693 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52694 return false;
52695
52696 APInt DemandedElts = APInt::getZero(NumElts);
52697 for (int I = 0; I != NumElts; ++I)
52698 if (!EltBits[I].isAllOnes())
52699 DemandedElts.setBit(I);
52700
52701 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52702 };
52703 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52704 if (N->getOpcode() != ISD::DELETED_NODE)
52705 DCI.AddToWorklist(N);
52706 return SDValue(N, 0);
52707 }
52708 }
52709
52710 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52711 return R;
52712
52713 return SDValue();
52714}
52715
52716/// Try to turn tests against the signbit in the form of:
52717/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52718/// into:
52719/// SETGT(X, -1)
52720 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52721 SelectionDAG &DAG) {
52722 // This is only worth doing if the output type is i8 or i1.
52723 EVT ResultType = N->getValueType(0);
52724 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52725 return SDValue();
52726
52727 SDValue N0 = N->getOperand(0);
52728 SDValue N1 = N->getOperand(1);
52729
52730 // We should be performing an xor against a truncated shift.
52731 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52732 return SDValue();
52733
52734 // Make sure we are performing an xor against one.
52735 if (!isOneConstant(N1))
52736 return SDValue();
52737
52738 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52739 SDValue Shift = N0.getOperand(0);
52740 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52741 return SDValue();
52742
52743 // Make sure we are truncating from one of i16, i32 or i64.
52744 EVT ShiftTy = Shift.getValueType();
52745 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52746 return SDValue();
52747
52748 // Make sure the shift amount extracts the sign bit.
52749 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52750 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52751 return SDValue();
52752
52753 // Create a greater-than comparison against -1.
52754 // N.B. Using SETGE against 0 works but we want a canonical looking
52755 // comparison; using SETGT matches up with what TranslateX86CC expects.
52756 SDValue ShiftOp = Shift.getOperand(0);
52757 EVT ShiftOpTy = ShiftOp.getValueType();
52758 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52759 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52760 *DAG.getContext(), ResultType);
52761 SDValue Cond =
52762 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52763 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52764 if (SetCCResultType != ResultType)
52765 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52766 return Cond;
52767}
52768
52769/// Turn vector tests of the signbit in the form of:
52770/// xor (sra X, elt_size(X)-1), -1
52771/// into:
52772/// pcmpgt X, -1
52773///
52774/// This should be called before type legalization because the pattern may not
52775/// persist after that.
52776 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52777 const X86Subtarget &Subtarget) {
52778 EVT VT = N->getValueType(0);
52779 if (!VT.isSimple())
52780 return SDValue();
52781
52782 switch (VT.getSimpleVT().SimpleTy) {
52783 // clang-format off
52784 default: return SDValue();
52785 case MVT::v16i8:
52786 case MVT::v8i16:
52787 case MVT::v4i32:
52788 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52789 case MVT::v32i8:
52790 case MVT::v16i16:
52791 case MVT::v8i32:
52792 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52793 // clang-format on
52794 }
52795
52796 // There must be a shift right algebraic before the xor, and the xor must be a
52797 // 'not' operation.
52798 SDValue Shift = N->getOperand(0);
52799 SDValue Ones = N->getOperand(1);
52800 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52801 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52802 return SDValue();
52803
52804 // The shift should be smearing the sign bit across each vector element.
52805 auto *ShiftAmt =
52806 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52807 if (!ShiftAmt ||
52808 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52809 return SDValue();
52810
52811 // Create a greater-than comparison against -1. We don't use the more obvious
52812 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52813 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52814}
52815
52816/// Detect patterns of truncation with unsigned saturation:
52817///
52818/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52819/// Return the source value x to be truncated or SDValue() if the pattern was
52820/// not matched.
52821///
52822/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52823/// where C1 >= 0 and C2 is unsigned max of destination type.
52824///
52825/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52826/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52827///
52828/// These two patterns are equivalent to:
52829/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52830/// So return the smax(x, C1) value to be truncated or SDValue() if the
52831/// pattern was not matched.
52832 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52833 const SDLoc &DL) {
52834 using namespace llvm::SDPatternMatch;
52835 EVT InVT = In.getValueType();
52836
52837 // Saturation with truncation. We truncate from InVT to VT.
52839 "Unexpected types for truncate operation");
52840
52841 APInt C1, C2;
52842 SDValue UMin, SMin, SMax;
52843
52844 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
52845 // to the element size of the destination type.
52846 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52847 C2.isMask(VT.getScalarSizeInBits()))
52848 return UMin;
52849
52850 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52851 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52852 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52853 return SMin;
52854
52855 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52856 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52857 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52858 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52859
52860 return SDValue();
52861}
52862
52863/// Detect patterns of truncation with signed saturation:
52864/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52865/// signed_max_of_dest_type)) to dest_type)
52866/// or:
52867/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52868/// signed_min_of_dest_type)) to dest_type).
52869/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52870/// Return the source value to be truncated or SDValue() if the pattern was not
52871/// matched.
52872static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52873 using namespace llvm::SDPatternMatch;
52874 unsigned NumDstBits = VT.getScalarSizeInBits();
52875 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52876 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52877
52878 APInt SignedMax, SignedMin;
52879 if (MatchPackUS) {
52880 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52881 SignedMin = APInt::getZero(NumSrcBits);
52882 } else {
52883 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52884 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52885 }
52886
52887 SDValue SMin, SMax;
52888 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52889 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52890 return SMax;
52891
52892 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52893 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52894 return SMin;
52895
52896 return SDValue();
52897}
52898
52899 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52900 SelectionDAG &DAG,
52901 const X86Subtarget &Subtarget) {
52902 if (!Subtarget.hasSSE2() || !VT.isVector())
52903 return SDValue();
52904
52905 EVT SVT = VT.getVectorElementType();
52906 EVT InVT = In.getValueType();
52907 EVT InSVT = InVT.getVectorElementType();
52908
52909 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52910 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52911 // and concatenate at the same time. Then we can use a final vpmovuswb to
52912 // clip to 0-255.
52913 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52914 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52915 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52916 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52917 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52918 DL, DAG, Subtarget);
52919 assert(Mid && "Failed to pack!");
52920 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52921 }
52922 }
52923
52924 // vXi32 truncate instructions are available with AVX512F.
52925 // vXi16 truncate instructions are only available with AVX512BW.
52926 // For 256-bit or smaller vectors, we require VLX.
52927 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52928 // If the result type is 256-bits or larger and we have disabled 512-bit
52929 // registers, we should go ahead and use the pack instructions if possible.
52930 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52931 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52932 (InVT.getSizeInBits() > 128) &&
52933 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52934 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52935
52936 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52937 isPowerOf2_32(VT.getVectorNumElements()) &&
52938 (SVT == MVT::i8 || SVT == MVT::i16) &&
52939 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52940 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52941 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52942 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52943 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52944 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52945 DAG, Subtarget);
52946 assert(Mid && "Failed to pack!");
52947 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52948 Subtarget);
52949 assert(V && "Failed to pack!");
52950 return V;
52951 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52952 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52953 Subtarget);
52954 }
52955 if (SDValue SSatVal = detectSSatPattern(In, VT))
52956 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52957 Subtarget);
52958 }
52959
52960 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52961 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52962 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52963 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52964 unsigned TruncOpc = 0;
52965 SDValue SatVal;
52966 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52967 SatVal = SSatVal;
52968 TruncOpc = X86ISD::VTRUNCS;
52969 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52970 SatVal = USatVal;
52971 TruncOpc = X86ISD::VTRUNCUS;
52972 }
52973 if (SatVal) {
52974 unsigned ResElts = VT.getVectorNumElements();
52975 // If the input type is less than 512 bits and we don't have VLX, we need
52976 // to widen to 512 bits.
52977 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52978 unsigned NumConcats = 512 / InVT.getSizeInBits();
52979 ResElts *= NumConcats;
52980 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52981 ConcatOps[0] = SatVal;
52982 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52983 NumConcats * InVT.getVectorNumElements());
52984 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52985 }
52986 // Widen the result if it's narrower than 128 bits.
52987 if (ResElts * SVT.getSizeInBits() < 128)
52988 ResElts = 128 / SVT.getSizeInBits();
52989 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52990 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52991 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52992 DAG.getVectorIdxConstant(0, DL));
52993 }
52994 }
52995
52996 return SDValue();
52997}
52998
52999 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53000 SelectionDAG &DAG,
53001 TargetLowering::DAGCombinerInfo &DCI,
53002 const X86Subtarget &Subtarget) {
53003 auto *Ld = cast<LoadSDNode>(N);
53004 EVT RegVT = Ld->getValueType(0);
53005 SDValue Ptr = Ld->getBasePtr();
53006 SDValue Chain = Ld->getChain();
53007 ISD::LoadExtType Ext = Ld->getExtensionType();
53008
53009 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53010 return SDValue();
53011
53012 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53013 return SDValue();
53014
53015 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53016 if (!LdC)
53017 return SDValue();
53018
53019 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53020 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53021 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53022 if (Undefs[I])
53023 continue;
53024 if (UserUndefs[I] || Bits[I] != UserBits[I])
53025 return false;
53026 }
53027 return true;
53028 };
53029
53030 // Look through all other loads/broadcasts in the chain for another constant
53031 // pool entry.
53032 for (SDNode *User : Chain->users()) {
53033 auto *UserLd = dyn_cast<MemSDNode>(User);
53034 if (User != N && UserLd &&
53035 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53036 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53037 ISD::isNormalLoad(User)) &&
53038 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53039 User->getValueSizeInBits(0).getFixedValue() >
53040 RegVT.getFixedSizeInBits()) {
53041 EVT UserVT = User->getValueType(0);
53042 SDValue UserPtr = UserLd->getBasePtr();
53043 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53044
53045 // See if we are loading a constant that matches in the lower
53046 // bits of a longer constant (but from a different constant pool ptr).
53047 if (UserC && UserPtr != Ptr) {
53048 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53049 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53050 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53051 APInt Undefs, UserUndefs;
53052 SmallVector<APInt> Bits, UserBits;
53053 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53054 UserVT.getScalarSizeInBits());
53055 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53056 Bits) &&
53057 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53058 UserUndefs, UserBits)) {
53059 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53061 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53062 RegVT.getSizeInBits());
53063 Extract = DAG.getBitcast(RegVT, Extract);
53064 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53065 }
53066 }
53067 }
53068 }
53069 }
53070 }
53071
53072 return SDValue();
53073}
53074
53075 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53076 TargetLowering::DAGCombinerInfo &DCI,
53077 const X86Subtarget &Subtarget) {
53078 auto *Ld = cast<LoadSDNode>(N);
53079 EVT RegVT = Ld->getValueType(0);
53080 EVT MemVT = Ld->getMemoryVT();
53081 SDLoc dl(Ld);
53082 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53083
53084 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53085 // into two 16-byte operations. Also split non-temporal aligned loads on
53086 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53087 ISD::LoadExtType Ext = Ld->getExtensionType();
53088 unsigned Fast;
53089 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53090 Ext == ISD::NON_EXTLOAD &&
53091 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53092 Ld->getAlign() >= Align(16)) ||
53093 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53094 *Ld->getMemOperand(), &Fast) &&
53095 !Fast))) {
53096 unsigned NumElems = RegVT.getVectorNumElements();
53097 if (NumElems < 2)
53098 return SDValue();
53099
53100 unsigned HalfOffset = 16;
53101 SDValue Ptr1 = Ld->getBasePtr();
53102 SDValue Ptr2 =
53103 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53104 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53105 NumElems / 2);
53106 SDValue Load1 =
53107 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53108 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53109 SDValue Load2 =
53110 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53111 Ld->getPointerInfo().getWithOffset(HalfOffset),
53112 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53113 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53114 Load1.getValue(1), Load2.getValue(1));
53115
53116 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53117 return DCI.CombineTo(N, NewVec, TF, true);
53118 }
53119
53120 // Bool vector load - attempt to cast to an integer, as we have good
53121 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53122 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53123 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53124 unsigned NumElts = RegVT.getVectorNumElements();
53125 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53126 if (TLI.isTypeLegal(IntVT)) {
53127 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53128 Ld->getPointerInfo(), Ld->getBaseAlign(),
53129 Ld->getMemOperand()->getFlags());
53130 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53131 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53132 }
53133 }
53134
53135 // If we also broadcast this vector to a wider type, then just extract the
53136 // lowest subvector.
53137 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53138 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53139 SDValue Ptr = Ld->getBasePtr();
53140 SDValue Chain = Ld->getChain();
53141 for (SDNode *User : Chain->users()) {
53142 auto *UserLd = dyn_cast<MemSDNode>(User);
53143 if (User != N && UserLd &&
53144 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53145 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53146 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53147 User->hasAnyUseOfValue(0) &&
53148 User->getValueSizeInBits(0).getFixedValue() >
53149 RegVT.getFixedSizeInBits()) {
53151 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53152 RegVT.getSizeInBits());
53153 Extract = DAG.getBitcast(RegVT, Extract);
53154 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53155 }
53156 }
53157 }
53158
53159 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53160 return V;
53161
53162 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53163 unsigned AddrSpace = Ld->getAddressSpace();
53164 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53165 AddrSpace == X86AS::PTR32_UPTR) {
53166 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53167 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53168 SDValue Cast =
53169 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53170 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53171 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53172 Ld->getMemOperand()->getFlags());
53173 }
53174 }
53175
53176 return SDValue();
53177}
53178
53179/// If V is a build vector of boolean constants and exactly one of those
53180/// constants is true, return the operand index of that true element.
53181/// Otherwise, return -1.
53182static int getOneTrueElt(SDValue V) {
53183 // This needs to be a build vector of booleans.
53184 // TODO: Checking for the i1 type matches the IR definition for the mask,
53185 // but the mask check could be loosened to i8 or other types. That might
53186 // also require checking more than 'allOnesValue'; eg, the x86 HW
53187 // instructions only require that the MSB is set for each mask element.
53188 // The ISD::MSTORE comments/definition do not specify how the mask operand
53189 // is formatted.
53190 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53191 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53192 return -1;
53193
53194 int TrueIndex = -1;
53195 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53196 for (unsigned i = 0; i < NumElts; ++i) {
53197 const SDValue &Op = BV->getOperand(i);
53198 if (Op.isUndef())
53199 continue;
53200 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53201 if (!ConstNode)
53202 return -1;
53203 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53204 // If we already found a one, this is too many.
53205 if (TrueIndex >= 0)
53206 return -1;
53207 TrueIndex = i;
53208 }
53209 }
53210 return TrueIndex;
53211}
53212
53213/// Given a masked memory load/store operation, return true if it has one mask
53214/// bit set. If it has one mask bit set, then also return the memory address of
53215/// the scalar element to load/store, the vector index to insert/extract that
53216/// scalar element, and the alignment for the scalar memory access.
53217 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53218 SelectionDAG &DAG, SDValue &Addr,
53219 SDValue &Index, Align &Alignment,
53220 unsigned &Offset) {
53221 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53222 if (TrueMaskElt < 0)
53223 return false;
53224
53225 // Get the address of the one scalar element that is specified by the mask
53226 // using the appropriate offset from the base pointer.
53227 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53228 Offset = 0;
53229 Addr = MaskedOp->getBasePtr();
53230 if (TrueMaskElt != 0) {
53231 Offset = TrueMaskElt * EltVT.getStoreSize();
53232 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53233 SDLoc(MaskedOp));
53234 }
53235
53236 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53237 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53238 return true;
53239}
53240
53241/// If exactly one element of the mask is set for a non-extending masked load,
53242/// it is a scalar load and vector insert.
53243/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53244/// mask have already been optimized in IR, so we don't bother with those here.
53245static SDValue
53246 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53247 TargetLowering::DAGCombinerInfo &DCI,
53248 const X86Subtarget &Subtarget) {
53249 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53250 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53251 // However, some target hooks may need to be added to know when the transform
53252 // is profitable. Endianness would also have to be considered.
53253
53254 SDValue Addr, VecIndex;
53255 Align Alignment;
53256 unsigned Offset;
53257 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53258 return SDValue();
53259
53260 // Load the one scalar element that is specified by the mask using the
53261 // appropriate offset from the base pointer.
53262 SDLoc DL(ML);
53263 EVT VT = ML->getValueType(0);
53264 EVT EltVT = VT.getVectorElementType();
53265
53266 EVT CastVT = VT;
53267 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53268 EltVT = MVT::f64;
53269 CastVT = VT.changeVectorElementType(EltVT);
53270 }
53271
53272 SDValue Load =
53273 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53274 ML->getPointerInfo().getWithOffset(Offset),
53275 Alignment, ML->getMemOperand()->getFlags());
53276
53277 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53278
53279 // Insert the loaded element into the appropriate place in the vector.
53280 SDValue Insert =
53281 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53282 Insert = DAG.getBitcast(VT, Insert);
53283 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53284}
53285
53286static SDValue
53287 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53288 TargetLowering::DAGCombinerInfo &DCI) {
53289 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53290 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53291 return SDValue();
53292
53293 SDLoc DL(ML);
53294 EVT VT = ML->getValueType(0);
53295
53296 // If we are loading the first and last elements of a vector, it is safe and
53297 // always faster to load the whole vector. Replace the masked load with a
53298 // vector load and select.
53299 unsigned NumElts = VT.getVectorNumElements();
53300 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53301 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53302 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53303 if (LoadFirstElt && LoadLastElt) {
53304 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53305 ML->getMemOperand());
53306 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53307 ML->getPassThru());
53308 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53309 }
53310
53311 // Convert a masked load with a constant mask into a masked load and a select.
53312 // This allows the select operation to use a faster kind of select instruction
53313 // (for example, vblendvps -> vblendps).
53314
53315 // Don't try this if the pass-through operand is already undefined. That would
53316 // cause an infinite loop because that's what we're about to create.
53317 if (ML->getPassThru().isUndef())
53318 return SDValue();
53319
53320 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53321 return SDValue();
53322
53323 // The new masked load has an undef pass-through operand. The select uses the
53324 // original pass-through operand.
53325 SDValue NewML = DAG.getMaskedLoad(
53326 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53327 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53328 ML->getAddressingMode(), ML->getExtensionType());
53329 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53330 ML->getPassThru());
53331
53332 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53333}
53334
53335 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53336 TargetLowering::DAGCombinerInfo &DCI,
53337 const X86Subtarget &Subtarget) {
53338 auto *Mld = cast<MaskedLoadSDNode>(N);
53339
53340 // TODO: Expanding load with constant mask may be optimized as well.
53341 if (Mld->isExpandingLoad())
53342 return SDValue();
53343
53344 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53345 if (SDValue ScalarLoad =
53346 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53347 return ScalarLoad;
53348
53349 // TODO: Do some AVX512 subsets benefit from this transform?
53350 if (!Subtarget.hasAVX512())
53351 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53352 return Blend;
53353 }
53354
53355 // If the mask value has been legalized to a non-boolean vector, try to
53356 // simplify ops leading up to it. We only demand the MSB of each lane.
53357 SDValue Mask = Mld->getMask();
53358 if (Mask.getScalarValueSizeInBits() != 1) {
53359 EVT VT = Mld->getValueType(0);
53360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53361 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53362 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53363 if (N->getOpcode() != ISD::DELETED_NODE)
53364 DCI.AddToWorklist(N);
53365 return SDValue(N, 0);
53366 }
53367 if (SDValue NewMask =
53368 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53369 return DAG.getMaskedLoad(
53370 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53371 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53372 Mld->getAddressingMode(), Mld->getExtensionType());
53373 }
53374
53375 return SDValue();
53376}
53377
53378/// If exactly one element of the mask is set for a non-truncating masked store,
53379/// it is a vector extract and scalar store.
53380/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53381/// mask have already been optimized in IR, so we don't bother with those here.
53382 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53383 SelectionDAG &DAG,
53384 const X86Subtarget &Subtarget) {
53385 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53386 // However, some target hooks may need to be added to know when the transform
53387 // is profitable. Endianness would also have to be considered.
53388
53389 SDValue Addr, VecIndex;
53390 Align Alignment;
53391 unsigned Offset;
53392 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53393 return SDValue();
53394
53395 // Extract the one scalar element that is actually being stored.
53396 SDLoc DL(MS);
53397 SDValue Value = MS->getValue();
53398 EVT VT = Value.getValueType();
53399 EVT EltVT = VT.getVectorElementType();
53400 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53401 EltVT = MVT::f64;
53402 EVT CastVT = VT.changeVectorElementType(EltVT);
53403 Value = DAG.getBitcast(CastVT, Value);
53404 }
53405 SDValue Extract =
53406 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53407
53408 // Store that element at the appropriate offset from the base pointer.
53409 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53410 MS->getPointerInfo().getWithOffset(Offset),
53411 Alignment, MS->getMemOperand()->getFlags());
53412}
53413
53414 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53415 TargetLowering::DAGCombinerInfo &DCI,
53416 const X86Subtarget &Subtarget) {
53417 auto *Mst = cast<MaskedStoreSDNode>(N);
53418 if (Mst->isCompressingStore())
53419 return SDValue();
53420
53421 EVT VT = Mst->getValue().getValueType();
53422 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53423
53424 if (Mst->isTruncatingStore())
53425 return SDValue();
53426
53427 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53428 return ScalarStore;
53429
53430 // If the mask value has been legalized to a non-boolean vector, try to
53431 // simplify ops leading up to it. We only demand the MSB of each lane.
53432 SDValue Mask = Mst->getMask();
53433 if (Mask.getScalarValueSizeInBits() != 1) {
53434 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53435 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53436 if (N->getOpcode() != ISD::DELETED_NODE)
53437 DCI.AddToWorklist(N);
53438 return SDValue(N, 0);
53439 }
53440 if (SDValue NewMask =
53441 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53442 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53443 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53444 Mst->getMemoryVT(), Mst->getMemOperand(),
53445 Mst->getAddressingMode());
53446 }
53447
53448 SDValue Value = Mst->getValue();
53449 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53450 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53451 Mst->getMemoryVT())) {
53452 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53453 Mst->getBasePtr(), Mst->getOffset(), Mask,
53454 Mst->getMemoryVT(), Mst->getMemOperand(),
53455 Mst->getAddressingMode(), true);
53456 }
53457
53458 return SDValue();
53459}
53460
53461 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53462 TargetLowering::DAGCombinerInfo &DCI,
53463 const X86Subtarget &Subtarget) {
53464 StoreSDNode *St = cast<StoreSDNode>(N);
53465 EVT StVT = St->getMemoryVT();
53466 SDLoc dl(St);
53467 SDValue StoredVal = St->getValue();
53468 EVT VT = StoredVal.getValueType();
53469 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53470
53471 // Convert a store of vXi1 into a store of iX and a bitcast.
53472 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53473 VT.getVectorElementType() == MVT::i1) {
53474
53475 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53476 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53477
53478 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53479 St->getPointerInfo(), St->getBaseAlign(),
53480 St->getMemOperand()->getFlags());
53481 }
53482
53483 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53484 // This will avoid a copy to k-register.
53485 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53486 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53487 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53488 SDValue Val = StoredVal.getOperand(0);
53489 // We must store zeros to the unused bits.
53490 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53491 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53492 St->getPointerInfo(), St->getBaseAlign(),
53493 St->getMemOperand()->getFlags());
53494 }
53495
53496 // Widen v2i1/v4i1 stores to v8i1.
53497 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53498 Subtarget.hasAVX512()) {
53499 unsigned NumConcats = 8 / VT.getVectorNumElements();
53500 // We must store zeros to the unused bits.
53501 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53502 Ops[0] = StoredVal;
53503 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53504 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53505 St->getPointerInfo(), St->getBaseAlign(),
53506 St->getMemOperand()->getFlags());
53507 }
53508
53509 // Turn vXi1 stores of constants into a scalar store.
53510 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53511 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53512 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53513 // If it's a v64i1 store without 64-bit support, we need two stores.
53514 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53515 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53516 StoredVal->ops().slice(0, 32));
53517 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53518 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53519 StoredVal->ops().slice(32, 32));
53520 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53521
53522 SDValue Ptr0 = St->getBasePtr();
53523 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53524
53525 SDValue Ch0 =
53526 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53527 St->getBaseAlign(), St->getMemOperand()->getFlags());
53528 SDValue Ch1 = DAG.getStore(
53529 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53530 St->getBaseAlign(), St->getMemOperand()->getFlags());
53531 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53532 }
53533
53534 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53535 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53536 St->getPointerInfo(), St->getBaseAlign(),
53537 St->getMemOperand()->getFlags());
53538 }
53539
53540 // Convert scalar fabs/fneg load-store to integer equivalents.
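// (fneg flips the sign bit and fabs clears it, so an xor/and with the sign
// mask on the integer bit pattern gives the same result without an FP op.)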
53541 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53542 (StoredVal.getOpcode() == ISD::FABS ||
53543 StoredVal.getOpcode() == ISD::FNEG) &&
53544 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53545 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53546 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53547 if (TLI.isTypeLegal(IntVT)) {
53548 APInt SignMask = APInt::getSignMask(VT.getSizeInBits());
53549 unsigned SignOp = ISD::XOR;
53550 if (StoredVal.getOpcode() == ISD::FABS) {
53551 SignMask = ~SignMask;
53552 SignOp = ISD::AND;
53553 }
53554 SDValue LogicOp = DAG.getNode(
53555 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53556 DAG.getConstant(SignMask, dl, IntVT));
53557 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53558 St->getPointerInfo(), St->getBaseAlign(),
53559 St->getMemOperand()->getFlags());
53560 }
53561 }
53562
53563 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53564 // Sandy Bridge, perform two 16-byte stores.
53565 unsigned Fast;
53566 if (VT.is256BitVector() && StVT == VT &&
53567 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53568 *St->getMemOperand(), &Fast) &&
53569 !Fast) {
53570 unsigned NumElems = VT.getVectorNumElements();
53571 if (NumElems < 2)
53572 return SDValue();
53573
53574 return splitVectorStore(St, DAG);
53575 }
53576
53577 // Split under-aligned vector non-temporal stores.
53578 if (St->isNonTemporal() && StVT == VT &&
53579 St->getAlign().value() < VT.getStoreSize()) {
53580 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53581 // vectors or the legalizer can scalarize it to use MOVNTI.
53582 if (VT.is256BitVector() || VT.is512BitVector()) {
53583 unsigned NumElems = VT.getVectorNumElements();
53584 if (NumElems < 2)
53585 return SDValue();
53586 return splitVectorStore(St, DAG);
53587 }
53588
53589 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53590 // to use MOVNTI.
53591 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53592 MVT NTVT = Subtarget.hasSSE4A()
53593 ? MVT::v2f64
53594 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53595 return scalarizeVectorStore(St, NTVT, DAG);
53596 }
53597 }
53598
53599 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53600 // supported, but AVX512F is, by extending to v16i32 and truncating.
53601 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53602 St->getValue().getOpcode() == ISD::TRUNCATE &&
53603 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53604 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53605 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53606 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53607 St->getValue().getOperand(0));
53608 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53609 MVT::v16i8, St->getMemOperand());
53610 }
53611
53612 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53613 if (!St->isTruncatingStore() &&
53614 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53615 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53616 StoredVal.hasOneUse() &&
53617 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53618 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53619 return EmitTruncSStore(IsSigned, St->getChain(),
53620 dl, StoredVal.getOperand(0), St->getBasePtr(),
53621 VT, St->getMemOperand(), DAG);
53622 }
53623
53624 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53625 if (!St->isTruncatingStore()) {
53626 auto IsExtractedElement = [](SDValue V) {
53627 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53628 V = V.getOperand(0);
53629 unsigned Opc = V.getOpcode();
53630 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53631 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53632 V.getOperand(0).hasOneUse())
53633 return V.getOperand(0);
53634 return SDValue();
53635 };
53636 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53637 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53638 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53639 SDValue Src = Trunc.getOperand(0);
53640 MVT DstVT = Trunc.getSimpleValueType();
53641 MVT SrcVT = Src.getSimpleValueType();
53642 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53643 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53644 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53645 if (NumTruncBits == VT.getSizeInBits() &&
53646 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53647 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53648 TruncVT, St->getMemOperand());
53649 }
53650 }
53651 }
53652 }
53653
53654 // Optimize trunc store (of multiple scalars) to shuffle and store.
53655 // First, pack all of the elements in one place. Next, store to memory
53656 // in fewer chunks.
53657 if (St->isTruncatingStore() && VT.isVector()) {
53658 if (TLI.isTruncStoreLegal(VT, StVT)) {
53659 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53660 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53661 dl, Val, St->getBasePtr(),
53662 St->getMemoryVT(), St->getMemOperand(), DAG);
53663 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53664 DAG, dl))
53665 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53666 dl, Val, St->getBasePtr(),
53667 St->getMemoryVT(), St->getMemOperand(), DAG);
53668 }
53669
53670 return SDValue();
53671 }
53672
53673 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53674 unsigned AddrSpace = St->getAddressSpace();
53675 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53676 AddrSpace == X86AS::PTR32_UPTR) {
53677 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53678 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53679 SDValue Cast =
53680 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53681 return DAG.getTruncStore(
53682 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53683 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53684 }
53685 }
53686
53687 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53688 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53689 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53690 Subtarget.hasCF() && St->isSimple()) {
53691 SDValue Cmov;
53692 if (StoredVal.getOpcode() == X86ISD::CMOV)
53693 Cmov = StoredVal;
53694 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53695 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53696 Cmov = StoredVal.getOperand(0);
53697 else
53698 return SDValue();
53699
53700 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53701 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53702 return SDValue();
53703
53704 bool InvertCC = false;
53705 SDValue V = SDValue(Ld, 0);
53706 if (V == Cmov.getOperand(1))
53707 InvertCC = true;
53708 else if (V != Cmov.getOperand(0))
53709 return SDValue();
53710
53711 SDVTList Tys = DAG.getVTList(MVT::Other);
53712 SDValue CC = Cmov.getOperand(2);
53713 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53714 if (InvertCC)
53715 CC = DAG.getTargetConstant(
53716 X86::GetOppositeBranchCondition(
53717 static_cast<X86::CondCode>(Cmov.getConstantOperandVal(2))),
53718 dl, MVT::i8);
53719 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53720 Cmov.getOperand(3)};
53721 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53722 St->getMemOperand());
53723 }
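// For example, on a target with the CF feature, IR along the lines of
//   %old = load i64, ptr %p
//   %v   = select i1 %cc, i64 %x, i64 %old
//   store i64 %v, ptr %p
// reaches this combine as a store of a CMOV between %x and the loaded value,
// and is emitted as a single predicated X86ISD::CSTORE of %x (with the
// condition inverted if needed), removing both the load and the CMOV.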
53724
53725 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53726 // the FP state in cases where an emms may be missing.
53727 // A preferable solution to the general problem is to figure out the right
53728 // places to insert EMMS. This qualifies as a quick hack.
53729
53730 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53731 if (VT.getSizeInBits() != 64)
53732 return SDValue();
53733
53734 const Function &F = DAG.getMachineFunction().getFunction();
53735 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53736 bool F64IsLegal =
53737 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53738
53739 if (!F64IsLegal || Subtarget.is64Bit())
53740 return SDValue();
53741
53742 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53743 cast<LoadSDNode>(St->getValue())->isSimple() &&
53744 St->getChain().hasOneUse() && St->isSimple()) {
53745 auto *Ld = cast<LoadSDNode>(St->getValue());
53746
53747 if (!ISD::isNormalLoad(Ld))
53748 return SDValue();
53749
53750 // Avoid the transformation if there are multiple uses of the loaded value.
53751 if (!Ld->hasNUsesOfValue(1, 0))
53752 return SDValue();
53753
53754 SDLoc LdDL(Ld);
53755 SDLoc StDL(N);
53756
53757 // Remove any range metadata as we're converting to f64 load/store.
53758 Ld->getMemOperand()->clearRanges();
53759
53760 // Lower to a single movq load/store pair.
53761 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53762 Ld->getBasePtr(), Ld->getMemOperand());
53763
53764 // Make sure new load is placed in same chain order.
53765 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53766 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53767 St->getMemOperand());
53768 }
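// For example, in 32-bit mode with SSE2 available, an i64 copy such as
//   %v = load i64, ptr %src
//   store i64 %v, ptr %dst
// is rewritten here into an f64 load/store pair, i.e. a single movq-style
// 64-bit move instead of two 32-bit GPR loads and stores.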
53769
53770 // This is similar to the above case, but here we handle a scalar 64-bit
53771 // integer store that is extracted from a vector on a 32-bit target.
53772 // If we have SSE2, then we can treat it like a floating-point double
53773 // to get past legalization. The execution dependencies fixup pass will
53774 // choose the optimal machine instruction for the store if this really is
53775 // an integer or v2f32 rather than an f64.
53776 if (VT == MVT::i64 &&
53777 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53778 SDValue OldExtract = St->getOperand(1);
53779 SDValue ExtOp0 = OldExtract.getOperand(0);
53780 unsigned VecSize = ExtOp0.getValueSizeInBits();
53781 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53782 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53783 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53784 BitCast, OldExtract.getOperand(1));
53785 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53786 St->getPointerInfo(), St->getBaseAlign(),
53787 St->getMemOperand()->getFlags());
53788 }
53789
53790 return SDValue();
53791}
53792
53793 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53794 TargetLowering::DAGCombinerInfo &DCI,
53795 const X86Subtarget &Subtarget) {
53796 auto *St = cast<MemIntrinsicSDNode>(N);
53797
53798 SDValue StoredVal = N->getOperand(1);
53799 MVT VT = StoredVal.getSimpleValueType();
53800 EVT MemVT = St->getMemoryVT();
53801
53802 // Figure out which elements we demand.
53803 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53804 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53805
53806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53807 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53808 if (N->getOpcode() != ISD::DELETED_NODE)
53809 DCI.AddToWorklist(N);
53810 return SDValue(N, 0);
53811 }
53812
53813 return SDValue();
53814}
53815
53816/// Return 'true' if this vector operation is "horizontal"
53817/// and return the operands for the horizontal operation in LHS and RHS. A
53818/// horizontal operation performs the binary operation on successive elements
53819/// of its first operand, then on successive elements of its second operand,
53820/// returning the resulting values in a vector. For example, if
53821/// A = < float a0, float a1, float a2, float a3 >
53822/// and
53823/// B = < float b0, float b1, float b2, float b3 >
53824/// then the result of doing a horizontal operation on A and B is
53825/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53826/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53827/// A horizontal-op B, for some already available A and B, and if so then LHS is
53828/// set to A, RHS to B, and the routine returns 'true'.
53829static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53830 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53831 bool IsCommutative,
53832 SmallVectorImpl<int> &PostShuffleMask,
53833 bool ForceHorizOp) {
53834 // If either operand is undef, bail out. The binop should be simplified.
53835 if (LHS.isUndef() || RHS.isUndef())
53836 return false;
53837
53838 // Look for the following pattern:
53839 // A = < float a0, float a1, float a2, float a3 >
53840 // B = < float b0, float b1, float b2, float b3 >
53841 // and
53842 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53843 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53844 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53845 // which is A horizontal-op B.
53846
53847 MVT VT = LHS.getSimpleValueType();
53848 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53849 "Unsupported vector type for horizontal add/sub");
53850 unsigned NumElts = VT.getVectorNumElements();
53851
53852 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53853 SmallVectorImpl<int> &ShuffleMask) {
53854 bool UseSubVector = false;
53855 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53856 Op.getOperand(0).getValueType().is256BitVector() &&
53857 llvm::isNullConstant(Op.getOperand(1))) {
53858 Op = Op.getOperand(0);
53859 UseSubVector = true;
53860 }
53861 SmallVector<SDValue, 2> SrcOps;
53862 SmallVector<int, 16> SrcMask, ScaledMask;
53863 SDValue BC = peekThroughBitcasts(Op);
53864 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53865 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53866 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53867 })) {
53868 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53869 if (!UseSubVector && SrcOps.size() <= 2 &&
53870 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53871 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53872 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53873 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53874 }
53875 if (UseSubVector && SrcOps.size() == 1 &&
53876 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53877 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53878 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53879 ShuffleMask.assign(Mask.begin(), Mask.end());
53880 }
53881 }
53882 };
53883
53884 // View LHS in the form
53885 // LHS = VECTOR_SHUFFLE A, B, LMask
53886 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53887 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53888 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53889 SDValue A, B;
53890 SmallVector<int, 16> LMask;
53891 GetShuffle(LHS, A, B, LMask);
53892
53893 // Likewise, view RHS in the form
53894 // RHS = VECTOR_SHUFFLE C, D, RMask
53895 SDValue C, D;
53896 SmallVector<int, 16> RMask;
53897 GetShuffle(RHS, C, D, RMask);
53898
53899 // At least one of the operands should be a vector shuffle.
53900 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53901 if (NumShuffles == 0)
53902 return false;
53903
53904 if (LMask.empty()) {
53905 A = LHS;
53906 for (unsigned i = 0; i != NumElts; ++i)
53907 LMask.push_back(i);
53908 }
53909
53910 if (RMask.empty()) {
53911 C = RHS;
53912 for (unsigned i = 0; i != NumElts; ++i)
53913 RMask.push_back(i);
53914 }
53915
53916 // If we have a unary mask, ensure the other op is set to null.
53917 if (isUndefOrInRange(LMask, 0, NumElts))
53918 B = SDValue();
53919 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53920 A = SDValue();
53921
53922 if (isUndefOrInRange(RMask, 0, NumElts))
53923 D = SDValue();
53924 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53925 C = SDValue();
53926
53927 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53928 // RHS operands and shuffle mask.
53929 if (A != C) {
53930 std::swap(C, D);
53931 ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
53932 }
53933 // Check that the shuffles are both shuffling the same vectors.
53934 if (!(A == C && B == D))
53935 return false;
53936
53937 PostShuffleMask.clear();
53938 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53939
53940 // LHS and RHS are now:
53941 // LHS = shuffle A, B, LMask
53942 // RHS = shuffle A, B, RMask
53943 // Check that the masks correspond to performing a horizontal operation.
53944 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53945 // so we just repeat the inner loop if this is a 256-bit op.
53946 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53947 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53948 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53949 assert((NumEltsPer128BitChunk % 2 == 0) &&
53950 "Vector type should have an even number of elements in each lane");
53951 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53952 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53953 // Ignore undefined components.
53954 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53955 if (LIdx < 0 || RIdx < 0 ||
53956 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53957 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53958 continue;
53959
53960 // Check that successive odd/even elements are being operated on. If not,
53961 // this is not a horizontal operation.
53962 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53963 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53964 return false;
53965
53966 // Compute the post-shuffle mask index based on where the element
53967 // is stored in the HOP result, and where it needs to be moved to.
53968 int Base = LIdx & ~1u;
53969 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53970 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53971
53972 // The low half of the 128-bit result must choose from A.
53973 // The high half of the 128-bit result must choose from B,
53974 // unless B is undef. In that case, we are always choosing from A.
53975 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53976 Index += NumEltsPer64BitChunk;
53977 PostShuffleMask[i + j] = Index;
53978 }
53979 }
53980
53981 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53982 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53983
53984 bool IsIdentityPostShuffle =
53985 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53986 if (IsIdentityPostShuffle)
53987 PostShuffleMask.clear();
53988
53989 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53990 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53991 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53992 return false;
53993
53994 // If the source nodes are already used in HorizOps then always accept this.
53995 // Shuffle folding should merge these back together.
53996 auto FoundHorizUser = [&](SDNode *User) {
53997 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53998 };
53999 ForceHorizOp =
54000 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54001 llvm::any_of(NewRHS->users(), FoundHorizUser));
54002
54003 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54004 // shuffle the result.
54005 if (!ForceHorizOp &&
54006 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54007 (NumShuffles < 2 || !IsIdentityPostShuffle),
54008 DAG, Subtarget))
54009 return false;
54010
54011 LHS = DAG.getBitcast(VT, NewLHS);
54012 RHS = DAG.getBitcast(VT, NewRHS);
54013 return true;
54014}
54015
54016// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
54017 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54018 const X86Subtarget &Subtarget) {
54019 EVT VT = N->getValueType(0);
54020 unsigned Opcode = N->getOpcode();
54021 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54022 SmallVector<int, 8> PostShuffleMask;
54023
54024 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54025 return N->hasOneUse() &&
54026 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54027 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54028 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54029 };
54030
54031 switch (Opcode) {
54032 case ISD::FADD:
54033 case ISD::FSUB:
54034 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54035 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54036 SDValue LHS = N->getOperand(0);
54037 SDValue RHS = N->getOperand(1);
54038 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54039 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54040 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54041 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54042 if (!PostShuffleMask.empty())
54043 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54044 DAG.getUNDEF(VT), PostShuffleMask);
54045 return HorizBinOp;
54046 }
54047 }
54048 break;
54049 case ISD::ADD:
54050 case ISD::SUB:
54051 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54052 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54053 SDValue LHS = N->getOperand(0);
54054 SDValue RHS = N->getOperand(1);
54055 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54056 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54057 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54058 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54059 ArrayRef<SDValue> Ops) {
54060 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54061 };
54062 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54063 {LHS, RHS}, HOpBuilder);
54064 if (!PostShuffleMask.empty())
54065 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54066 DAG.getUNDEF(VT), PostShuffleMask);
54067 return HorizBinOp;
54068 }
54069 }
54070 break;
54071 }
54072
54073 return SDValue();
54074}
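// For example, with SSE3 and v4f32 operands A and B:
//   LHS = vector_shuffle A, B, <0,2,4,6>
//   RHS = vector_shuffle A, B, <1,3,5,7>
//   fadd LHS, RHS
// is matched above and emitted as a single (X86ISD::FHADD A, B), producing
// <a0+a1, a2+a3, b0+b1, b2+b3> (HADDPS).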
54075
54076// Try to combine the following nodes
54077// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54078// <i32 -2147483648[float -0.000000e+00]> 0
54079// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54080// <(load 4 from constant-pool)> t0, t29
54081// [t30: v16i32 = bitcast t27]
54082// t6: v16i32 = xor t7, t27[t30]
54083// t11: v16f32 = bitcast t6
54084// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54085// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54086// t22: v16f32 = bitcast t7
54087// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54088// t24: v32f16 = bitcast t23
54089 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54090 const X86Subtarget &Subtarget) {
54091 EVT VT = N->getValueType(0);
54092 SDValue LHS = N->getOperand(0);
54093 SDValue RHS = N->getOperand(1);
54094 int CombineOpcode =
54095 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54096 auto combineConjugation = [&](SDValue &r) {
54097 if (LHS->getOpcode() == ISD::BITCAST) {
54098 SDValue XOR = LHS.getOperand(0);
54099 if (XOR->getOpcode() == ISD::XOR) {
54100 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54101 if (XORRHS.isConstant()) {
54102 APInt ConjugationInt32 = APInt(32, 0x80000000);
54103 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54104 if ((XORRHS.getBitWidth() == 32 &&
54105 XORRHS.getConstant() == ConjugationInt32) ||
54106 (XORRHS.getBitWidth() == 64 &&
54107 XORRHS.getConstant() == ConjugationInt64)) {
54108 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54109 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54110 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54111 r = DAG.getBitcast(VT, FCMulC);
54112 return true;
54113 }
54114 }
54115 }
54116 }
54117 return false;
54118 };
54119 SDValue Res;
54120 if (combineConjugation(Res))
54121 return Res;
54122 std::swap(LHS, RHS);
54123 if (combineConjugation(Res))
54124 return Res;
54125 return Res;
54126}
54127
54128// Try to combine the following nodes:
54129// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54130 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54131 const X86Subtarget &Subtarget) {
54132 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54133 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54134 Flags.hasAllowContract();
54135 };
54136
54137 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54138 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54139 Flags.hasNoSignedZeros();
54140 };
54141 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54142 APInt AI = APInt(32, 0x80008000);
54143 KnownBits Bits = DAG.computeKnownBits(Op);
54144 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54145 Bits.getConstant() == AI;
54146 };
54147
54148 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54149 !AllowContract(N->getFlags()))
54150 return SDValue();
54151
54152 EVT VT = N->getValueType(0);
54153 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54154 return SDValue();
54155
54156 SDValue LHS = N->getOperand(0);
54157 SDValue RHS = N->getOperand(1);
54158 bool IsConj;
54159 SDValue FAddOp1, MulOp0, MulOp1;
54160 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54161 &IsVectorAllNegativeZero,
54162 &HasNoSignedZero](SDValue N) -> bool {
54163 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54164 return false;
54165 SDValue Op0 = N.getOperand(0);
54166 unsigned Opcode = Op0.getOpcode();
54167 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54168 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54169 MulOp0 = Op0.getOperand(0);
54170 MulOp1 = Op0.getOperand(1);
54171 IsConj = Opcode == X86ISD::VFCMULC;
54172 return true;
54173 }
54174 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54175 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54176 HasNoSignedZero(Op0->getFlags())) ||
54177 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54178 MulOp0 = Op0.getOperand(0);
54179 MulOp1 = Op0.getOperand(1);
54180 IsConj = Opcode == X86ISD::VFCMADDC;
54181 return true;
54182 }
54183 }
54184 return false;
54185 };
54186
54187 if (GetCFmulFrom(LHS))
54188 FAddOp1 = RHS;
54189 else if (GetCFmulFrom(RHS))
54190 FAddOp1 = LHS;
54191 else
54192 return SDValue();
54193
54194 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54195 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54196 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54197 // FIXME: How do we handle when fast math flags of FADD are different from
54198 // CFMUL's?
54199 SDValue CFmul =
54200 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54201 return DAG.getBitcast(VT, CFmul);
54202}
54203
54204/// Do target-specific dag combines on floating-point adds/subs.
54205 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54206 const X86Subtarget &Subtarget) {
54207 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54208 return HOp;
54209
54210 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54211 return COp;
54212
54213 return SDValue();
54214}
54215
54216 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54217 const X86Subtarget &Subtarget) {
54218 EVT VT = N->getValueType(0);
54219 SDValue Src = N->getOperand(0);
54220 EVT SrcVT = Src.getValueType();
54221 SDLoc DL(N);
54222
54223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54224
54225 // Let legalize expand this if it isn't a legal type yet.
54226 if (!TLI.isTypeLegal(VT))
54227 return SDValue();
54228
54229 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54230 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54231 return SDValue();
54232
54233 if (SrcVT == MVT::v2f16) {
54234 SrcVT = MVT::v4f16;
54235 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54236 DAG.getUNDEF(MVT::v2f16));
54237 }
54238
54239 if (SrcVT == MVT::v4f16) {
54240 SrcVT = MVT::v8f16;
54241 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54242 DAG.getUNDEF(MVT::v4f16));
54243 } else if (SrcVT == MVT::v2f32) {
54244 SrcVT = MVT::v4f32;
54245 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54246 DAG.getUNDEF(MVT::v2f32));
54247 } else {
54248 return SDValue();
54249 }
54250
54251 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54252}
54253
54254// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54255// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54256// are able to avoid generating code with MOVABS and large constants in certain
54257// cases.
54258 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54259 const SDLoc &DL) {
54260 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54261 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54262 if (!ValidSrlConst)
54263 return SDValue();
54264 unsigned SrlConstVal = *ValidSrlConst;
54265
54266 SDValue Op = N.getOperand(0);
54267 unsigned Opcode = Op.getOpcode();
54268 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54269 "Illegal truncation types");
54270
54271 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54272 !isa<ConstantSDNode>(Op.getOperand(1)))
54273 return SDValue();
54274 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54275
54276 if (SrlConstVal <= 32 ||
54277 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54278 return SDValue();
54279
54280 SDValue OpLhsSrl =
54281 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54282 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54283
54284 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54285 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54286 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54287
54288 if (Opcode == ISD::ADD) {
54289 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54290 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54291 }
54292 return NewOpNode;
54293}
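// For example, with a shift amount greater than 32 such as 40:
//   (i32 (trunc (srl (or X:i64, 0xABCD00000000), 40)))
// becomes
//   (or (i32 (trunc (srl X, 40))), 0xAB)
// so the large 64-bit immediate no longer has to be materialized with MOVABS.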
54294
54295/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54296/// the codegen.
54297/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54298/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54299/// anything that is guaranteed to be transformed by DAGCombiner.
54300 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54301 const X86Subtarget &Subtarget,
54302 const SDLoc &DL) {
54303 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54304 SDValue Src = N->getOperand(0);
54305 unsigned SrcOpcode = Src.getOpcode();
54306 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54307
54308 EVT VT = N->getValueType(0);
54309 EVT SrcVT = Src.getValueType();
54310
54311 auto IsFreeTruncation = [VT](SDValue Op) {
54312 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54313
54314 // See if this has been extended from a smaller/equal size to
54315 // the truncation size, allowing a truncation to combine with the extend.
54316 unsigned Opcode = Op.getOpcode();
54317 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54318 Opcode == ISD::ZERO_EXTEND) &&
54319 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54320 return true;
54321
54322 // See if this is a single use constant which can be constant folded.
54323 // NOTE: We don't peek through bitcasts here because there is currently
54324 // no support for constant folding truncate+bitcast+vector_of_constants. So
54325 // we'll just end up with a truncate on both operands, which will
54326 // get turned back into (truncate (binop)) causing an infinite loop.
54327 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54328 };
54329
54330 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54331 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54332 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54333 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54334 };
54335
54336 // Don't combine if the operation has other uses.
54337 if (!Src.hasOneUse())
54338 return SDValue();
54339
54340 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54341 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54342
54343 if (!VT.isVector())
54344 return SDValue();
54345
54346 // In most cases it's only worth pre-truncating if we're only facing the cost
54347 // of one truncation.
54348 // i.e. if one of the inputs will constant fold or the input is repeated.
54349 switch (SrcOpcode) {
54350 case ISD::MUL:
54351 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54352 // better to truncate if we have the chance.
54353 if (SrcVT.getScalarType() == MVT::i64 &&
54354 TLI.isOperationLegal(SrcOpcode, VT) &&
54355 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54356 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54357 [[fallthrough]];
54358 case ISD::AND:
54359 case ISD::XOR:
54360 case ISD::OR:
54361 case ISD::ADD:
54362 case ISD::SUB: {
54363 SDValue Op0 = Src.getOperand(0);
54364 SDValue Op1 = Src.getOperand(1);
54365 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54366 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54367 return TruncateArithmetic(Op0, Op1);
54368 break;
54369 }
54370 }
54371
54372 return SDValue();
54373}
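// For example, (v4i32 (trunc (mul (zext X:v4i32 to v4i64), SplatC:v4i64)))
// can be rewritten as (mul X, (trunc SplatC)): the low 32 bits of the product
// only depend on the low 32 bits of each operand, and the v4i64 multiply is
// far more expensive than the v4i32 one before AVX512DQ.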
54374
54375// Try to form a MULHU or MULHS node by looking for
54376// (trunc (srl (mul ext, ext), >= 16))
54377// TODO: This is X86 specific because we want to be able to handle wide types
54378// before type legalization. But we can only do it if the vector will be
54379// legalized via widening/splitting. Type legalization can't handle promotion
54380// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54381// combiner.
54382static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54383 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54384 using namespace llvm::SDPatternMatch;
54385
54386 if (!Subtarget.hasSSE2())
54387 return SDValue();
54388
54389 // Only handle vXi16 types that are at least 128-bits unless they will be
54390 // widened.
54391 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54392 return SDValue();
54393
54394 // Input type should be at least vXi32.
54395 EVT InVT = Src.getValueType();
54396 if (InVT.getVectorElementType().getSizeInBits() < 32)
54397 return SDValue();
54398
54399 // First instruction should be a right shift by 16 of a multiply.
54400 SDValue LHS, RHS;
54401 APInt ShiftAmt;
54402 if (!sd_match(Src,
54403 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54404 return SDValue();
54405
54406 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54407 return SDValue();
54408
54409 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54410
54411 // Count leading sign/zero bits on both inputs - if there are enough then
54412 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54413 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54414 // truncations may actually be free by peeking through to the ext source.
54415 auto IsSext = [&DAG](SDValue V) {
54416 return DAG.ComputeMaxSignificantBits(V) <= 16;
54417 };
54418 auto IsZext = [&DAG](SDValue V) {
54419 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54420 };
54421
54422 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54423 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54424 if (!IsSigned && !IsUnsigned)
54425 return SDValue();
54426
54427 // Check if both inputs are extensions, which will be removed by truncation.
54428 auto isOpTruncateFree = [](SDValue Op) {
54429 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54430 Op.getOpcode() == ISD::ZERO_EXTEND)
54431 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54432 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54433 };
54434 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54435
54436 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54437 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54438 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54439 // will have to split anyway.
54440 unsigned InSizeInBits = InVT.getSizeInBits();
54441 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54442 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54443 (InSizeInBits % 16) == 0) {
54444 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54445 InVT.getSizeInBits() / 16);
54446 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54447 DAG.getBitcast(BCVT, RHS));
54448 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54449 return DAG.getNode(ISD::SRL, DL, VT, Res,
54450 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54451 }
54452
54453 // Truncate back to source type.
54454 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54455 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54456
54457 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54458 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54459 return DAG.getNode(ISD::SRL, DL, VT, Res,
54460 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54461}
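// For example, if A and B are zero-extended from vXi16, then
//   (trunc (srl (mul (zext A), (zext B)), 16))
// is matched here as (mulhu A, B), i.e. a single PMULHUW, instead of a wide
// multiply followed by a shift and a truncation.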
54462
54463// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54464// from one vector with signed bytes from another vector, adds together
54465// adjacent pairs of 16-bit products, and saturates the result before
54466// truncating to 16-bits.
54467//
54468// Which looks something like this:
54469// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54470// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54471 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54472 const X86Subtarget &Subtarget,
54473 const SDLoc &DL) {
54474 if (!VT.isVector() || !Subtarget.hasSSSE3())
54475 return SDValue();
54476
54477 unsigned NumElems = VT.getVectorNumElements();
54478 EVT ScalarVT = VT.getVectorElementType();
54479 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54480 return SDValue();
54481
54482 SDValue SSatVal = detectSSatPattern(In, VT);
54483 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54484 return SDValue();
54485
54486 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54487 // of multiplies from even/odd elements.
54488 SDValue N0 = SSatVal.getOperand(0);
54489 SDValue N1 = SSatVal.getOperand(1);
54490
54491 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54492 return SDValue();
54493
54494 SDValue N00 = N0.getOperand(0);
54495 SDValue N01 = N0.getOperand(1);
54496 SDValue N10 = N1.getOperand(0);
54497 SDValue N11 = N1.getOperand(1);
54498
54499 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54500 // Canonicalize zero_extend to LHS.
54501 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54502 std::swap(N00, N01);
54503 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54504 std::swap(N10, N11);
54505
54506 // Ensure we have a zero_extend and a sign_extend.
54507 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54508 N01.getOpcode() != ISD::SIGN_EXTEND ||
54509 N10.getOpcode() != ISD::ZERO_EXTEND ||
54510 N11.getOpcode() != ISD::SIGN_EXTEND)
54511 return SDValue();
54512
54513 // Peek through the extends.
54514 N00 = N00.getOperand(0);
54515 N01 = N01.getOperand(0);
54516 N10 = N10.getOperand(0);
54517 N11 = N11.getOperand(0);
54518
54519 // Ensure the extend is from vXi8.
54520 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54521 N01.getValueType().getVectorElementType() != MVT::i8 ||
54522 N10.getValueType().getVectorElementType() != MVT::i8 ||
54523 N11.getValueType().getVectorElementType() != MVT::i8)
54524 return SDValue();
54525
54526 // All inputs should be build_vectors.
54527 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54528 N01.getOpcode() != ISD::BUILD_VECTOR ||
54529 N10.getOpcode() != ISD::BUILD_VECTOR ||
54530 N11.getOpcode() != ISD::BUILD_VECTOR)
54531 return SDValue();
54532
54533 // N00/N10 are zero extended. N01/N11 are sign extended.
54534
54535 // For each element, we need to ensure we have an odd element from one vector
54536 // multiplied by the odd element of another vector and the even element from
54537 // one of the same vectors being multiplied by the even element from the
54538 // other vector. So we need to make sure for each element i, this operation
54539 // is being performed:
54540 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54541 SDValue ZExtIn, SExtIn;
54542 for (unsigned i = 0; i != NumElems; ++i) {
54543 SDValue N00Elt = N00.getOperand(i);
54544 SDValue N01Elt = N01.getOperand(i);
54545 SDValue N10Elt = N10.getOperand(i);
54546 SDValue N11Elt = N11.getOperand(i);
54547 // TODO: Be more tolerant to undefs.
54548 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54549 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54550 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54551 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54552 return SDValue();
54553 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54554 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54555 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54556 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54557 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54558 return SDValue();
54559 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54560 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54561 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54562 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54563 // Add is commutative so indices can be reordered.
54564 if (IdxN00 > IdxN10) {
54565 std::swap(IdxN00, IdxN10);
54566 std::swap(IdxN01, IdxN11);
54567 }
54568 // N0 indices must be the even element. N1 indices must be the next odd element.
54569 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54570 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54571 return SDValue();
54572 SDValue N00In = N00Elt.getOperand(0);
54573 SDValue N01In = N01Elt.getOperand(0);
54574 SDValue N10In = N10Elt.getOperand(0);
54575 SDValue N11In = N11Elt.getOperand(0);
54576 // First time we find an input capture it.
54577 if (!ZExtIn) {
54578 ZExtIn = N00In;
54579 SExtIn = N01In;
54580 }
54581 if (ZExtIn != N00In || SExtIn != N01In ||
54582 ZExtIn != N10In || SExtIn != N11In)
54583 return SDValue();
54584 }
54585
54586 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54587 EVT ExtVT = Ext.getValueType();
54588 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54589 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54590 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54591 DAG.getVectorIdxConstant(0, DL));
54592 }
54593 };
54594 ExtractVec(ZExtIn);
54595 ExtractVec(SExtIn);
54596
54597 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54598 ArrayRef<SDValue> Ops) {
54599 // Shrink by adding truncate nodes and let DAGCombine fold with the
54600 // sources.
54601 EVT InVT = Ops[0].getValueType();
54602 assert(InVT.getScalarType() == MVT::i8 &&
54603 "Unexpected scalar element type");
54604 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54605 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54606 InVT.getVectorNumElements() / 2);
54607 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54608 };
54609 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54610 PMADDBuilder);
54611}
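// In scalar terms, each i16 output element produced by the VPMADDUBSW built
// above is
//   r[i] = sat_i16(zext(A[2*i]) * sext(B[2*i]) +
//                  zext(A[2*i+1]) * sext(B[2*i+1]))
// which is exactly the (ssat (add (mul ...), (mul ...))) pattern matched by
// this function.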
54612
54613 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54614 const X86Subtarget &Subtarget) {
54615 EVT VT = N->getValueType(0);
54616 SDValue Src = N->getOperand(0);
54617 SDLoc DL(N);
54618
54619 // Attempt to pre-truncate inputs to arithmetic ops instead.
54620 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54621 return V;
54622
54623 // Try to detect PMADD
54624 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54625 return PMAdd;
54626
54627 // Try to combine truncation with signed/unsigned saturation.
54628 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54629 return Val;
54630
54631 // Try to combine PMULHUW/PMULHW for vXi16.
54632 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54633 return V;
54634
54635 // The bitcast source is a direct mmx result.
54636 // Detect a truncated bitcast from x86mmx to i32.
54637 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54638 SDValue BCSrc = Src.getOperand(0);
54639 if (BCSrc.getValueType() == MVT::x86mmx)
54640 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54641 }
54642
54643 return SDValue();
54644}
54645
54646 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54647 TargetLowering::DAGCombinerInfo &DCI) {
54648 EVT VT = N->getValueType(0);
54649 SDValue In = N->getOperand(0);
54650 SDLoc DL(N);
54651
54652 if (SDValue SSatVal = detectSSatPattern(In, VT))
54653 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54654 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54655 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54656
54657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54658 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54659 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54660 return SDValue(N, 0);
54661
54662 return SDValue();
54663}
54664
54665 /// Returns the negated value if the node \p N flips the sign of an FP value.
54666///
54667/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54668/// or FSUB(0, x)
54669/// AVX512F does not have FXOR, so FNEG is lowered as
54670/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54671 /// In this case we go through all bitcasts.
54672/// This also recognizes splat of a negated value and returns the splat of that
54673/// value.
54674static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54675 if (N->getOpcode() == ISD::FNEG)
54676 return N->getOperand(0);
54677
54678 // Don't recurse exponentially.
54679 if (Depth > SelectionDAG::MaxRecursionDepth)
54680 return SDValue();
54681
54682 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54683
54684 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54685 EVT VT = Op->getValueType(0);
54686
54687 // Make sure the element size doesn't change.
54688 if (VT.getScalarSizeInBits() != ScalarSize)
54689 return SDValue();
54690
54691 unsigned Opc = Op.getOpcode();
54692 switch (Opc) {
54693 case ISD::VECTOR_SHUFFLE: {
54694 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54695 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54696 if (!Op.getOperand(1).isUndef())
54697 return SDValue();
54698 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54699 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54700 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54701 cast<ShuffleVectorSDNode>(Op)->getMask());
54702 break;
54703 }
54704 case ISD::INSERT_VECTOR_ELT: {
54705 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54706 // -V, INDEX).
54707 SDValue InsVector = Op.getOperand(0);
54708 SDValue InsVal = Op.getOperand(1);
54709 if (!InsVector.isUndef())
54710 return SDValue();
54711 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54712 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54713 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54714 NegInsVal, Op.getOperand(2));
54715 break;
54716 }
54717 case ISD::FSUB:
54718 case ISD::XOR:
54719 case X86ISD::FXOR: {
54720 SDValue Op1 = Op.getOperand(1);
54721 SDValue Op0 = Op.getOperand(0);
54722
54723 // For XOR and FXOR, we want to check if constant
54724 // bits of Op1 are sign bit masks. For FSUB, we
54725 // have to check if constant bits of Op0 are sign
54726 // bit masks and hence we swap the operands.
54727 if (Opc == ISD::FSUB)
54728 std::swap(Op0, Op1);
54729
54730 APInt UndefElts;
54731 SmallVector<APInt, 16> EltBits;
54732 // Extract constant bits and see if they are all
54733 // sign bit masks. Ignore the undef elements.
54734 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54735 /* AllowWholeUndefs */ true,
54736 /* AllowPartialUndefs */ false)) {
54737 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54738 if (!UndefElts[I] && !EltBits[I].isSignMask())
54739 return SDValue();
54740
54741 // Only allow bitcast from correctly-sized constant.
54742 Op0 = peekThroughBitcasts(Op0);
54743 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54744 return Op0;
54745 }
54746 break;
54747 } // case
54748 } // switch
54749
54750 return SDValue();
54751}
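// For example, on AVX512F targets the negation of X:v16f32 commonly appears
// as
//   (v16f32 (bitcast (xor (bitcast X to v16i32), splat 0x80000000)))
// and is recognized here as -X even though no FNEG node is present.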
54752
54753static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54754 bool NegRes) {
54755 if (NegMul) {
54756 switch (Opcode) {
54757 // clang-format off
54758 default: llvm_unreachable("Unexpected opcode");
54759 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54760 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54761 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54762 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54763 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54764 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54765 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54766 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54767 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54768 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54769 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54770 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54771 // clang-format on
54772 }
54773 }
54774
54775 if (NegAcc) {
54776 switch (Opcode) {
54777 // clang-format off
54778 default: llvm_unreachable("Unexpected opcode");
54779 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54780 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54781 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54782 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54783 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54784 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54785 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54786 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54787 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54788 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54789 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54790 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54791 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54792 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54793 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54794 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54795 // clang-format on
54796 }
54797 }
54798
54799 if (NegRes) {
54800 switch (Opcode) {
54801 // For accuracy reasons, we never combine fneg and fma under strict FP.
54802 // clang-format off
54803 default: llvm_unreachable("Unexpected opcode");
54804 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54805 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54806 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54807 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54808 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54809 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54810 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54811 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54812 // clang-format on
54813 }
54814 }
54815
54816 return Opcode;
54817}
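// For example, negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false,
// /*NegRes=*/false) returns X86ISD::FNMADD (-(a*b) + c), and additionally
// negating the accumulator yields X86ISD::FNMSUB (-(a*b) - c).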
54818
54819/// Do target-specific dag combines on floating point negations.
54820 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54821 TargetLowering::DAGCombinerInfo &DCI,
54822 const X86Subtarget &Subtarget) {
54823 EVT OrigVT = N->getValueType(0);
54824 SDValue Arg = isFNEG(DAG, N);
54825 if (!Arg)
54826 return SDValue();
54827
54828 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54829 EVT VT = Arg.getValueType();
54830 EVT SVT = VT.getScalarType();
54831 SDLoc DL(N);
54832
54833 // Let legalize expand this if it isn't a legal type yet.
54834 if (!TLI.isTypeLegal(VT))
54835 return SDValue();
54836
54837 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54838 // use of a constant by performing (-0 - A*B) instead.
54839 // FIXME: Check rounding control flags as well once it becomes available.
54840 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54841 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54842 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54843 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54844 Arg.getOperand(1), Zero);
54845 return DAG.getBitcast(OrigVT, NewNode);
54846 }
54847
54848 bool CodeSize = DAG.shouldOptForSize();
54849 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54850 if (SDValue NegArg =
54851 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54852 return DAG.getBitcast(OrigVT, NegArg);
54853
54854 return SDValue();
54855}
54856
54857 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54858 bool LegalOperations,
54859 bool ForCodeSize,
54860 NegatibleCost &Cost,
54861 unsigned Depth) const {
54862 // fneg patterns are removable even if they have multiple uses.
54863 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54864 Cost = NegatibleCost::Cheaper;
54865 return DAG.getBitcast(Op.getValueType(), Arg);
54866 }
54867
54868 EVT VT = Op.getValueType();
54869 EVT SVT = VT.getScalarType();
54870 unsigned Opc = Op.getOpcode();
54871 SDNodeFlags Flags = Op.getNode()->getFlags();
54872 switch (Opc) {
54873 case ISD::FMA:
54874 case X86ISD::FMSUB:
54875 case X86ISD::FNMADD:
54876 case X86ISD::FNMSUB:
54877 case X86ISD::FMADD_RND:
54878 case X86ISD::FMSUB_RND:
54879 case X86ISD::FNMADD_RND:
54880 case X86ISD::FNMSUB_RND: {
54881 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54882 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54883 !isOperationLegal(ISD::FMA, VT))
54884 break;
54885
54886 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54887 // if it may have signed zeros.
54888 if (!Flags.hasNoSignedZeros())
54889 break;
54890
54891 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54892 // keep temporary nodes alive.
54893 std::list<HandleSDNode> Handles;
54894
54895 // This is always negatible for free but we might be able to remove some
54896 // extra operand negations as well.
54897 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54898 for (int i = 0; i != 3; ++i) {
54899 NewOps[i] = getCheaperNegatedExpression(
54900 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54901 if (!!NewOps[i])
54902 Handles.emplace_back(NewOps[i]);
54903 }
54904
54905 bool NegA = !!NewOps[0];
54906 bool NegB = !!NewOps[1];
54907 bool NegC = !!NewOps[2];
54908 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54909
54910 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54911 : NegatibleCost::Neutral;
54912
54913 // Fill in the non-negated ops with the original values.
54914 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54915 if (!NewOps[i])
54916 NewOps[i] = Op.getOperand(i);
54917 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54918 }
54919 case X86ISD::FRCP:
54920 if (SDValue NegOp0 =
54921 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54922 ForCodeSize, Cost, Depth + 1))
54923 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54924 break;
54925 }
54926
54927 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54928 ForCodeSize, Cost, Depth);
54929}
54930
54931 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54932 const X86Subtarget &Subtarget) {
54933 MVT VT = N->getSimpleValueType(0);
54934 // If we have integer vector types available, use the integer opcodes.
54935 if (!VT.isVector() || !Subtarget.hasSSE2())
54936 return SDValue();
54937
54938 SDLoc dl(N);
54939 MVT IntVT = VT.changeVectorElementTypeToInteger();
54940 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54941 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54942 unsigned IntOpcode;
54943 switch (N->getOpcode()) {
54944 // clang-format off
54945 default: llvm_unreachable("Unexpected FP logic op");
54946 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54947 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54948 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54949 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54950 // clang-format on
54951 }
54952 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54953 return DAG.getBitcast(VT, IntOp);
54954}
54955
54956/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54957 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54958 if (N->getOpcode() != ISD::XOR)
54959 return SDValue();
54960
54961 SDValue LHS = N->getOperand(0);
54962 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54963 return SDValue();
54964
54965 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54966 X86::CondCode(LHS->getConstantOperandVal(0)));
54967 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54968}
54969
54970 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54971 const X86Subtarget &Subtarget) {
54972 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54973 "Invalid opcode for combining with CTLZ");
54974 if (Subtarget.hasFastLZCNT())
54975 return SDValue();
54976
54977 EVT VT = N->getValueType(0);
54978 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54979 (VT != MVT::i64 || !Subtarget.is64Bit()))
54980 return SDValue();
54981
54982 SDValue N0 = N->getOperand(0);
54983 SDValue N1 = N->getOperand(1);
54984
54985 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54986 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54987 return SDValue();
54988
54989 SDValue OpCTLZ;
54990 SDValue OpSizeTM1;
54991
54992 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54993 OpCTLZ = N1;
54994 OpSizeTM1 = N0;
54995 } else if (N->getOpcode() == ISD::SUB) {
54996 return SDValue();
54997 } else {
54998 OpCTLZ = N0;
54999 OpSizeTM1 = N1;
55000 }
55001
55002 if (!OpCTLZ.hasOneUse())
55003 return SDValue();
55004 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55005 if (!C)
55006 return SDValue();
55007
55008 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55009 return SDValue();
55010 EVT OpVT = VT;
55011 SDValue Op = OpCTLZ.getOperand(0);
55012 if (VT == MVT::i8) {
55013 // Zero extend to i32 since there is not an i8 bsr.
55014 OpVT = MVT::i32;
55015 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55016 }
55017
55018 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55019 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55020 if (VT == MVT::i8)
55021 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55022
55023 return Op;
55024}
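// For example, for X:i32 on a target without fast LZCNT:
//   (xor (ctlz_zero_undef X), 31)
// computes 31 - ctlz(X), the index of the highest set bit, so it is replaced
// by a single (X86ISD::BSR X).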
55025
55026 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55027 TargetLowering::DAGCombinerInfo &DCI,
55028 const X86Subtarget &Subtarget) {
55029 SDValue N0 = N->getOperand(0);
55030 SDValue N1 = N->getOperand(1);
55031 EVT VT = N->getValueType(0);
55032 SDLoc DL(N);
55033
55034 // If this is SSE1 only convert to FXOR to avoid scalarization.
55035 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55036 return DAG.getBitcast(MVT::v4i32,
55037 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55038 DAG.getBitcast(MVT::v4f32, N0),
55039 DAG.getBitcast(MVT::v4f32, N1)));
55040 }
55041
55042 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55043 return Cmp;
55044
55045 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55046 return R;
55047
55048 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55049 return R;
55050
55051 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55052 return R;
55053
55054 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55055 DAG, DCI, Subtarget))
55056 return FPLogic;
55057
55058 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55059 return R;
55060
55061 if (DCI.isBeforeLegalizeOps())
55062 return SDValue();
55063
55064 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55065 return SetCC;
55066
55067 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55068 return R;
55069
55070 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55071 return RV;
55072
55073 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55074 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55075 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55076 N0.getOperand(0).getValueType().isVector() &&
55077 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55078 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55079 return DAG.getBitcast(
55080 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55081 }
55082
55083 // Handle AVX512 mask widening.
55084 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55085 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55086 VT.getVectorElementType() == MVT::i1 &&
55087 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55088 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55089 return DAG.getNode(
55090 ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
55091 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55092 N0.getOperand(2));
55093 }
55094
55095 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55096 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55097 // TODO: Under what circumstances could this be performed in DAGCombine?
55098 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55099 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55100 SDValue TruncExtSrc = N0.getOperand(0);
55101 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55102 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55103 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55104 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55105 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55106 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55107 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55108 }
55109 }
55110
55111 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55112 return R;
55113
55114 return combineFneg(N, DAG, DCI, Subtarget);
55115}
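// Worked example of the zext/trunc reassociation above (informal sketch):
//   xor (zext i8 (xor i8 %x, 0x0F) to i32), 0xF0
// becomes
//   xor (zext i8 %x to i32), (xor 0x0F, 0xF0)  ==  xor (zext %x), 0xFF
// because the inner constant zero-extends for free and the two constants
// then fold into a single mask.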
55116
55117static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55118 TargetLowering::DAGCombinerInfo &DCI,
55119 const X86Subtarget &Subtarget) {
55120 SDValue N0 = N->getOperand(0);
55121 EVT VT = N->getValueType(0);
55122
55123 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55124 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55125 SDValue Src = N0.getOperand(0);
55126 EVT SrcVT = Src.getValueType();
55127 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55128 (DCI.isBeforeLegalize() ||
55129 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55130 Subtarget.hasSSSE3()) {
55131 unsigned NumElts = SrcVT.getVectorNumElements();
55132 SmallVector<int, 32> ReverseMask(NumElts);
55133 for (unsigned I = 0; I != NumElts; ++I)
55134 ReverseMask[I] = (NumElts - 1) - I;
55135 SDValue Rev =
55136 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55137 return DAG.getBitcast(VT, Rev);
55138 }
55139 }
55140
55141 return SDValue();
55142}
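// Example of the fold above (informal sketch, assuming a legal v16i1 mask %k
// and SSSE3): reversing the bits of the scalar is the same as reversing the
// i1 elements it was bitcast from, so
//   (i16 bitreverse (i16 bitcast %k))
//     -->  (i16 bitcast (v16i1 vector_shuffle %k, %k, <15,14,...,1,0>))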
55143
55144// Various combines to try to convert to avgceilu.
55145static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55146 TargetLowering::DAGCombinerInfo &DCI,
55147 const X86Subtarget &Subtarget) {
55148 unsigned Opcode = N->getOpcode();
55149 SDValue N0 = N->getOperand(0);
55150 SDValue N1 = N->getOperand(1);
55151 EVT VT = N->getValueType(0);
55152 EVT SVT = VT.getScalarType();
55153 SDLoc DL(N);
55154
55155 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55156 // Only useful on vXi8 which doesn't have good SRA handling.
55157 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55158 APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits());
55159 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55160 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55161 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55162 return DAG.getNode(ISD::XOR, DL, VT,
55163 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55164 }
55165
55166 return SDValue();
55167}
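// Numerical check of the sign-flip trick above (informal): for i8 inputs
// x = -1 (0xFF) and y = 0, avgceils(-1, 0) = (-1 + 0 + 1) >> 1 = 0. After
// xor'ing both inputs with 0x80 we get 0x7F and 0x80, and
// avgceilu(0x7F, 0x80) = (127 + 128 + 1) >> 1 = 128 = 0x80; xor'ing the
// result with 0x80 again gives 0, so the unsigned form computes the same
// value without needing a vXi8 arithmetic shift.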
55168
55169static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55170 TargetLowering::DAGCombinerInfo &DCI,
55171 const X86Subtarget &Subtarget) {
55172 EVT VT = N->getValueType(0);
55173 unsigned NumBits = VT.getSizeInBits();
55174
55175 // TODO - Constant Folding.
55176
55177 // Simplify the inputs.
55178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55179 APInt DemandedMask(APInt::getAllOnes(NumBits));
55180 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55181 return SDValue(N, 0);
55182
55183 return SDValue();
55184}
55185
55186static bool isNullFPScalarOrVectorConst(SDValue V) {
55187 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55188}
55189
55190/// If a value is a scalar FP zero or a vector FP zero (potentially including
55191/// undefined elements), return a zero constant that may be used to fold away
55192/// that value. In the case of a vector, the returned constant will not contain
55193/// undefined elements even if the input parameter does. This makes it suitable
55194/// to be used as a replacement operand with operations (eg, bitwise-and) where
55195/// an undef should not propagate.
55196static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55197 const X86Subtarget &Subtarget) {
55198 if (!isNullFPScalarOrVectorConst(V))
55199 return SDValue();
55200
55201 if (V.getValueType().isVector())
55202 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55203
55204 return V;
55205}
55206
55207static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55208 const X86Subtarget &Subtarget) {
55209 SDValue N0 = N->getOperand(0);
55210 SDValue N1 = N->getOperand(1);
55211 EVT VT = N->getValueType(0);
55212 SDLoc DL(N);
55213
55214 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55215 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55216 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55217 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55218 return SDValue();
55219
55220 auto isAllOnesConstantFP = [](SDValue V) {
55221 if (V.getSimpleValueType().isVector())
55222 return ISD::isBuildVectorAllOnes(V.getNode());
55223 auto *C = dyn_cast<ConstantFPSDNode>(V);
55224 return C && C->getConstantFPValue()->isAllOnesValue();
55225 };
55226
55227 // fand (fxor X, -1), Y --> fandn X, Y
55228 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55229 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55230
55231 // fand X, (fxor Y, -1) --> fandn Y, X
55232 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55233 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55234
55235 return SDValue();
55236}
55237
55238/// Do target-specific dag combines on X86ISD::FAND nodes.
55239static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55240 const X86Subtarget &Subtarget) {
55241 // FAND(0.0, x) -> 0.0
55242 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55243 return V;
55244
55245 // FAND(x, 0.0) -> 0.0
55246 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55247 return V;
55248
55249 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55250 return V;
55251
55252 return lowerX86FPLogicOp(N, DAG, Subtarget);
55253}
55254
55255/// Do target-specific dag combines on X86ISD::FANDN nodes.
55256static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55257 const X86Subtarget &Subtarget) {
55258 // FANDN(0.0, x) -> x
55259 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55260 return N->getOperand(1);
55261
55262 // FANDN(x, 0.0) -> 0.0
55263 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55264 return V;
55265
55266 return lowerX86FPLogicOp(N, DAG, Subtarget);
55267}
55268
55269/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55270static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55271 TargetLowering::DAGCombinerInfo &DCI,
55272 const X86Subtarget &Subtarget) {
55273 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55274
55275 // F[X]OR(0.0, x) -> x
55276 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55277 return N->getOperand(1);
55278
55279 // F[X]OR(x, 0.0) -> x
55280 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55281 return N->getOperand(0);
55282
55283 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55284 return NewVal;
55285
55286 return lowerX86FPLogicOp(N, DAG, Subtarget);
55287}
55288
55289/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55290static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55291 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55292
55293 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55294 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55295 !DAG.getTarget().Options.NoSignedZerosFPMath)
55296 return SDValue();
55297
55298 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55299 // into FMINC and FMAXC, which are commutative operations.
55300 unsigned NewOp = 0;
55301 switch (N->getOpcode()) {
55302 default: llvm_unreachable("unknown opcode");
55303 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55304 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55305 }
55306
55307 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55308 N->getOperand(0), N->getOperand(1));
55309}
55310
55311static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55312 const X86Subtarget &Subtarget) {
55313 EVT VT = N->getValueType(0);
55314 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55315 return SDValue();
55316
55317 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55318
55319 auto IsMinMaxLegal = [&](EVT VT) {
55320 if (!TLI.isTypeLegal(VT))
55321 return false;
55322 return VT.getScalarType() != MVT::f16 ||
55323 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55324 };
55325
55326 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55327 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55328 (Subtarget.hasFP16() && VT == MVT::f16) ||
55329 (VT.isVector() && IsMinMaxLegal(VT))))
55330 return SDValue();
55331
55332 SDValue Op0 = N->getOperand(0);
55333 SDValue Op1 = N->getOperand(1);
55334 SDLoc DL(N);
55335 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55336
55337 // If we don't have to respect NaN inputs, this is a direct translation to x86
55338 // min/max instructions.
55339 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55340 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55341
55342 // If one of the operands is known non-NaN use the native min/max instructions
55343 // with the non-NaN input as second operand.
55344 if (DAG.isKnownNeverNaN(Op1))
55345 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55346 if (DAG.isKnownNeverNaN(Op0))
55347 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55348
55349 // If we have to respect NaN inputs, this takes at least 3 instructions.
55350 // Favor a library call when operating on a scalar and minimizing code size.
55351 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55352 return SDValue();
55353
55354 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55355 VT);
55356
55357 // There are 4 possibilities involving NaN inputs, and these are the required
55358 // outputs:
55359 // Op1
55360 // Num NaN
55361 // ----------------
55362 // Num | Max | Op0 |
55363 // Op0 ----------------
55364 // NaN | Op1 | NaN |
55365 // ----------------
55366 //
55367 // The SSE FP max/min instructions were not designed for this case, but rather
55368 // to implement:
55369 // Min = Op1 < Op0 ? Op1 : Op0
55370 // Max = Op1 > Op0 ? Op1 : Op0
55371 //
55372 // So they always return Op0 if either input is a NaN. However, we can still
55373 // use those instructions for fmaxnum by selecting away a NaN input.
55374
55375 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55376 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55377 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55378
55379 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55380 // are NaN, the NaN value of Op1 is the result.
55381 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55382}
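// Rough shape of the scalar f32 fmaxnum expansion above (a sketch; the final
// instruction selection may differ):
//   %m  = X86ISD::FMAX %Op1, %Op0   // returns %Op0 if either input is NaN
//   %uo = setcc %Op0, %Op0, setuo   // true iff %Op0 is NaN
//   %r  = select %uo, %Op1, %m
// so a NaN in %Op0 selects %Op1, which matches the fmaxnum requirement that
// a single NaN operand yields the other (numeric) operand.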
55383
55384static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55385 TargetLowering::DAGCombinerInfo &DCI) {
55386 EVT VT = N->getValueType(0);
55387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55388
55389 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55390 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55391 return SDValue(N, 0);
55392
55393 // Convert a full vector load into vzload when not all bits are needed.
55394 SDValue In = N->getOperand(0);
55395 MVT InVT = In.getSimpleValueType();
55396 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55397 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55398 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55399 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55400 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55401 MVT MemVT = MVT::getIntegerVT(NumBits);
55402 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55403 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55404 SDLoc dl(N);
55405 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55406 DAG.getBitcast(InVT, VZLoad));
55407 DCI.CombineTo(N, Convert);
55408 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55409 DCI.recursivelyDeleteUnusedNodes(LN);
55410 return SDValue(N, 0);
55411 }
55412 }
55413
55414 return SDValue();
55415}
55416
55417static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55418 TargetLowering::DAGCombinerInfo &DCI) {
55419 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55420 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55421 EVT VT = N->getValueType(0);
55422
55423 // Convert a full vector load into vzload when not all bits are needed.
55424 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55425 MVT InVT = In.getSimpleValueType();
55426 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55427 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55428 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55429 LoadSDNode *LN = cast<LoadSDNode>(In);
55430 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55431 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55432 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55433 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55434 SDLoc dl(N);
55435 if (IsStrict) {
55436 SDValue Convert =
55437 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55438 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55439 DCI.CombineTo(N, Convert, Convert.getValue(1));
55440 } else {
55441 SDValue Convert =
55442 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55443 DCI.CombineTo(N, Convert);
55444 }
55445 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55446 DCI.recursivelyDeleteUnusedNodes(LN);
55447 return SDValue(N, 0);
55448 }
55449 }
55450
55451 return SDValue();
55452}
55453
55454/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55455static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55456 TargetLowering::DAGCombinerInfo &DCI,
55457 const X86Subtarget &Subtarget) {
55458 SDValue N0 = N->getOperand(0);
55459 SDValue N1 = N->getOperand(1);
55460 MVT VT = N->getSimpleValueType(0);
55461 int NumElts = VT.getVectorNumElements();
55462 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55463 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55464 SDLoc DL(N);
55465
55466 // ANDNP(undef, x) -> 0
55467 // ANDNP(x, undef) -> 0
55468 if (N0.isUndef() || N1.isUndef())
55469 return DAG.getConstant(0, DL, VT);
55470
55471 // ANDNP(0, x) -> x
55472 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55473 return N1;
55474
55475 // ANDNP(x, 0) -> 0
55476 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55477 return DAG.getConstant(0, DL, VT);
55478
55479 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55480 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55481 return DAG.getNOT(DL, N0, VT);
55482
55483 // Turn ANDNP back to AND if input is inverted.
55484 if (SDValue Not = IsNOT(N0, DAG))
55485 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55486
55487 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55488 // to make use of predicated selects.
55489 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55490 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55491 SDValue Src = N0.getOperand(0);
55492 EVT SrcVT = Src.getValueType();
55493 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55494 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55495 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55496 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55497 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55498 getZeroVector(VT, Subtarget, DAG, DL));
55499 }
55500
55501 // Constant Folding
55502 APInt Undefs0, Undefs1;
55503 SmallVector<APInt> EltBits0, EltBits1;
55504 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55505 /*AllowWholeUndefs*/ true,
55506 /*AllowPartialUndefs*/ true)) {
55507 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55508 /*AllowWholeUndefs*/ true,
55509 /*AllowPartialUndefs*/ true)) {
55510 SmallVector<APInt> ResultBits;
55511 for (int I = 0; I != NumElts; ++I)
55512 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55513 return getConstVector(ResultBits, VT, DAG, DL);
55514 }
55515
55516 // Constant fold NOT(N0) to allow us to use AND.
55517 // Ensure this is only performed if we can confirm that the bitcasted source
55518 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55519 if (N0->hasOneUse()) {
55520 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55521 if (BC0.getOpcode() != ISD::BITCAST) {
55522 for (APInt &Elt : EltBits0)
55523 Elt = ~Elt;
55524 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55525 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55526 }
55527 }
55528 }
55529
55530 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55531 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55532 SDValue Op(N, 0);
55533 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55534 return Res;
55535
55536 // If either operand is a constant mask, then only the elements that aren't
55537 // zero are actually demanded by the other operand.
55538 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55539 APInt UndefElts;
55540 SmallVector<APInt> EltBits;
55541 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55542 APInt DemandedElts = APInt::getAllOnes(NumElts);
55543 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55544 EltBits)) {
55545 DemandedBits.clearAllBits();
55546 DemandedElts.clearAllBits();
55547 for (int I = 0; I != NumElts; ++I) {
55548 if (UndefElts[I]) {
55549 // We can't assume an undef src element gives an undef dst - the
55550 // other src might be zero.
55551 DemandedBits.setAllBits();
55552 DemandedElts.setBit(I);
55553 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55554 (!Invert && !EltBits[I].isZero())) {
55555 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55556 DemandedElts.setBit(I);
55557 }
55558 }
55559 }
55560 return std::make_pair(DemandedBits, DemandedElts);
55561 };
55562 APInt Bits0, Elts0;
55563 APInt Bits1, Elts1;
55564 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55565 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55566
55567 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55568 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55569 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55570 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55571 if (N->getOpcode() != ISD::DELETED_NODE)
55572 DCI.AddToWorklist(N);
55573 return SDValue(N, 0);
55574 }
55575 }
55576
55577 // Folds for better commutativity:
55578 if (N1->hasOneUse()) {
55579 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55580 if (SDValue Not = IsNOT(N1, DAG))
55581 return DAG.getNOT(
55582 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55583
55584 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55585 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55586 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55587 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55588 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55589 EVT ShufVT = BC1.getValueType();
55590 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55591 DAG.getBitcast(ShufVT, N0));
55592 SDValue NewShuf =
55593 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55594 return DAG.getBitcast(VT, NewShuf);
55595 }
55596 }
55597 }
55598
55599 return SDValue();
55600}
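// A few concrete instances of the ANDNP folds above (informal), recalling
// that ANDNP(a, b) computes (~a & b):
//   ANDNP(x, all-ones)        --> NOT(x)
//   ANDNP(NOT(x), y)          --> AND(x, y)
//   ANDNP(<1, 2>, <3, 3>)     --> <~1 & 3, ~2 & 3> = <2, 1>   (constant fold)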
55601
55602static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55603 TargetLowering::DAGCombinerInfo &DCI) {
55604 SDValue N1 = N->getOperand(1);
55605
55606 // BT ignores high bits in the bit index operand.
55607 unsigned BitWidth = N1.getValueSizeInBits();
55608 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55609 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55610 if (N->getOpcode() != ISD::DELETED_NODE)
55611 DCI.AddToWorklist(N);
55612 return SDValue(N, 0);
55613 }
55614
55615 return SDValue();
55616}
55617
55618static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55619 TargetLowering::DAGCombinerInfo &DCI) {
55620 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55621 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55622
55623 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55625 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55626 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55627 if (N->getOpcode() != ISD::DELETED_NODE)
55628 DCI.AddToWorklist(N);
55629 return SDValue(N, 0);
55630 }
55631
55632 // Convert a full vector load into vzload when not all bits are needed.
55633 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55634 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55635 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55636 SDLoc dl(N);
55637 if (IsStrict) {
55638 SDValue Convert = DAG.getNode(
55639 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55640 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55641 DCI.CombineTo(N, Convert, Convert.getValue(1));
55642 } else {
55643 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55644 DAG.getBitcast(MVT::v8i16, VZLoad));
55645 DCI.CombineTo(N, Convert);
55646 }
55647
55648 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55650 return SDValue(N, 0);
55651 }
55652 }
55653 }
55654
55655 return SDValue();
55656}
55657
55658// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55659static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55660 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55661
55662 EVT DstVT = N->getValueType(0);
55663
55664 SDValue N0 = N->getOperand(0);
55665 SDValue N1 = N->getOperand(1);
55666 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55667
55668 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55669 return SDValue();
55670
55671 // Look through single use any_extends / truncs.
55672 SDValue IntermediateBitwidthOp;
55673 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55674 N0.hasOneUse()) {
55675 IntermediateBitwidthOp = N0;
55676 N0 = N0.getOperand(0);
55677 }
55678
55679 // See if we have a single use cmov.
55680 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55681 return SDValue();
55682
55683 SDValue CMovOp0 = N0.getOperand(0);
55684 SDValue CMovOp1 = N0.getOperand(1);
55685
55686 // Make sure both operands are constants.
55687 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55688 !isa<ConstantSDNode>(CMovOp1.getNode()))
55689 return SDValue();
55690
55691 SDLoc DL(N);
55692
55693 // If we looked through an any_extend/trunc above, add one to the constants.
55694 if (IntermediateBitwidthOp) {
55695 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55696 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55697 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55698 }
55699
55700 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55701 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55702
55703 EVT CMovVT = DstVT;
55704 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55705 if (DstVT == MVT::i16) {
55706 CMovVT = MVT::i32;
55707 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55708 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55709 }
55710
55711 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55712 N0.getOperand(2), N0.getOperand(3));
55713
55714 if (CMovVT != DstVT)
55715 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55716
55717 return CMov;
55718}
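// Worked example (informal): for
//   (sext_in_reg (i32 cmov 255, 1, cond), i8)
// both constants are sign-extended from i8 up front, producing
//   (i32 cmov -1, 1, cond)
// and the sext_in_reg disappears. For an i16 destination the cmov is widened
// to i32 and truncated afterwards, since i16 CMOVs are avoided above.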
55719
55720static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55721 const X86Subtarget &Subtarget) {
55722 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55723
55724 if (SDValue V = combineSextInRegCmov(N, DAG))
55725 return V;
55726
55727 EVT VT = N->getValueType(0);
55728 SDValue N0 = N->getOperand(0);
55729 SDValue N1 = N->getOperand(1);
55730 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55731 SDLoc dl(N);
55732
55733 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
55734 // both SSE and AVX2 since there is no sign-extended shift right
55735 // operation on a vector with 64-bit elements.
55736 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55737 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55738 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55739 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55740 SDValue N00 = N0.getOperand(0);
55741
55742 // EXTLOAD has a better solution on AVX2,
55743 // it may be replaced with X86ISD::VSEXT node.
55744 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55745 if (!ISD::isNormalLoad(N00.getNode()))
55746 return SDValue();
55747
55748 // Attempt to promote any comparison mask ops before moving the
55749 // SIGN_EXTEND_INREG in the way.
55750 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55751 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55752
55753 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55754 SDValue Tmp =
55755 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55756 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55757 }
55758 }
55759 return SDValue();
55760}
55761
55762/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55763/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55764/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55765/// opportunities to combine math ops, use an LEA, or use a complex addressing
55766/// mode. This can eliminate extend, add, and shift instructions.
55767static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55768 const X86Subtarget &Subtarget) {
55769 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55770 Ext->getOpcode() != ISD::ZERO_EXTEND)
55771 return SDValue();
55772
55773 // TODO: This should be valid for other integer types.
55774 EVT VT = Ext->getValueType(0);
55775 if (VT != MVT::i64)
55776 return SDValue();
55777
55778 SDValue Add = Ext->getOperand(0);
55779 if (Add.getOpcode() != ISD::ADD)
55780 return SDValue();
55781
55782 SDValue AddOp0 = Add.getOperand(0);
55783 SDValue AddOp1 = Add.getOperand(1);
55784 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55785 bool NSW = Add->getFlags().hasNoSignedWrap();
55786 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55787 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55788 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55789
55790 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55791 // into the 'zext'
55792 if ((Sext && !NSW) || (!Sext && !NUW))
55793 return SDValue();
55794
55795 // Having a constant operand to the 'add' ensures that we are not increasing
55796 // the instruction count because the constant is extended for free below.
55797 // A constant operand can also become the displacement field of an LEA.
55798 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55799 if (!AddOp1C)
55800 return SDValue();
55801
55802 // Don't make the 'add' bigger if there's no hope of combining it with some
55803 // other 'add' or 'shl' instruction.
55804 // TODO: It may be profitable to generate simpler LEA instructions in place
55805 // of single 'add' instructions, but the cost model for selecting an LEA
55806 // currently has a high threshold.
55807 bool HasLEAPotential = false;
55808 for (auto *User : Ext->users()) {
55809 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55810 HasLEAPotential = true;
55811 break;
55812 }
55813 }
55814 if (!HasLEAPotential)
55815 return SDValue();
55816
55817 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55818 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55819 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55820 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55821
55822 // The wider add is guaranteed to not wrap because both operands are
55823 // sign-extended.
55824 SDNodeFlags Flags;
55825 Flags.setNoSignedWrap(NSW);
55826 Flags.setNoUnsignedWrap(NUW);
55827 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55828}
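// Worked example (informal sketch): with an i32 value %x,
//   (i64 sext (add nsw i32 %x, 5))  -->  (add nsw (i64 sext %x), 5)
// The constant extends for free, and if some user is an add or shl the
// widened add can later fold into an LEA-style base + index*scale + disp
// addressing computation.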
55829
55830// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55831// operands and the result of CMOV is not used anywhere else - promote CMOV
55832// itself instead of promoting its result. This could be beneficial, because:
55833// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55834// (or more) pseudo-CMOVs only when they go one-after-another and
55835// getting rid of result extension code after CMOV will help that.
55836// 2) Promotion of constant CMOV arguments is free, hence the
55837// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55838// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55839// promotion is also good in terms of code size.
55840// (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55841// promotion).
55842static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55843 SDValue CMovN = Extend->getOperand(0);
55844 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55845 return SDValue();
55846
55847 EVT TargetVT = Extend->getValueType(0);
55848 unsigned ExtendOpcode = Extend->getOpcode();
55849 SDLoc DL(Extend);
55850
55851 EVT VT = CMovN.getValueType();
55852 SDValue CMovOp0 = CMovN.getOperand(0);
55853 SDValue CMovOp1 = CMovN.getOperand(1);
55854
55855 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55856 !isa<ConstantSDNode>(CMovOp1.getNode()))
55857 return SDValue();
55858
55859 // Only extend to i32 or i64.
55860 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55861 return SDValue();
55862
55863 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55864 // are free.
55865 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55866 return SDValue();
55867
55868 // If this a zero extend to i64, we should only extend to i32 and use a free
55869 // zero extend to finish.
55870 EVT ExtendVT = TargetVT;
55871 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55872 ExtendVT = MVT::i32;
55873
55874 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55875 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55876
55877 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55878 CMovN.getOperand(2), CMovN.getOperand(3));
55879
55880 // Finish extending if needed.
55881 if (ExtendVT != TargetVT)
55882 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55883
55884 return Res;
55885}
55886
55887// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55888// result type.
55889static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55890 const X86Subtarget &Subtarget) {
55891 SDValue N0 = N->getOperand(0);
55892 EVT VT = N->getValueType(0);
55893 SDLoc dl(N);
55894
55895 // Only do this combine with AVX512 for vector extends.
55896 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55897 return SDValue();
55898
55899 // Only combine legal element types.
55900 EVT SVT = VT.getVectorElementType();
55901 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55902 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55903 return SDValue();
55904
55905 // We don't have a CMPP instruction for vXf16
55906 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55907 return SDValue();
55908 // We can only do this if the vector size is 256 bits or less.
55909 unsigned Size = VT.getSizeInBits();
55910 if (Size > 256 && Subtarget.useAVX512Regs())
55911 return SDValue();
55912
55913 EVT N00VT = N0.getOperand(0).getValueType();
55914
55915 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55916 // those are the only integer compares we have.
55917 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55918 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55919 return SDValue();
55920
55921 // Only do this combine if the extension will be fully consumed by the setcc.
55922 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55923 if (Size != MatchingVecType.getSizeInBits())
55924 return SDValue();
55925
55926 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55927
55928 if (N->getOpcode() == ISD::ZERO_EXTEND)
55929 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55930
55931 return Res;
55932}
55933
55934static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55935 TargetLowering::DAGCombinerInfo &DCI,
55936 const X86Subtarget &Subtarget) {
55937 SDValue N0 = N->getOperand(0);
55938 EVT VT = N->getValueType(0);
55939 SDLoc DL(N);
55940
55941 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55942 if (!DCI.isBeforeLegalizeOps() &&
55943 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55944 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55945 N0->getOperand(1));
55946 bool ReplaceOtherUses = !N0.hasOneUse();
55947 DCI.CombineTo(N, Setcc);
55948 // Replace other uses with a truncate of the widened setcc_carry.
55949 if (ReplaceOtherUses) {
55950 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55951 N0.getValueType(), Setcc);
55952 DCI.CombineTo(N0.getNode(), Trunc);
55953 }
55954
55955 return SDValue(N, 0);
55956 }
55957
55958 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55959 return NewCMov;
55960
55961 if (!DCI.isBeforeLegalizeOps())
55962 return SDValue();
55963
55964 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55965 return V;
55966
55967 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55968 DAG, DCI, Subtarget))
55969 return V;
55970
55971 if (VT.isVector()) {
55972 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55973 return R;
55974
55975 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55976 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55977 }
55978
55979 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55980 return NewAdd;
55981
55982 return SDValue();
55983}
55984
55985// Inverting a constant vector is profitable if it can be eliminated and the
55986// inverted vector is already present in DAG. Otherwise, it will be loaded
55987// anyway.
55988//
55989// We determine which of the values can be completely eliminated and invert it.
55990// If both are eliminable, select a vector with the first negative element.
55993 "ConstantFP build vector expected");
55994 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
55995 // can eliminate it, since this function is invoked for each FMA with this
55996 // vector.
55997 auto IsNotFMA = [](SDNode *User) {
55998 return User->getOpcode() != ISD::FMA &&
55999 User->getOpcode() != ISD::STRICT_FMA;
56000 };
56001 if (llvm::any_of(V->users(), IsNotFMA))
56002 return SDValue();
56003
56004 SmallVector<SDValue, 8> Ops;
56005 EVT VT = V.getValueType();
56006 EVT EltVT = VT.getVectorElementType();
56007 for (const SDValue &Op : V->op_values()) {
56008 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56009 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56010 } else {
56011 assert(Op.isUndef());
56012 Ops.push_back(DAG.getUNDEF(EltVT));
56013 }
56014 }
56015
56016 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56017 if (!NV)
56018 return SDValue();
56019
56020 // If an inverted version cannot be eliminated, choose it instead of the
56021 // original version.
56022 if (llvm::any_of(NV->users(), IsNotFMA))
56023 return SDValue(NV, 0);
56024
56025 // If the inverted version also can be eliminated, we have to consistently
56026 // prefer one of the values. We prefer a constant with a negative value on
56027 // the first place.
56028 // N.B. We need to skip undefs that may precede a value.
56029 for (const SDValue &Op : V->op_values()) {
56030 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56031 if (Cst->isNegative())
56032 return SDValue();
56033 break;
56034 }
56035 }
56036 return SDValue(NV, 0);
56037}
56038
56039static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56040 TargetLowering::DAGCombinerInfo &DCI,
56041 const X86Subtarget &Subtarget) {
56042 SDLoc dl(N);
56043 EVT VT = N->getValueType(0);
56044 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56045 bool IsStrict = N->isTargetOpcode()
56046 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56047 : N->isStrictFPOpcode();
56048
56049 // Let legalize expand this if it isn't a legal type yet.
56050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56051 if (!TLI.isTypeLegal(VT))
56052 return SDValue();
56053
56054 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56055 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56056 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56057
56058 // If the operation allows fast-math and the target does not support FMA,
56059 // split this into mul+add to avoid libcall(s).
56060 SDNodeFlags Flags = N->getFlags();
56061 if (!IsStrict && Flags.hasAllowReassociation() &&
56062 TLI.isOperationExpand(ISD::FMA, VT)) {
56063 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56064 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56065 }
56066
56067 EVT ScalarVT = VT.getScalarType();
56068 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56069 !Subtarget.hasAnyFMA()) &&
56070 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56071 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56072 return SDValue();
56073
56074 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56075 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56076 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56077 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56078 CodeSize)) {
56079 V = NegV;
56080 return true;
56081 }
56082 // Look through extract_vector_elts. If it comes from an FNEG, create a
56083 // new extract from the FNEG input.
56084 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56085 isNullConstant(V.getOperand(1))) {
56086 SDValue Vec = V.getOperand(0);
56087 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56088 Vec, DAG, LegalOperations, CodeSize)) {
56089 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56090 NegV, V.getOperand(1));
56091 return true;
56092 }
56093 }
56094 // Lookup if there is an inverted version of constant vector V in DAG.
56095 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56096 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56097 V = NegV;
56098 return true;
56099 }
56100 }
56101 return false;
56102 };
56103
56104 // Do not convert the passthru input of scalar intrinsics.
56105 // FIXME: We could allow negations of the lower element only.
56106 bool NegA = invertIfNegative(A);
56107 // Create a dummy use for A so that in the process of negating B or C
56108 // recursively, it is not deleted.
56109 HandleSDNode NegAHandle(A);
56110 bool NegB = invertIfNegative(B);
56111 // Similar to A, get a handle on B.
56112 HandleSDNode NegBHandle(B);
56113 bool NegC = invertIfNegative(C);
56114
56115 if (!NegA && !NegB && !NegC)
56116 return SDValue();
56117
56118 unsigned NewOpcode =
56119 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56120
56121 // Propagate fast-math-flags to new FMA node.
56122 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56123 if (IsStrict) {
56124 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56125 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56126 {N->getOperand(0), A, B, C});
56127 } else {
56128 if (N->getNumOperands() == 4)
56129 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56130 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56131 }
56132}
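// Example of the negation folding above (informal sketch): if operand A is
// (fneg %a), its cheaper negated form is just %a, so
//   fma (fneg %a), %b, %c   -->  X86ISD::FNMADD %a, %b, %c
// and a negated C similarly turns an FMADD into an FMSUB. Strict variants go
// through the same path but keep their chain operand.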
56133
56134// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56135// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56136static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56137 TargetLowering::DAGCombinerInfo &DCI) {
56138 SDLoc dl(N);
56139 EVT VT = N->getValueType(0);
56140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56141 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56142 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56143
56144 SDValue N2 = N->getOperand(2);
56145
56146 SDValue NegN2 =
56147 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56148 if (!NegN2)
56149 return SDValue();
56150 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56151
56152 if (N->getNumOperands() == 4)
56153 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56154 NegN2, N->getOperand(3));
56155 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56156 NegN2);
56157}
56158
56159// Try to widen the build vector and bitcast it to the type of the zext.
56160// This is a special case for the 128-bit vector types. The intention is to
56161// remove the zext and replace it with a bitcast to the wider type. While
56162// lowering, the bitcast is removed and the extra computation due to the zext
56163// is avoided. For example:
56164// zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
56165// build_vector (x, 0, y, 0, z, 0, w, 0))
56166static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56167
56168 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56169 return SDValue();
56170
56171 EVT ExtendVT = Extend->getValueType(0);
56172
56173 SDValue BV = Extend->getOperand(0);
56174 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56175 return SDValue();
56176
56177 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56178 // If the build vector has undef elements, we cannot widen it.
56179 // The widening would create a vector with more undef elements, which
56180 // is not valid.
56181 return SDValue();
56182 }
56183
56184 if (!all_of(BV->op_values(),
56185 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56186 // If the build vector has any element other than ISD::LOAD, we cannot
56187 // widen it.
56188 return SDValue();
56189 }
56190
56191 SDLoc dl(BV);
56192 EVT VT = BV.getValueType();
56193 EVT EltVT = BV.getOperand(0).getValueType();
56194 unsigned NumElts = VT.getVectorNumElements();
56195
56196 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56197
56198 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56199 TargetLowering::TypeWidenVector)
56200 return SDValue();
56201
56202 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56203 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56204
56205 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56206 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56207 // Fill the new elements with Zero.
56208 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56209 // Compute the step to place the elements in the right place and control the
56210 // iteration.
56211 unsigned step = WidenNumElts / NumElts;
56212 if (WidenVT.is128BitVector()) {
56213 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56214 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56215 i--, j -= step) {
56216 SDValue temp = NewOps[i];
56217 NewOps[i] = NewOps[j];
56218 NewOps[j] = temp;
56219 }
56220 // Create new build vector with WidenVT and NewOps
56221 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56222 // Replace the old build vector with the new one. Bitcast the
56223 // new build vector to the type of the zext.
56224 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56225 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56226 return NewBV;
56227 }
56228 }
56229 return SDValue();
56230}
56231
56232static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56233 TargetLowering::DAGCombinerInfo &DCI,
56234 const X86Subtarget &Subtarget) {
56235 SDLoc dl(N);
56236 SDValue N0 = N->getOperand(0);
56237 EVT VT = N->getValueType(0);
56238
56239 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56240 // FIXME: Is this needed? We don't seem to have any tests for it.
56241 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56242 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56243 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56244 N0->getOperand(1));
56245 bool ReplaceOtherUses = !N0.hasOneUse();
56246 DCI.CombineTo(N, Setcc);
56247 // Replace other uses with a truncate of the widened setcc_carry.
56248 if (ReplaceOtherUses) {
56249 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56250 N0.getValueType(), Setcc);
56251 DCI.CombineTo(N0.getNode(), Trunc);
56252 }
56253
56254 return SDValue(N, 0);
56255 }
56256
56257 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56258 return NewCMov;
56259
56260 if (DCI.isBeforeLegalizeOps())
56261 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56262 return V;
56263
56264 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56265 DAG, DCI, Subtarget))
56266 return V;
56267
56268 if (VT.isVector())
56269 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56270 return R;
56271
56272 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56273 return NewAdd;
56274
56275 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56276 return R;
56277
56278 // TODO: Combine with any target/faux shuffle.
56279 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56280 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
56281 SDValue N00 = N0.getOperand(0);
56282 SDValue N01 = N0.getOperand(1);
56283 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56284 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56285 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56286 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56287 return concatSubVectors(N00, N01, DAG, dl);
56288 }
56289 }
56290
56291 if (SDValue V = widenBuildVec(N, DAG))
56292 return V;
56293
56294 return SDValue();
56295}
56296
56297/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56298/// pre-promote its result type since vXi1 vectors don't get promoted
56299/// during type legalization.
56300static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56301 SDValue RHS, ISD::CondCode CC,
56302 const SDLoc &DL, SelectionDAG &DAG,
56303 const X86Subtarget &Subtarget) {
56304 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56305 VT.getVectorElementType() == MVT::i1 &&
56306 (OpVT.getVectorElementType() == MVT::i8 ||
56307 OpVT.getVectorElementType() == MVT::i16)) {
56308 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56309 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56310 }
56311 return SDValue();
56312}
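// Example (informal sketch): on AVX512F without BWI, a v16i8 compare that
// wants a v16i1 result is emitted as
//   (v16i1 truncate (v16i8 setcc %a, %b, cc))
// i.e. the byte-sized compare result is produced first and then narrowed to
// the mask type, instead of letting type legalization promote the vXi1 setcc.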
56313
56314// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56315// eq/ne) is generated when using an integer as a mask. Instead of generating a
56316// broadcast + vptest, we can directly move the integer to a mask register.
56317static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56318 const SDLoc &DL, SelectionDAG &DAG,
56319 const X86Subtarget &Subtarget) {
56320 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56321 return SDValue();
56322
56323 if (!Subtarget.hasAVX512())
56324 return SDValue();
56325
56326 if (Op0.getOpcode() != ISD::AND)
56327 return SDValue();
56328
56329 SDValue Broadcast = Op0.getOperand(0);
56330 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56331 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56332 return SDValue();
56333
56334 SDValue Load = Op0.getOperand(1);
56335 EVT LoadVT = Load.getSimpleValueType();
56336
56337 APInt UndefElts;
56338 SmallVector<APInt, 32> EltBits;
56339 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56340 UndefElts, EltBits,
56341 /*AllowWholeUndefs*/ true,
56342 /*AllowPartialUndefs*/ false) ||
56343 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56344 return SDValue();
56345
56346 // Check if the constant pool contains only powers of 2 starting from some
56347 // 2^N. The table may also contain undefs because of widening of vector
56348 // operands.
56349 unsigned N = EltBits[0].logBase2();
56350 unsigned Len = UndefElts.getBitWidth();
56351 for (unsigned I = 1; I != Len; ++I) {
56352 if (UndefElts[I]) {
56353 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56354 return SDValue();
56355 break;
56356 }
56357
56358 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56359 return SDValue();
56360 }
56361
56362 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56363 SDValue BroadcastOp;
56364 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56365 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56366 Broadcast, DAG.getVectorIdxConstant(0, DL));
56367 } else {
56368 BroadcastOp = Broadcast.getOperand(0);
56369 if (BroadcastOp.getValueType().isVector())
56370 return SDValue();
56371 }
56372
56373 SDValue Masked = BroadcastOp;
56374 if (N != 0) {
56375 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56376 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56377
56378 if (NumDefinedElts > BroadcastOpBitWidth)
56379 return SDValue();
56380
56381 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56382 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56383 DAG.getConstant(N, DL, BroadcastOpVT));
56384 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56385 DAG.getConstant(Mask, DL, BroadcastOpVT));
56386 }
56387 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56388 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56389 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56390 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56391
56392 if (CC == ISD::SETEQ)
56393 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56394
56395 if (VT != MVT::v16i1)
56396 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56397 DAG.getVectorIdxConstant(0, DL));
56398
56399 return Bitcast;
56400}
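// Example of the pattern handled above (informal sketch): for a scalar %m,
//   setcc (and (broadcast %m), <1, 2, 4, 8, ...>), zero, ne
// tests one bit of %m per lane, which is exactly the contents of a mask
// register, so (after an optional shift when the table starts at 2^N) %m is
// moved straight into a k-register via the i16 -> v16i1 bitcast instead of
// materializing the broadcast and a vector compare.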
56401
56402static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56403 TargetLowering::DAGCombinerInfo &DCI,
56404 const X86Subtarget &Subtarget) {
56405 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56406 const SDValue LHS = N->getOperand(0);
56407 const SDValue RHS = N->getOperand(1);
56408 EVT VT = N->getValueType(0);
56409 EVT OpVT = LHS.getValueType();
56410 SDLoc DL(N);
56411
56412 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56413 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56414 Subtarget))
56415 return V;
56416 }
56417
56418 if (VT == MVT::i1) {
56419 X86::CondCode X86CC;
56420 if (SDValue V =
56421 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56422 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56423 }
56424
56425 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56426 if (OpVT.isScalarInteger()) {
56427 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56428 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56429 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56430 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56431 if (N0.getOperand(0) == N1)
56432 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56433 N0.getOperand(1));
56434 if (N0.getOperand(1) == N1)
56435 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56436 N0.getOperand(0));
56437 }
56438 return SDValue();
56439 };
56440 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56441 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56442 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56443 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56444
56445 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56446 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56447 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56448 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56449 if (N0.getOperand(0) == N1)
56450 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56451 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56452 if (N0.getOperand(1) == N1)
56453 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56454 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56455 }
56456 return SDValue();
56457 };
56458 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56459 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56460 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56461 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56462
56463 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56464 // cmpne(trunc(x),C) --> cmpne(x,C)
56465 // iff x upper bits are zero.
56466 if (LHS.getOpcode() == ISD::TRUNCATE &&
56467 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56469 EVT SrcVT = LHS.getOperand(0).getValueType();
56470 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56471 OpVT.getScalarSizeInBits());
56472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56473 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56474 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56475 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56476 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56477 }
56478
56479 // With C as a power of 2 and C != 0 and C != INT_MIN:
56480 // icmp eq Abs(X) C ->
56481 // (icmp eq A, C) | (icmp eq A, -C)
56482 // icmp ne Abs(X) C ->
56483 // (icmp ne A, C) & (icmp ne A, -C)
56484 // Both of these patterns can be better optimized in
56485 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56486 // integers which is checked above.
56487 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56488 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56489 const APInt &CInt = C->getAPIntValue();
56490 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56491 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56492 SDValue BaseOp = LHS.getOperand(0);
56493 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56494 SDValue SETCC1 = DAG.getSetCC(
56495 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56496 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56497 SETCC0, SETCC1);
56498 }
56499 }
56500 }
56501 }
56502 }
56503
56504 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56505 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56506 // Using temporaries to avoid messing up operand ordering for later
56507 // transformations if this doesn't work.
56508 SDValue Op0 = LHS;
56509 SDValue Op1 = RHS;
56510 ISD::CondCode TmpCC = CC;
56511 // Put build_vector on the right.
56512 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56513 std::swap(Op0, Op1);
56514 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56515 }
56516
56517 bool IsSEXT0 =
56518 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56519 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56520 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56521
56522 if (IsSEXT0 && IsVZero1) {
56523 assert(VT == Op0.getOperand(0).getValueType() &&
56524 "Unexpected operand type");
56525 if (TmpCC == ISD::SETGT)
56526 return DAG.getConstant(0, DL, VT);
56527 if (TmpCC == ISD::SETLE)
56528 return DAG.getConstant(1, DL, VT);
56529 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56530 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56531
56532 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56533 "Unexpected condition code!");
56534 return Op0.getOperand(0);
56535 }
56536
56537 if (IsVZero1)
56538 if (SDValue V =
56539 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56540 return V;
56541 }
56542
56543 // Try and make an unsigned vector comparison signed. On pre-AVX512 targets
56544 // there only are signed comparisons (`PCMPGT`), and on AVX512 it's often
56545 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56546 // it's going to a mask, there are signed AVX512 comparisons).
56547 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56548 bool CanMakeSigned = false;
56549 if (ISD::isUnsignedIntSetCC(CC)) {
56550 KnownBits CmpKnown =
56551 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56552 // If we know LHS/RHS share the same sign bit at each element we can
56553 // make this signed.
56554 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56555 // across all lanes. So a pattern where the sign varies from lane to
56556 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56557 // missed. We could get around this by demanding each lane
56558 // independently, but this isn't the most important optimization and
56559 // that may eat into compile time.
56560 CanMakeSigned =
56561 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56562 }
56563 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56564 SDValue LHSOut = LHS;
56565 SDValue RHSOut = RHS;
56566 ISD::CondCode NewCC = CC;
56567 switch (CC) {
56568 case ISD::SETGE:
56569 case ISD::SETUGE:
56570 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56571 /*NSW*/ true))
56572 LHSOut = NewLHS;
56573 else if (SDValue NewRHS = incDecVectorConstant(
56574 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56575 RHSOut = NewRHS;
56576 else
56577 break;
56578
56579 [[fallthrough]];
56580 case ISD::SETUGT:
56581 NewCC = ISD::SETGT;
56582 break;
56583
56584 case ISD::SETLE:
56585 case ISD::SETULE:
56586 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56587 /*NSW*/ true))
56588 LHSOut = NewLHS;
56589 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56590 /*NSW*/ true))
56591 RHSOut = NewRHS;
56592 else
56593 break;
56594
56595 [[fallthrough]];
56596 case ISD::SETULT:
56597 // Will be swapped to SETGT in LowerVSETCC*.
56598 NewCC = ISD::SETLT;
56599 break;
56600 default:
56601 break;
56602 }
56603 if (NewCC != CC) {
56604 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56605 NewCC, DL, DAG, Subtarget))
56606 return R;
56607 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56608 }
56609 }
56610 }
56611
56612 if (SDValue R =
56613 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56614 return R;
56615
56616 // In the middle end transforms:
56617 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56618 // -> `(icmp ult (add x, -C), 2)`
56619 // Likewise inverted cases with `ugt`.
56620 //
56621 // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
56622 // in worse codegen. So, undo the middle-end transform and go back to `(or
56623 // (icmp eq), (icmp eq))` form.
56624 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56625 // the xmm approach.
56626 //
56627 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56628 // ne))` as it doesn't end up instruction positive.
56629 // TODO: We might want to do this for avx512 as well if we `sext` the result.
56630 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56631 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56632 !Subtarget.hasAVX512() &&
56633 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56634 Subtarget.hasAVX2()) &&
56635 LHS.hasOneUse()) {
56636
56637 APInt CmpC;
56638 SDValue AddC = LHS.getOperand(1);
56639 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56641 // See which form we have depending on the constant/condition.
56642 SDValue C0 = SDValue();
56643 SDValue C1 = SDValue();
56644
56645 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56646 // we will end up generating an additional constant. Keeping it in the
56647 // current form has a slight latency cost, but it's probably worth saving a
56648 // constant.
56651 // Pass
56652 }
56653 // Normal Cases
56654 else if ((CC == ISD::SETULT && CmpC == 2) ||
56655 (CC == ISD::SETULE && CmpC == 1)) {
56656 // These will constant fold.
56657 C0 = DAG.getNegative(AddC, DL, OpVT);
56658 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56659 DAG.getAllOnesConstant(DL, OpVT));
56660 }
56661 // Inverted Cases
56662 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56663 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56664 // These will constant fold.
56665 C0 = DAG.getNOT(DL, AddC, OpVT);
56666 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56667 DAG.getAllOnesConstant(DL, OpVT));
56668 }
56669 if (C0 && C1) {
56670 SDValue NewLHS =
56671 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56672 SDValue NewRHS =
56673 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56674 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56675 }
56676 }
56677 }
56678
56679 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56680 // to avoid scalarization via legalization because v4i32 is not a legal type.
56681 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56682 LHS.getValueType() == MVT::v4f32)
56683 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56684
56685 // X pred 0.0 --> X pred -X
56686 // If the negation of X already exists, use it in the comparison. This removes
56687 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56688 // instructions in patterns with a 'select' node.
56690 SDVTList FNegVT = DAG.getVTList(OpVT);
56691 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56692 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56693 }
56694
56695 return SDValue();
56696}
56697
56698static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56699 TargetLowering::DAGCombinerInfo &DCI,
56700 const X86Subtarget &Subtarget) {
56701 SDValue Src = N->getOperand(0);
56702 MVT SrcVT = Src.getSimpleValueType();
56703 MVT VT = N->getSimpleValueType(0);
56704 unsigned NumBits = VT.getScalarSizeInBits();
56705 unsigned NumElts = SrcVT.getVectorNumElements();
56706 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56707 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56708
56709 // Perform constant folding.
56710 APInt UndefElts;
56711 SmallVector<APInt, 32> EltBits;
56712 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56713 /*AllowWholeUndefs*/ true,
56714 /*AllowPartialUndefs*/ true)) {
56715 APInt Imm(32, 0);
56716 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56717 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56718 Imm.setBit(Idx);
56719
56720 return DAG.getConstant(Imm, SDLoc(N), VT);
56721 }
56722
56723 // Look through int->fp bitcasts that don't change the element width.
56724 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56725 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56726 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56727 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56728
56729 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56730 // with scalar comparisons.
56731 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56732 SDLoc DL(N);
56733 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56734 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56735 return DAG.getNode(ISD::XOR, DL, VT,
56736 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56737 DAG.getConstant(NotMask, DL, VT));
56738 }
56739
56740 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56741 // results with scalar comparisons.
56742 if (Src.getOpcode() == X86ISD::PCMPGT &&
56743 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56744 SDLoc DL(N);
56745 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56746 return DAG.getNode(ISD::XOR, DL, VT,
56747 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56748 DAG.getConstant(NotMask, DL, VT));
56749 }
56750
56751 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56752 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56753 // iff pow2splat(c1).
56754 // Use KnownBits to determine if only a single bit is non-zero
56755 // in each element (pow2 or zero), and shift that bit to the msb.
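  // Illustrative example (hypothetical constants): with c1 = splat(8) in
  // v4i32, each element of and(x,8) has at most bit 3 set, so
  //   movmsk(pcmpeq(and(x,8), 8))
  // becomes a shift of bit 3 into the sign bit (shl by 28) followed by an XOR
  // with the shifted RHS and a NOT; MOVMSK only reads those sign bits.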
56756 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56757 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56758 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56759 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56760 if (KnownLHS.countMaxPopulation() == 1 &&
56761 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56762 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56763 SDLoc DL(N);
56764 MVT ShiftVT = SrcVT;
56765 SDValue ShiftLHS = Src.getOperand(0);
56766 SDValue ShiftRHS = Src.getOperand(1);
56767 if (ShiftVT.getScalarType() == MVT::i8) {
56768 // vXi8 shifts - we only care about the sign bit so we can use PSLLW.
56769 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56770 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56771 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56772 }
56773 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56774 ShiftLHS, ShiftAmt, DAG);
56775 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56776 ShiftRHS, ShiftAmt, DAG);
56777 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56778 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56779 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56780 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56781 }
56782 }
56783
56784 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
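  // Illustrative example (hypothetical constant): for v4i32,
  //   movmsk(or(X, <0x80000000,0,0,0>)) -> or(movmsk(X), 0b0001)
  // since only the constant's sign bits can influence the resulting mask.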
56785 if (N->isOnlyUserOf(Src.getNode())) {
56786 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
56787 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56788 APInt UndefElts;
56789 SmallVector<APInt, 32> EltBits;
56790 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56791 UndefElts, EltBits)) {
56792 APInt Mask = APInt::getZero(NumBits);
56793 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56794 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56795 Mask.setBit(Idx);
56796 }
56797 SDLoc DL(N);
56798 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56799 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56800 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56801 DAG.getConstant(Mask, DL, VT));
56802 }
56803 }
56804 }
56805
56806 // Simplify the inputs.
56807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56808 APInt DemandedMask(APInt::getAllOnes(NumBits));
56809 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56810 return SDValue(N, 0);
56811
56812 return SDValue();
56813}
56814
56815static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
56816 TargetLowering::DAGCombinerInfo &DCI,
56817 const X86Subtarget &Subtarget) {
56818 MVT VT = N->getSimpleValueType(0);
56819 unsigned NumBits = VT.getScalarSizeInBits();
56820
56821 // Simplify the inputs.
56822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56823 APInt DemandedMask(APInt::getAllOnes(NumBits));
56824 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56825 return SDValue(N, 0);
56826
56827 return SDValue();
56828}
56829
56830static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
56831 TargetLowering::DAGCombinerInfo &DCI) {
56832 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
56833 SDValue Mask = MemOp->getMask();
56834
56835 // With vector masks we only demand the upper bit of the mask.
56836 if (Mask.getScalarValueSizeInBits() != 1) {
56837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56838 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56839 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56840 if (N->getOpcode() != ISD::DELETED_NODE)
56841 DCI.AddToWorklist(N);
56842 return SDValue(N, 0);
56843 }
56844 }
56845
56846 return SDValue();
56847}
56848
56849static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
56850 SDValue Index, SDValue Base, SDValue Scale,
56851 SelectionDAG &DAG) {
56852 SDLoc DL(GorS);
56853
56854 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56855 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56856 Gather->getMask(), Base, Index, Scale } ;
56857 return DAG.getMaskedGather(Gather->getVTList(),
56858 Gather->getMemoryVT(), DL, Ops,
56859 Gather->getMemOperand(),
56860 Gather->getIndexType(),
56861 Gather->getExtensionType());
56862 }
56863 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56864 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56865 Scatter->getMask(), Base, Index, Scale };
56866 return DAG.getMaskedScatter(Scatter->getVTList(),
56867 Scatter->getMemoryVT(), DL,
56868 Ops, Scatter->getMemOperand(),
56869 Scatter->getIndexType(),
56870 Scatter->isTruncatingStore());
56871}
56872
56873static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
56874 TargetLowering::DAGCombinerInfo &DCI) {
56875 SDLoc DL(N);
56876 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56877 SDValue Index = GorS->getIndex();
56878 SDValue Base = GorS->getBasePtr();
56879 SDValue Scale = GorS->getScale();
56880 EVT IndexVT = Index.getValueType();
56881 EVT IndexSVT = IndexVT.getVectorElementType();
56882 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56884 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56885
56886 if (DCI.isBeforeLegalize()) {
56887 // Attempt to move shifted index into the address scale, allows further
56888 // index truncation below.
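      // Illustrative example (assumed operands): an index of (shl %x, 1) with
      // scale 1 can become %x (shifted by 0) with scale 2, provided %x keeps
      // more than one sign bit so the scaled address is unchanged.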
56889 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56890 isa<ConstantSDNode>(Scale)) {
56891 unsigned ScaleAmt = Scale->getAsZExtVal();
56892 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56893 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56894 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56895 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56896 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56897 if (N->getOpcode() != ISD::DELETED_NODE)
56898 DCI.AddToWorklist(N);
56899 return SDValue(N, 0);
56900 }
56901 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56902 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56903 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56904 SDValue ShAmt = Index.getOperand(1);
56905 SDValue NewShAmt =
56906 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56907 DAG.getConstant(1, DL, ShAmt.getValueType()));
56908 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56909 Index.getOperand(0), NewShAmt);
56910 SDValue NewScale =
56911 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56912 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56913 }
56914 }
56915 }
56916
56917 // Shrink indices if they are larger than 32-bits.
56918 // Only do this before legalize types since v2i64 could become v2i32.
56919 // FIXME: We could check that the type is legal if we're after legalize
56920 // types, but then we would need to construct test cases where that happens.
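    // Illustrative example (assumed types): a v4i64 index built by
    // (sign_extend v4i32 %idx) has at least 33 sign bits, so it can be
    // truncated back to v4i32 and used by a 32-bit-index gather directly.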
56921 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56922 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56923
56924 // FIXME: We could support more than just constant fold, but we need to be
56925 // careful with costing. A truncate that can be optimized out would be
56926 // fine. Otherwise we might only want to create a truncate if it avoids
56927 // a split.
56928 if (SDValue TruncIndex =
56929 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56930 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56931
56932 // Shrink any sign/zero extends from a type of 32 bits or smaller to a type
56933 // larger than 32 bits if there are sufficient sign bits. Only do this before
56934 // legalize types to avoid creating illegal types in truncate.
56935 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56936 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56937 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56938 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56939 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56940 }
56941
56942 // Shrink if we remove an illegal type.
56943 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56944 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56945 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56946 }
56947 }
56948 }
56949
56950 // Try to move splat adders from the index operand to the base
56951 // pointer operand, taking care to multiply by the scale. We can only do
56952 // this when the index element type is the same as the pointer type.
56953 // Otherwise we need to be sure the math doesn't wrap before the scale.
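  // Illustrative example (assumed operands): for a gather with scale 4 and
  // index (add %idx, splat(16)), the splat folds into the base pointer as
  // base + 16*4, leaving plain %idx as the index.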
56954 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56955 isa<ConstantSDNode>(Scale)) {
56956 uint64_t ScaleAmt = Scale->getAsZExtVal();
56957
56958 for (unsigned I = 0; I != 2; ++I)
56959 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56960 BitVector UndefElts;
56961 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56962 if (UndefElts.none()) {
56963 // If the splat value is constant we can add the scaled splat value
56964 // to the existing base.
56965 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56966 APInt Adder = C->getAPIntValue() * ScaleAmt;
56967 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56968 DAG.getConstant(Adder, DL, PtrVT));
56969 SDValue NewIndex = Index.getOperand(1 - I);
56970 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56971 }
56972 // For non-constant cases, limit this to non-scaled cases.
56973 if (ScaleAmt == 1) {
56974 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56975 SDValue NewIndex = Index.getOperand(1 - I);
56976 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56977 }
56978 }
56979 }
56980 // It's also possible base is just a constant. In that case, just
56981 // replace it with 0 and move the displacement into the index.
56982 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56983 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56984 // Combine the constant build_vector and the constant base.
56985 Splat =
56986 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56987 // Add to the other half of the original Index add.
56988 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56989 Index.getOperand(1 - I), Splat);
56990 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56991 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56992 }
56993 }
56994 }
56995
56996 if (DCI.isBeforeLegalizeOps()) {
56997 // Make sure the index is either i32 or i64
56998 if (IndexWidth != 32 && IndexWidth != 64) {
56999 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57000 IndexVT = IndexVT.changeVectorElementType(EltVT);
57001 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57002 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57003 }
57004 }
57005
57006 // With vector masks we only demand the upper bit of the mask.
57007 SDValue Mask = GorS->getMask();
57008 if (Mask.getScalarValueSizeInBits() != 1) {
57009 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57010 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57011 if (N->getOpcode() != ISD::DELETED_NODE)
57012 DCI.AddToWorklist(N);
57013 return SDValue(N, 0);
57014 }
57015 }
57016
57017 return SDValue();
57018}
57019
57020// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57021static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
57022 const X86Subtarget &Subtarget) {
57023 SDLoc DL(N);
57024 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57025 SDValue EFLAGS = N->getOperand(1);
57026
57027 // Try to simplify the EFLAGS and condition code operands.
57028 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57029 return getSETCC(CC, Flags, DL, DAG);
57030
57031 return SDValue();
57032}
57033
57034/// Optimize branch condition evaluation.
57035static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
57036 const X86Subtarget &Subtarget) {
57037 SDLoc DL(N);
57038 SDValue EFLAGS = N->getOperand(3);
57039 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57040
57041 // Try to simplify the EFLAGS and condition code operands.
57042 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57043 // RAUW them under us.
57044 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57045 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57046 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57047 N->getOperand(1), Cond, Flags);
57048 }
57049
57050 return SDValue();
57051}
57052
57053// TODO: Could we move this to DAGCombine?
57054static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
57055 SelectionDAG &DAG) {
57056 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57057 // to optimize away operation when it's from a constant.
57058 //
57059 // The general transformation is:
57060 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57061 // AND(VECTOR_CMP(x,y), constant2)
57062 // constant2 = UNARYOP(constant)
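  // Illustrative example (assumed types): for v4i32 -> v4f32,
  //   sint_to_fp(and(setcc(x,y), splat(1))) -->
  //   and(setcc(x,y), bitcast(splat(1.0f)))
  // because each compare lane is 0 or -1, so the AND picks 0.0 or 1.0 per
  // lane either way.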
57063
57064 // Early exit if this isn't a vector operation, the operand of the
57065 // unary operation isn't a bitwise AND, or if the sizes of the operations
57066 // aren't the same.
57067 EVT VT = N->getValueType(0);
57068 bool IsStrict = N->isStrictFPOpcode();
57069 unsigned NumEltBits = VT.getScalarSizeInBits();
57070 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57071 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57072 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57073 VT.getSizeInBits() != Op0.getValueSizeInBits())
57074 return SDValue();
57075
57076 // Now check that the other operand of the AND is a constant. We could
57077 // make the transformation for non-constant splats as well, but it's unclear
57078 // that would be a benefit as it would not eliminate any operations, just
57079 // perform one more step in scalar code before moving to the vector unit.
57080 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57081 // Bail out if the vector isn't a constant.
57082 if (!BV->isConstant())
57083 return SDValue();
57084
57085 // Everything checks out. Build up the new and improved node.
57086 SDLoc DL(N);
57087 EVT IntVT = BV->getValueType(0);
57088 // Create a new constant of the appropriate type for the transformed
57089 // DAG.
57090 SDValue SourceConst;
57091 if (IsStrict)
57092 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57093 {N->getOperand(0), SDValue(BV, 0)});
57094 else
57095 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57096 // The AND node needs bitcasts to/from an integer vector type around it.
57097 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57098 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57099 MaskConst);
57100 SDValue Res = DAG.getBitcast(VT, NewAnd);
57101 if (IsStrict)
57102 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57103 return Res;
57104 }
57105
57106 return SDValue();
57107}
57108
57109/// If we are converting a value to floating-point, try to replace scalar
57110/// truncate of an extracted vector element with a bitcast. This tries to keep
57111/// the sequence on XMM registers rather than moving between vector and GPRs.
57112static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
57113 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57114 // to allow being called by any similar cast opcode.
57115 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57116 SDValue Trunc = N->getOperand(0);
57117 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57118 return SDValue();
57119
57120 SDValue ExtElt = Trunc.getOperand(0);
57121 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57122 !isNullConstant(ExtElt.getOperand(1)))
57123 return SDValue();
57124
57125 EVT TruncVT = Trunc.getValueType();
57126 EVT SrcVT = ExtElt.getValueType();
57127 unsigned DestWidth = TruncVT.getSizeInBits();
57128 unsigned SrcWidth = SrcVT.getSizeInBits();
57129 if (SrcWidth % DestWidth != 0)
57130 return SDValue();
57131
57132 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
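  // Illustrative example (assumed types): with X : v2i64,
  //   sint_to_fp(trunc i64->i32 (extractelt X, 0)) -->
  //   sint_to_fp(extractelt (bitcast X to v4i32), 0)
  // which keeps the value in an XMM register instead of a GPR round trip.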
57133 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57134 unsigned VecWidth = SrcVecVT.getSizeInBits();
57135 unsigned NumElts = VecWidth / DestWidth;
57136 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57137 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57138 SDLoc DL(N);
57139 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57140 BitcastVec, ExtElt.getOperand(1));
57141 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57142}
57143
57144static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
57145 const X86Subtarget &Subtarget) {
57146 bool IsStrict = N->isStrictFPOpcode();
57147 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57148 EVT VT = N->getValueType(0);
57149 EVT InVT = Op0.getValueType();
57150
57151 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57152 // for it. Therefore, for type sizes of 32 bits or smaller, just go with i32.
57153 // if hasFP16 support:
57154 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57155 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57156 // else
57157 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57158 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57159 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57160 unsigned ScalarSize = InVT.getScalarSizeInBits();
57161 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57162 ScalarSize >= 64)
57163 return SDValue();
57164 SDLoc dl(N);
57165 EVT DstVT =
57166 EVT::getVectorVT(*DAG.getContext(),
57167 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57168 : ScalarSize < 32 ? MVT::i32
57169 : MVT::i64,
57170 InVT.getVectorNumElements());
57171 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57172 if (IsStrict)
57173 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57174 {N->getOperand(0), P});
57175 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57176 }
57177
57178 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57179 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57180 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57181 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57182 VT.getScalarType() != MVT::f16) {
57183 SDLoc dl(N);
57184 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57185 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57186
57187 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57188 if (IsStrict)
57189 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57190 {N->getOperand(0), P});
57191 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57192 }
57193
57194 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
57195 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57196 // the optimization here.
57197 SDNodeFlags Flags = N->getFlags();
57198 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57199 if (IsStrict)
57200 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57201 {N->getOperand(0), Op0});
57202 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57203 }
57204
57205 return SDValue();
57206}
57207
57208static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
57209 TargetLowering::DAGCombinerInfo &DCI,
57210 const X86Subtarget &Subtarget) {
57211 // First try to optimize away the conversion entirely when it's
57212 // conditionally from a constant. Vectors only.
57213 bool IsStrict = N->isStrictFPOpcode();
57214 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
57215 return Res;
57216
57217 // Now move on to more general possibilities.
57218 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57219 EVT VT = N->getValueType(0);
57220 EVT InVT = Op0.getValueType();
57221
57222 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57223 // for it. Therefore, for type sizes of 32 bits or smaller, just go with i32.
57224 // if hasFP16 support:
57225 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57226 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57227 // else
57228 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57229 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57230 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57231 unsigned ScalarSize = InVT.getScalarSizeInBits();
57232 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57233 ScalarSize >= 64)
57234 return SDValue();
57235 SDLoc dl(N);
57236 EVT DstVT =
57237 EVT::getVectorVT(*DAG.getContext(),
57238 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57239 : ScalarSize < 32 ? MVT::i32
57240 : MVT::i64,
57241 InVT.getVectorNumElements());
57242 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57243 if (IsStrict)
57244 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57245 {N->getOperand(0), P});
57246 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57247 }
57248
57249 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57250 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57251 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57252 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57253 VT.getScalarType() != MVT::f16) {
57254 SDLoc dl(N);
57255 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57256 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57257 if (IsStrict)
57258 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57259 {N->getOperand(0), P});
57260 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57261 }
57262
57263 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57264 // vectors and scalars, see if we know that the upper bits are all the sign
57265 // bit, in which case we can truncate the input to i32 and convert from that.
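  // Illustrative example (assumed operand): if the i64 input is itself
  // (sign_extend i32 %x) it has 33 sign bits, so converting
  // (trunc i64->i32) instead avoids the slower 64-bit path.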
57266 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57267 unsigned BitWidth = InVT.getScalarSizeInBits();
57268 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57269 if (NumSignBits >= (BitWidth - 31)) {
57270 EVT TruncVT = MVT::i32;
57271 if (InVT.isVector())
57272 TruncVT = InVT.changeVectorElementType(TruncVT);
57273 SDLoc dl(N);
57274 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57275 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57276 if (IsStrict)
57277 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57278 {N->getOperand(0), Trunc});
57279 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57280 }
57281 // If we're after legalize and the type is v2i32 we need to shuffle and
57282 // use CVTSI2P.
57283 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57284 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57285 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57286 { 0, 2, -1, -1 });
57287 if (IsStrict)
57288 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57289 {N->getOperand(0), Shuf});
57290 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57291 }
57292 }
57293
57294 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57295 // a 32-bit target where SSE doesn't support i64->FP operations.
57296 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57297 Op0.getOpcode() == ISD::LOAD) {
57298 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57299
57300 // This transformation is not supported if the result type is f16 or f128.
57301 if (VT == MVT::f16 || VT == MVT::f128)
57302 return SDValue();
57303
57304 // If we have AVX512DQ we can use packed conversion instructions unless
57305 // the VT is f80.
57306 if (Subtarget.hasDQI() && VT != MVT::f80)
57307 return SDValue();
57308
57309 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57310 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57311 std::pair<SDValue, SDValue> Tmp =
57312 Subtarget.getTargetLowering()->BuildFILD(
57313 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57314 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57315 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57316 return Tmp.first;
57317 }
57318 }
57319
57320 if (IsStrict)
57321 return SDValue();
57322
57323 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57324 return V;
57325
57326 return SDValue();
57327}
57328
57330 const X86Subtarget &Subtarget) {
57331 EVT VT = N->getValueType(0);
57332 SDValue Src = N->getOperand(0);
57333 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57334 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57335 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57336
57337 return SDValue();
57338}
57339
57340// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57342 const X86Subtarget &Subtarget) {
57343 if (!Subtarget.hasAVX10_2())
57344 return SDValue();
57345
57346 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57347 EVT SrcVT = N->getOperand(0).getValueType();
57348 EVT DstVT = N->getValueType(0);
57349 SDLoc dl(N);
57350
57351 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57352 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57353
57354 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57355 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57356 N->getOperand(0), V2F32Value);
57357
57358 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57359 if (IsSigned)
57360 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57361
57362 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57363 }
57364 return SDValue();
57365}
57366
57367static bool needCarryOrOverflowFlag(SDValue Flags) {
57368 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57369
57370 for (const SDNode *User : Flags->users()) {
57371 X86::CondCode CC;
57372 switch (User->getOpcode()) {
57373 default:
57374 // Be conservative.
57375 return true;
57376 case X86ISD::SETCC:
57377 case X86ISD::SETCC_CARRY:
57378 CC = (X86::CondCode)User->getConstantOperandVal(0);
57379 break;
57380 case X86ISD::BRCOND:
57381 case X86ISD::CMOV:
57382 CC = (X86::CondCode)User->getConstantOperandVal(2);
57383 break;
57384 }
57385
57386 switch (CC) {
57387 // clang-format off
57388 default: break;
57389 case X86::COND_A: case X86::COND_AE:
57390 case X86::COND_B: case X86::COND_BE:
57391 case X86::COND_O: case X86::COND_NO:
57392 case X86::COND_G: case X86::COND_GE:
57393 case X86::COND_L: case X86::COND_LE:
57394 return true;
57395 // clang-format on
57396 }
57397 }
57398
57399 return false;
57400}
57401
57402static bool onlyZeroFlagUsed(SDValue Flags) {
57403 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57404
57405 for (const SDNode *User : Flags->users()) {
57406 unsigned CCOpNo;
57407 switch (User->getOpcode()) {
57408 default:
57409 // Be conservative.
57410 return false;
57411 case X86ISD::SETCC:
57412 case X86ISD::SETCC_CARRY:
57413 CCOpNo = 0;
57414 break;
57415 case X86ISD::BRCOND:
57416 case X86ISD::CMOV:
57417 CCOpNo = 2;
57418 break;
57419 }
57420
57421 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57422 if (CC != X86::COND_E && CC != X86::COND_NE)
57423 return false;
57424 }
57425
57426 return true;
57427}
57428
57429static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
57430 TargetLowering::DAGCombinerInfo &DCI,
57431 const X86Subtarget &Subtarget) {
57432 // Only handle test patterns.
57433 if (!isNullConstant(N->getOperand(1)))
57434 return SDValue();
57435
57436 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57437 // and use its flags directly.
57438 // TODO: Maybe we should try promoting compares that only use the zero flag
57439 // first if we can prove the upper bits with computeKnownBits?
57440 SDLoc dl(N);
57441 SDValue Op = N->getOperand(0);
57442 EVT VT = Op.getValueType();
57443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57444
57445 if (SDValue CMP =
57446 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57447 return CMP;
57448
57449 // If we have a constant logical shift that's only used in a comparison
57450 // against zero turn it into an equivalent AND. This allows turning it into
57451 // a TEST instruction later.
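  // Illustrative example (assumed constant): (srl %x, 28) == 0 only tests the
  // top four bits, so it becomes (%x & 0xF0000000) == 0, which isel can then
  // match as a single TEST instruction.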
57452 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57453 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57454 onlyZeroFlagUsed(SDValue(N, 0))) {
57455 unsigned BitWidth = VT.getSizeInBits();
57456 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57457 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57458 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57459 APInt Mask = Op.getOpcode() == ISD::SRL
57460 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57461 : APInt::getLowBitsSet(BitWidth, MaskBits);
57462 if (Mask.isSignedIntN(32)) {
57463 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57464 DAG.getConstant(Mask, dl, VT));
57465 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57466 DAG.getConstant(0, dl, VT));
57467 }
57468 }
57469 }
57470
57471 // If we're extracting from a avx512 bool vector and comparing against zero,
57472 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57473 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57474 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57475 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57476 SDValue Src = Op.getOperand(0);
57477 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57478 isNullConstant(Src.getOperand(1)) &&
57479 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57480 SDValue BoolVec = Src.getOperand(0);
57481 unsigned ShAmt = 0;
57482 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57483 ShAmt = BoolVec.getConstantOperandVal(1);
57484 BoolVec = BoolVec.getOperand(0);
57485 }
57486 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57487 EVT VecVT = BoolVec.getValueType();
57488 unsigned BitWidth = VecVT.getVectorNumElements();
57489 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57490 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57491 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57492 Op = DAG.getBitcast(BCVT, BoolVec);
57493 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57494 DAG.getConstant(Mask, dl, BCVT));
57495 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57496 DAG.getConstant(0, dl, BCVT));
57497 }
57498 }
57499 }
57500
57501 // Peek through any zero-extend if we're only testing for a zero result.
57502 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57503 SDValue Src = Op.getOperand(0);
57504 EVT SrcVT = Src.getValueType();
57505 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57506 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57507 DAG.getConstant(0, dl, SrcVT));
57508 }
57509
57510 // Look for a truncate.
57511 if (Op.getOpcode() != ISD::TRUNCATE)
57512 return SDValue();
57513
57514 SDValue Trunc = Op;
57515 Op = Op.getOperand(0);
57516
57517 // See if we can compare with zero against the truncation source,
57518 // which should help using the Z flag from many ops. Only do this for
57519 // i32 truncated op to prevent partial-reg compares of promoted ops.
57520 EVT OpVT = Op.getValueType();
57521 APInt UpperBits =
57522 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
57523 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57524 onlyZeroFlagUsed(SDValue(N, 0))) {
57525 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57526 DAG.getConstant(0, dl, OpVT));
57527 }
57528
57529 // After this the truncate and arithmetic op must have a single use.
57530 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57531 return SDValue();
57532
57533 unsigned NewOpc;
57534 switch (Op.getOpcode()) {
57535 default: return SDValue();
57536 case ISD::AND:
57537 // Skip AND with a constant. We have special handling for AND with an
57538 // immediate during isel to generate TEST instructions.
57539 if (isa<ConstantSDNode>(Op.getOperand(1)))
57540 return SDValue();
57541 NewOpc = X86ISD::AND;
57542 break;
57543 case ISD::OR: NewOpc = X86ISD::OR; break;
57544 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57545 case ISD::ADD:
57546 // If the carry or overflow flag is used, we can't truncate.
57547 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57548 return SDValue();
57549 NewOpc = X86ISD::ADD;
57550 break;
57551 case ISD::SUB:
57552 // If the carry or overflow flag is used, we can't truncate.
57553 if (needCarryOrOverflowFlag(SDValue(N, 0)))
57554 return SDValue();
57555 NewOpc = X86ISD::SUB;
57556 break;
57557 }
57558
57559 // We found an op we can narrow. Truncate its inputs.
57560 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57561 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57562
57563 // Use a X86 specific opcode to avoid DAG combine messing with it.
57564 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57565 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57566
57567 // For AND, keep a CMP so that we can match the test pattern.
57568 if (NewOpc == X86ISD::AND)
57569 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57570 DAG.getConstant(0, dl, VT));
57571
57572 // Return the flags.
57573 return Op.getValue(1);
57574}
57575
57576static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
57577 TargetLowering::DAGCombinerInfo &DCI,
57578 const X86Subtarget &ST) {
57579 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57580 "Expected X86ISD::ADD or X86ISD::SUB");
57581
57582 SDLoc DL(N);
57583 SDValue LHS = N->getOperand(0);
57584 SDValue RHS = N->getOperand(1);
57585 MVT VT = LHS.getSimpleValueType();
57586 bool IsSub = X86ISD::SUB == N->getOpcode();
57587 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57588
57589 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57590 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57591 return CMP;
57592
57593 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57594 if (!N->hasAnyUseOfValue(1)) {
57595 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57596 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57597 }
57598
57599 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57600 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57601 SDValue Ops[] = {N0, N1};
57602 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57603 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57604 SDValue Op(N, 0);
57605 if (Negate) {
57606 // Bail if this is only used by a user of the x86 add/sub.
57607 if (GenericAddSub->hasOneUse() &&
57608 GenericAddSub->user_begin()->isOnlyUserOf(N))
57609 return;
57610 Op = DAG.getNegative(Op, DL, VT);
57611 }
57612 DCI.CombineTo(GenericAddSub, Op);
57613 }
57614 };
57615 MatchGeneric(LHS, RHS, false);
57616 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57617
57618 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57619 // EFLAGS result doesn't change.
57620 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57621 /*ZeroSecondOpOnly*/ true);
57622}
57623
57624static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
57625 SDValue LHS = N->getOperand(0);
57626 SDValue RHS = N->getOperand(1);
57627 SDValue BorrowIn = N->getOperand(2);
57628
57629 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57630 MVT VT = N->getSimpleValueType(0);
57631 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57632 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57633 }
57634
57635 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57636 // iff the flag result is dead.
57637 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57638 !N->hasAnyUseOfValue(1))
57639 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57640 LHS.getOperand(1), BorrowIn);
57641
57642 return SDValue();
57643}
57644
57645// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57646static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
57647 TargetLowering::DAGCombinerInfo &DCI) {
57648 SDValue LHS = N->getOperand(0);
57649 SDValue RHS = N->getOperand(1);
57650 SDValue CarryIn = N->getOperand(2);
57651 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57652 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57653
57654 // Canonicalize constant to RHS.
57655 if (LHSC && !RHSC)
57656 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57657 CarryIn);
57658
57659 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57660 // the result is either zero or one (depending on the input carry bit).
57661 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57662 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57663 // We don't have a good way to replace an EFLAGS use, so only do this when
57664 // dead right now.
57665 SDValue(N, 1).use_empty()) {
57666 SDLoc DL(N);
57667 EVT VT = N->getValueType(0);
57668 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57669 SDValue Res1 = DAG.getNode(
57670 ISD::AND, DL, VT,
57672 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57673 DAG.getConstant(1, DL, VT));
57674 return DCI.CombineTo(N, Res1, CarryOut);
57675 }
57676
57677 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57678 // iff the flag result is dead.
57679 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57680 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57681 SDLoc DL(N);
57682 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57683 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57684 DAG.getConstant(0, DL, LHS.getValueType()),
57685 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57686 }
57687
57688 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57689 MVT VT = N->getSimpleValueType(0);
57690 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57691 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57692 }
57693
57694 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57695 // iff the flag result is dead.
57696 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57697 !N->hasAnyUseOfValue(1))
57698 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57699 LHS.getOperand(1), CarryIn);
57700
57701 return SDValue();
57702}
57703
57704static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
57705 const SDLoc &DL, EVT VT,
57706 const X86Subtarget &Subtarget) {
57707 using namespace SDPatternMatch;
57708
57709 // Example of pattern we try to detect:
57710 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57711 //(add (build_vector (extract_elt t, 0),
57712 // (extract_elt t, 2),
57713 // (extract_elt t, 4),
57714 // (extract_elt t, 6)),
57715 // (build_vector (extract_elt t, 1),
57716 // (extract_elt t, 3),
57717 // (extract_elt t, 5),
57718 // (extract_elt t, 7)))
57719
57720 if (!Subtarget.hasSSE2())
57721 return SDValue();
57722
57723 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57724 VT.getVectorNumElements() < 4 ||
57725 !isPowerOf2_32(VT.getVectorNumElements()))
57726 return SDValue();
57727
57728 SDValue Op0, Op1, Accum;
57733 m_Value(Op1))))))
57734 return SDValue();
57735
57736 // Check if one of Op0,Op1 is of the form:
57737 // (build_vector (extract_elt Mul, 0),
57738 // (extract_elt Mul, 2),
57739 // (extract_elt Mul, 4),
57740 // ...
57741 // the other is of the form:
57742 // (build_vector (extract_elt Mul, 1),
57743 // (extract_elt Mul, 3),
57744 // (extract_elt Mul, 5),
57745 // ...
57746 // and identify Mul.
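  // Illustrative example (assumed shapes): with x0, x1 : v8i16 and
  // t = mul(sext(x0), sext(x1)) : v8i32, the even/odd build_vectors above add
  // pairwise into vpmaddwd x0, x1 : v4i32, where lane j computes
  // x0[2j]*x1[2j] + x0[2j+1]*x1[2j+1].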
57747 SDValue Mul;
57748 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57749 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57750 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57751 // TODO: Be more tolerant to undefs.
57752 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57753 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57754 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57755 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57756 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57757 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57758 return SDValue();
57759 // Commutativity of mul allows factors of a product to reorder.
57760 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57761 std::swap(Idx0L, Idx1L);
57762 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57763 std::swap(Idx0H, Idx1H);
57764 // Commutativity of add allows pairs of factors to reorder.
57765 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57766 std::swap(Idx0L, Idx0H);
57767 std::swap(Idx1L, Idx1H);
57768 }
57769 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57770 Idx1H != 2 * i + 3)
57771 return SDValue();
57772 if (!Mul) {
57773 // First time an extract_elt's source vector is visited. Must be a MUL
57774 // with 2X number of vector elements than the BUILD_VECTOR.
57775 // Both extracts must be from same MUL.
57776 Mul = Vec0L;
57777 if (Mul.getOpcode() != ISD::MUL ||
57778 Mul.getValueType().getVectorNumElements() != 2 * e)
57779 return SDValue();
57780 }
57781 // Check that the extract is from the same MUL previously seen.
57782 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57783 return SDValue();
57784 }
57785
57786 // Check if the Mul source can be safely shrunk.
57787 ShrinkMode Mode;
57788 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57789 Mode == ShrinkMode::MULU16)
57790 return SDValue();
57791
57792 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57793 VT.getVectorNumElements() * 2);
57794 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57795 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57796
57797 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57798 ArrayRef<SDValue> Ops) {
57799 EVT InVT = Ops[0].getValueType();
57800 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57801 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57802 InVT.getVectorNumElements() / 2);
57803 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57804 };
57805 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57806 if (Accum)
57807 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57808 return R;
57809}
57810
57811// Attempt to turn this pattern into PMADDWD.
57812// (add (mul (sext (build_vector)), (sext (build_vector))),
57813// (mul (sext (build_vector)), (sext (build_vector)))
57814static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
57815 const SDLoc &DL, EVT VT,
57816 const X86Subtarget &Subtarget) {
57817 using namespace SDPatternMatch;
57818
57819 if (!Subtarget.hasSSE2())
57820 return SDValue();
57821
57822 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57823 VT.getVectorNumElements() < 4 ||
57824 !isPowerOf2_32(VT.getVectorNumElements()))
57825 return SDValue();
57826
57827 // All inputs need to be sign extends.
57828 // TODO: Support ZERO_EXTEND from known positive?
57829 SDValue N00, N01, N10, N11;
57830 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57831 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57832 return SDValue();
57833
57834 // Must be extending from vXi16.
57835 EVT InVT = N00.getValueType();
57836 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57837 N10.getValueType() != InVT || N11.getValueType() != InVT)
57838 return SDValue();
57839
57840 // All inputs should be build_vectors.
57841 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57842 N01.getOpcode() != ISD::BUILD_VECTOR ||
57843 N10.getOpcode() != ISD::BUILD_VECTOR ||
57844 N11.getOpcode() != ISD::BUILD_VECTOR)
57845 return SDValue();
57846
57847 // For each element, we need to ensure we have an odd element from one vector
57848 // multiplied by the odd element of another vector and the even element from
57849 // one of the same vectors being multiplied by the even element from the
57850 // other vector. So we need to make sure for each element i, this operator
57851 // is being performed:
57852 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
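  // Illustrative example (assumed shapes): for a v4i32 result with
  // A, B : v8i16, lane i must be A[2i]*B[2i] + A[2i+1]*B[2i+1]; once every
  // lane matches this shape the whole expression becomes vpmaddwd A, B.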
57853 SDValue In0, In1;
57854 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57855 SDValue N00Elt = N00.getOperand(i);
57856 SDValue N01Elt = N01.getOperand(i);
57857 SDValue N10Elt = N10.getOperand(i);
57858 SDValue N11Elt = N11.getOperand(i);
57859 // TODO: Be more tolerant to undefs.
57860 SDValue N00In, N01In, N10In, N11In;
57861 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57862 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57863 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57864 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57865 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57866 return SDValue();
57867 // Add is commutative so indices can be reordered.
57868 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57869 std::swap(IdxN00, IdxN10);
57870 std::swap(IdxN01, IdxN11);
57871 }
57872 // N0 indices must be the even elements. N1 indices must be the next odd elements.
57873 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57874 IdxN11 != 2 * i + 1)
57875 return SDValue();
57876
57877 // First time we find an input capture it.
57878 if (!In0) {
57879 In0 = N00In;
57880 In1 = N01In;
57881
57882 // The input vectors must be at least as wide as the output.
57883 // If they are larger than the output, we extract subvector below.
57884 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57885 In1.getValueSizeInBits() < VT.getSizeInBits())
57886 return SDValue();
57887 }
57888 // Mul is commutative so the input vectors can be in any order.
57889 // Canonicalize to make the compares easier.
57890 if (In0 != N00In)
57891 std::swap(N00In, N01In);
57892 if (In0 != N10In)
57893 std::swap(N10In, N11In);
57894 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57895 return SDValue();
57896 }
57897
57898 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57899 ArrayRef<SDValue> Ops) {
57900 EVT OpVT = Ops[0].getValueType();
57901 assert(OpVT.getScalarType() == MVT::i16 &&
57902 "Unexpected scalar element type");
57903 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57904 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57905 OpVT.getVectorNumElements() / 2);
57906 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57907 };
57908
57909 // If the output is narrower than an input, extract the low part of the input
57910 // vector.
57911 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57912 VT.getVectorNumElements() * 2);
57913 if (OutVT16.bitsLT(In0.getValueType())) {
57914 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57915 DAG.getVectorIdxConstant(0, DL));
57916 }
57917 if (OutVT16.bitsLT(In1.getValueType())) {
57918 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57919 DAG.getVectorIdxConstant(0, DL));
57920 }
57921 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57922 PMADDBuilder);
57923}
57924
57925// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57926 // If the upper element in each pair of both VPMADDWD operands is zero then
57927 // we can merge the operand elements and use the implicit add of VPMADDWD.
57928// TODO: Add support for VPMADDUBSW (which isn't commutable).
57929static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
57930 const SDLoc &DL, EVT VT) {
57931 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57932 return SDValue();
57933
57934 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57935 if (VT.getSizeInBits() > 128)
57936 return SDValue();
57937
57938 unsigned NumElts = VT.getVectorNumElements();
57939 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57941 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57942
57943 bool Op0HiZero =
57944 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57945 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57946 bool Op1HiZero =
57947 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57948 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57949
57950 // TODO: Check for zero lower elements once we have actual codegen that
57951 // creates them.
57952 if (!Op0HiZero || !Op1HiZero)
57953 return SDValue();
57954
57955 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57956 SmallVector<int> Mask;
57957 for (int i = 0; i != (int)NumElts; ++i) {
57958 Mask.push_back(2 * i);
57959 Mask.push_back(2 * (i + NumElts));
57960 }
57961
57962 SDValue LHS =
57963 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57964 SDValue RHS =
57965 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57966 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57967}
57968
57969/// CMOV of constants requires materializing constant operands in registers.
57970/// Try to fold those constants into an 'add' instruction to reduce instruction
57971/// count. We do this with CMOV rather than the generic 'select' because there are
57972/// earlier folds that may be used to turn select-of-constants into logic hacks.
57973static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
57974 SelectionDAG &DAG,
57975 const X86Subtarget &Subtarget) {
57976 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57977 // better because we eliminate 1-2 instructions. This transform is still
57978 // an improvement without zero operands because we trade 2 move constants and
57979 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57980 // immediate asm operands (fit in 32-bits).
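  // Illustrative example (assumed constants): when %y is not a foldable load,
  //   add (cmov 0, 42), %y --> cmov %y, (add %y, 42)
  // so neither constant has to be materialized into a register beforehand.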
57981 auto isSuitableCmov = [](SDValue V) {
57982 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57983 return false;
57984 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57985 !isa<ConstantSDNode>(V.getOperand(1)))
57986 return false;
57987 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57988 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57989 V.getConstantOperandAPInt(1).isSignedIntN(32));
57990 };
57991
57992 // Match an appropriate CMOV as the first operand of the add.
57993 SDValue Cmov = N->getOperand(0);
57994 SDValue OtherOp = N->getOperand(1);
57995 if (!isSuitableCmov(Cmov))
57996 std::swap(Cmov, OtherOp);
57997 if (!isSuitableCmov(Cmov))
57998 return SDValue();
57999
58000 // Don't remove a load folding opportunity for the add. That would neutralize
58001 // any improvements from removing constant materializations.
58002 if (X86::mayFoldLoad(OtherOp, Subtarget))
58003 return SDValue();
58004
58005 EVT VT = N->getValueType(0);
58006 SDValue FalseOp = Cmov.getOperand(0);
58007 SDValue TrueOp = Cmov.getOperand(1);
58008
58009 // We will push the add through the select, but we can potentially do better
58010 // if we know there is another add in the sequence and this is pointer math.
58011 // In that case, we can absorb an add into the trailing memory op and avoid
58012 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58013 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58014 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58015 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58016 all_of(N->users(), [&](SDNode *Use) {
58017 auto *MemNode = dyn_cast<MemSDNode>(Use);
58018 return MemNode && MemNode->getBasePtr().getNode() == N;
58019 })) {
58020 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58021 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58022 // it is possible that choosing op1 might be better.
58023 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58024 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58025 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58026 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58027 Cmov.getOperand(2), Cmov.getOperand(3));
58028 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58029 }
58030
58031 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58032 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58033 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58034 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58035 Cmov.getOperand(3));
58036}
58037
58038// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58039// when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
58040static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
58041 EVT VT, const X86Subtarget &Subtarget) {
58042 using namespace SDPatternMatch;
58043 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58044 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58045 return SDValue();
58046
58047 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58048 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58049 VT.getSizeInBits() < 512)
58050 return SDValue();
58051
58052 const auto TotalSize = VT.getSizeInBits();
58053 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58054 return SDValue();
58055
58056 SDValue X, Y, Acc;
58057 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58058 return SDValue();
58059
58060 KnownBits KnownX = DAG.computeKnownBits(X);
58061 if (KnownX.countMinLeadingZeros() < 12)
58062 return SDValue();
58063 KnownBits KnownY = DAG.computeKnownBits(Y);
58064 if (KnownY.countMinLeadingZeros() < 12)
58065 return SDValue();
58066 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58067 if (KnownMul.countMinLeadingZeros() < 12)
58068 return SDValue();
58069
58070 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58071 ArrayRef<SDValue> SubOps) {
58072 EVT SubVT = SubOps[0].getValueType();
58073 assert(SubVT.getScalarSizeInBits() == 64 &&
58074 "Unexpected element size, only supports 64bit size");
58075 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58076 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58077 };
58078
58079 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58080 /*CheckBWI*/ false,
58081 /*AllowAVX512*/ Subtarget.hasIFMA());
58082}
58083
58084static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
58085 TargetLowering::DAGCombinerInfo &DCI,
58086 const X86Subtarget &Subtarget) {
58087 using namespace SDPatternMatch;
58088 EVT VT = N->getValueType(0);
58089 SDValue Op0 = N->getOperand(0);
58090 SDValue Op1 = N->getOperand(1);
58091 SDLoc DL(N);
58092
58093 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58094 return Select;
58095
58096 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58097 return MAdd;
58098 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58099 return MAdd;
58100 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58101 return MAdd;
58102
58103 // Try to synthesize horizontal adds from adds of shuffles.
58104 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58105 return V;
58106
58107 // Canonicalize hidden LEA pattern:
58108 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58109 // iff c < 4
58110 if (VT == MVT::i32 || VT == MVT::i64) {
58111 SDValue Y, Z, Shift;
58112 APInt Amt;
58113 if (sd_match(
58114 N, m_Add(m_OneUse(m_Sub(m_AllOf(m_Value(Shift),
58115 m_Shl(m_Value(), m_ConstInt(Amt))),
58116 m_Value(Y))),
58117 m_Value(Z))) &&
58118 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58119 return DAG.getNode(ISD::SUB, DL, VT,
58120 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58121 }
58122 }
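// Illustrative example (editorial addition):
//   add (sub (shl x, 2), y), z  -->  sub (add (shl x, 2), z), y
// lets the shl+add half be selected as a scaled-index LEA (z + x*4), leaving a
// single SUB. Shift amounts of 4 or more exceed the 1/2/4/8 scale factors an
// LEA can encode, hence the c < 4 restriction above.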
58123
58124 SDValue X, Y;
58125
58126 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58127 // iff X and Y won't overflow.
58128 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58129 sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
58130 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58131 MVT OpVT = X.getSimpleValueType();
58132 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58133 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58134 getZeroVector(OpVT, Subtarget, DAG, DL));
58135 }
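// Illustrative note (editorial addition): PSADBW(V, 0) horizontally sums the
// unsigned bytes of V within each 64-bit lane, so summing two such results is
// equivalent to one PSADBW of the byte-wise sum, provided add(X, Y) cannot
// wrap per byte - which is what the willNotOverflowAdd check guarantees.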
58136
58137 if (VT.isVector()) {
58138 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58139 VT.getVectorElementCount());
58140
58141 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58142 // (sub Y, (sext (vXi1 X))).
58143 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58144 // in generic DAG combine without a legal type check, but adding this there
58145 // caused regressions.
58146 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58148 m_Value(Y)))) {
58149 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58150 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58151 }
58152
58153 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58154 // canonicalisation as we don't have good vXi8 shifts.
58155 if (VT.getScalarType() == MVT::i8 &&
58157 SDValue Cmp =
58158 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58159 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58160 }
58161 }
58162
58163 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58164 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58165 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58166 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58167 if (sd_match(N, m_Add(m_Value(Accum),
58170 m_Value(Lo1)),
58172 m_Value(Hi1)))))) {
58173 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58174 concatSubVectors(Lo0, Hi0, DAG, DL),
58175 concatSubVectors(Lo1, Hi1, DAG, DL));
58176 }
58177 }
58178
58179 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58180 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58181 X86::isZeroNode(Op0.getOperand(1))) {
58182 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58183 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58184 Op0.getOperand(0), Op0.getOperand(2));
58185 }
58186
58187 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58188 return IFMA52;
58189
58190 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58191}
58192
58193// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58194// condition comes from the subtract node that produced -X. This matches the
58195// cmov expansion for absolute value. By swapping the operands we convert abs
58196// to nabs.
58197static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58198 SelectionDAG &DAG) {
58199 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58200 return SDValue();
58201
58202 SDValue Cond = N1.getOperand(3);
58203 if (Cond.getOpcode() != X86ISD::SUB)
58204 return SDValue();
58205 assert(Cond.getResNo() == 1 && "Unexpected result number");
58206
58207 SDValue FalseOp = N1.getOperand(0);
58208 SDValue TrueOp = N1.getOperand(1);
58209 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58210
58211 // ABS condition should come from a negate operation.
58212 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58213 isNullConstant(Cond.getOperand(0))) {
58214 // Get the X and -X from the negate.
58215 SDValue NegX = Cond.getValue(0);
58216 SDValue X = Cond.getOperand(1);
58217
58218 // Cmov operands should be X and NegX. Order doesn't matter.
58219 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58220 return SDValue();
58221
58222 // Build a new CMOV with the operands swapped.
58223 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58224 N1.getOperand(2), Cond);
58225 // Convert sub to add.
58226 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58227 }
58228
58229 // Handle ABD special case:
58230 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58231 // ABD condition should come from a pair of matching subtracts.
58232 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58233 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58234 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58235 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58236 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58237 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58238 // Build a new CMOV with the operands swapped.
58239 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58240 Cond);
58241 }
58242
58243 return SDValue();
58244}
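// Illustrative example (editorial addition): with Y = 10 and X = -3, the cmov
// abs expansion yields abs(X) = 3 and sub Y, abs(X) = 7; after swapping the
// CMOV operands the node computes nabs(X) = -3 and add Y, nabs(X) = 7, so the
// outer SUB becomes a commutable ADD with the same result.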
58245
58246 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58247 SDValue Op0 = N->getOperand(0);
58248 SDValue Op1 = N->getOperand(1);
58249
58250 // (sub C (zero_extend (setcc)))
58251 // =>
58252 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58253 // Don't disturb (sub 0 setcc), which is easily done with neg.
58254 EVT VT = N->getValueType(0);
58255 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58256 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58257 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58258 Op1.getOperand(0).hasOneUse()) {
58259 SDValue SetCC = Op1.getOperand(0);
58262 APInt NewImm = Op0C->getAPIntValue() - 1;
58263 SDLoc DL(Op1);
58264 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58265 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58266 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58267 DAG.getConstant(NewImm, DL, VT));
58268 }
58269
58270 return SDValue();
58271}
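// Illustrative example (editorial addition): with C = 5,
//   sub 5, (zero_extend (setcc eq, flags))
// becomes
//   add (zero_extend (setcc ne, flags)), 4
// i.e. the condition is inverted and the constant becomes C-1, keeping the
// immediate on the right where an ADD can encode it directly.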
58272
58274 if (N->getConstantOperandVal(3) != X86::COND_NE)
58275 return SDValue();
58276
58277 SDValue Sub = N->getOperand(4);
58278 if (Sub.getOpcode() != X86ISD::SUB)
58279 return SDValue();
58280
58281 SDValue Op1 = Sub.getOperand(1);
58282
58283 if (!X86::isZeroNode(Sub.getOperand(0)))
58284 return SDValue();
58285
58286 SDLoc DL(N);
58287 SmallVector<SDValue, 5> Ops(N->op_values());
58288 if (Op1.getOpcode() == X86ISD::SETCC) {
58289 // res, flags2 = sub 0, (setcc cc, flag)
58290 // cload/cstore ..., cond_ne, flag2
58291 // ->
58292 // cload/cstore cc, flag
58293 Ops[3] = Op1.getOperand(0);
58294 Ops[4] = Op1.getOperand(1);
58295 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58296 SDValue Src = Op1;
58297 SDValue Op10 = Op1.getOperand(0);
58298 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58299 // res, flags2 = sub 0, (and (xor X, -1), Y)
58300 // cload/cstore ..., cond_ne, flag2
58301 // ->
58302 // res, flags2 = sub 0, (and X, Y)
58303 // cload/cstore ..., cond_e, flag2
58304 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58305 Op1.getOperand(1));
58306 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58307 }
58308 // res, flags2 = sub 0, (and X, Y)
58309 // cload/cstore ..., cc, flag2
58310 // ->
58311 // res, flags2 = cmp (and X, Y), 0
58312 // cload/cstore ..., cc, flag2
58313 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58314 } else {
58315 return SDValue();
58316 }
58317
58318 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58319 cast<MemSDNode>(N)->getMemoryVT(),
58320 cast<MemSDNode>(N)->getMemOperand());
58321}
58322
58323 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58324 TargetLowering::DAGCombinerInfo &DCI,
58325 const X86Subtarget &Subtarget) {
58326 EVT VT = N->getValueType(0);
58327 SDValue Op0 = N->getOperand(0);
58328 SDValue Op1 = N->getOperand(1);
58329 SDLoc DL(N);
58330
58331 auto IsNonOpaqueConstant = [&](SDValue Op) {
58332 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58333 /*AllowOpaques*/ false);
58334 };
58335
58336 // X86 can't encode an immediate LHS of a sub. See if we can push the
58337 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58338 // one use and a constant, invert the immediate, saving one register.
58339 // However, ignore cases where C1 is 0, as those will become a NEG.
58340 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58341 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58342 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58343 Op1->hasOneUse()) {
58344 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58345 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58346 SDValue NewAdd =
58347 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58348 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58349 }
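// Illustrative example (editorial addition): using xor(X, ~C2) == -xor(X, C2) - 1,
//   sub 10, (xor X, 0xF)  -->  add (xor X, ~0xF), 11
// which is the C1+1 / ~C2 rewrite above with C1 = 10 and C2 = 0xF.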
58350
58351 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58352 return V;
58353
58354 // Try to synthesize horizontal subs from subs of shuffles.
58355 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58356 return V;
58357
58358 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58359 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58360 X86::isZeroNode(Op1.getOperand(1))) {
58361 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58362 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58363 Op1.getOperand(0), Op1.getOperand(2));
58364 }
58365
58366 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58367 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58368 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58369 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58370 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58371 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58372 Op1.getOperand(1), Op1.getOperand(2));
58373 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58374 }
58375
58376 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58377 return V;
58378
58379 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58380 return V;
58381
58382 return combineSubSetcc(N, DAG);
58383}
58384
58386 const X86Subtarget &Subtarget) {
58387 unsigned Opcode = N->getOpcode();
58388 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58389 "Unknown PCMP opcode");
58390
58391 SDValue LHS = N->getOperand(0);
58392 SDValue RHS = N->getOperand(1);
58393 MVT VT = N->getSimpleValueType(0);
58394 unsigned EltBits = VT.getScalarSizeInBits();
58395 unsigned NumElts = VT.getVectorNumElements();
58396 SDLoc DL(N);
58397
58398 if (LHS == RHS)
58399 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58400 : DAG.getConstant(0, DL, VT);
58401
58402 // Constant Folding.
58403 // PCMPEQ(X,UNDEF) -> UNDEF
58404 // PCMPGT(X,UNDEF) -> 0
58405 // PCMPGT(UNDEF,X) -> 0
58406 APInt LHSUndefs, RHSUndefs;
58407 SmallVector<APInt> LHSBits, RHSBits;
58408 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58409 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58410 APInt Ones = APInt::getAllOnes(EltBits);
58411 APInt Zero = APInt::getZero(EltBits);
58412 SmallVector<APInt> Results(NumElts);
58413 for (unsigned I = 0; I != NumElts; ++I) {
58414 if (Opcode == X86ISD::PCMPEQ) {
58415 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58416 } else {
58417 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58418 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58419 }
58420 }
58421 if (Opcode == X86ISD::PCMPEQ)
58422 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58423 return getConstVector(Results, VT, DAG, DL);
58424 }
58425
58426 return SDValue();
58427}
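// Illustrative example (editorial addition): for v4i32 constant operands
//   PCMPGT(<3, -1, undef, 7>, <2, 0, 1, 7>) folds to <-1, 0, 0, 0>
// (signed greater-than per lane, any undef lane becomes 0), while PCMPEQ of
// the same operands folds to <0, 0, undef, -1> since equal-compare preserves
// undef lanes.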
58428
58429// Helper to determine if we can convert an integer comparison to a float
58430 // comparison by casting the operands.
58431static std::optional<unsigned>
58432CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58433 unsigned NumSignificantBitsRHS) {
58434 MVT SVT = VT.getScalarType();
58435 assert(SVT == MVT::f32 && "Only tested for float so far");
58436 const fltSemantics &Sem = SVT.getFltSemantics();
58437 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58438 "Only PCMPEQ/PCMPGT currently supported");
58439
58440 // TODO: Handle bitcastable integers.
58441
58442 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58443 // a fp value.
58444 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58445 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58446 return ISD::SINT_TO_FP;
58447
58448 return std::nullopt;
58449}
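// Illustrative note (editorial addition): f32 has a 24-bit significand, so two
// i32 vectors whose elements are known to use at most 24 significant bits
// (e.g. values zero-extended from i16) can be converted with SINT_TO_FP and
// compared as floats, which the PCMPEQ/PCMPGT concat combine below uses on
// AVX1-only targets.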
58450
58451/// Helper that combines an array of subvector ops as if they were the operands
58452 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58453/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58454 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58455 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58456 const X86Subtarget &Subtarget,
58457 unsigned Depth) {
58458 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58459 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58460
58461 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58462 return DAG.getUNDEF(VT);
58463
58464 if (llvm::all_of(Ops, [](SDValue Op) {
58465 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58466 }))
58467 return getZeroVector(VT, Subtarget, DAG, DL);
58468
58469 if (Depth >= SelectionDAG::MaxRecursionDepth)
58470 return SDValue(); // Limit search depth.
58471
58472 SDValue Op0 = Ops[0];
58473 bool IsSplat = llvm::all_equal(Ops);
58474 unsigned NumOps = Ops.size();
58475 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58476 LLVMContext &Ctx = *DAG.getContext();
58477
58478 // Repeated subvectors.
58479 if (IsSplat &&
58480 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58481 // If this broadcast is inserted into both halves, use a larger broadcast.
58482 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58483 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58484
58485 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58486 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58487 (Subtarget.hasAVX2() ||
58488 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58489 VT.getScalarType(), Subtarget)))
58490 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58491 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58492 Op0.getOperand(0),
58493 DAG.getVectorIdxConstant(0, DL)));
58494
58495 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58496 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58497 (Subtarget.hasAVX2() ||
58498 (EltSizeInBits >= 32 &&
58499 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58500 Op0.getOperand(0).getValueType() == VT.getScalarType())
58501 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58502
58503 // concat_vectors(extract_subvector(splat(x)),
58504 // extract_subvector(splat(x))) -> splat(x)
58505 // concat_vectors(extract_subvector(subv_broadcast(x)),
58506 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58507 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58508 Op0.getOperand(0).getValueType() == VT) {
58509 SDValue SrcVec = Op0.getOperand(0);
58510 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58511 return SrcVec;
58512 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58513 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58514 return SrcVec;
58515 }
58516
58517 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58518 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58519 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58520 return DAG.getNode(Op0.getOpcode(), DL, VT,
58521 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58522 Op0.getOperand(0), Op0.getOperand(0)),
58523 Op0.getOperand(1));
58524 }
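// Illustrative example (editorial addition): splatting a 128-bit broadcast
// into both halves, e.g.
//   concat_vectors (v4i32 vbroadcast x), (v4i32 vbroadcast x)
// is replaced above by a single v8i32 X86ISD::VBROADCAST of x, which lowers to
// one full-width broadcast instruction instead of a broadcast plus insert.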
58525
58526 // TODO: This should go in combineX86ShufflesRecursively eventually.
58527 if (NumOps == 2) {
58528 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58529 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58530 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58531 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58532 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58533 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58534 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58535 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58536 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58537 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58538 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58539 // Only concat subvector high halves (which vperm2x128 is best at) or
58540 // cases that should fold into a subvector broadcast.
58541 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58542 SrcVT1.is256BitVector()) {
58543 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58544 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58545 "Bad subvector index");
58546 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58547 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58548 unsigned Index = 0;
58549 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58550 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58551 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58552 DAG.getBitcast(VT, Src0.getOperand(0)),
58553 DAG.getBitcast(VT, Src1.getOperand(0)),
58554 DAG.getTargetConstant(Index, DL, MVT::i8));
58555 }
58556 }
58557 // Widen extract_subvector
58558 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58559 // --> extract_subvector(x,lo)
58560 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58561 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58562 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58563 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58564 return DAG.getBitcast(VT,
58565 extractSubVector(Src0.getOperand(0),
58566 Src0.getConstantOperandVal(1),
58567 DAG, DL, VT.getSizeInBits()));
58568 }
58569 }
58570 }
58571
58572 // Repeated opcode.
58573 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58574 // but it currently struggles with different vector widths.
58575 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58576 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58577 })) {
58578 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58580 for (SDValue SubOp : SubOps)
58581 Subs.push_back(SubOp.getOperand(I));
58582 // Attempt to peek through bitcasts and concat the original subvectors.
58583 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58584 if (SubVT.isSimple() && SubVT.isVector()) {
58585 MVT ConcatVT =
58587 SubVT.getVectorElementCount() * Subs.size());
58588 for (SDValue &Sub : Subs)
58589 Sub = DAG.getBitcast(SubVT, Sub);
58590 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58591 Subtarget, Depth + 1))
58592 return DAG.getBitcast(VT, ConcatSrc);
58593 return DAG.getBitcast(
58594 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58595 }
58596 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58597 };
58598 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58599 bool AllConstants = true;
58600 bool AllSubs = true;
58601 unsigned VecSize = VT.getSizeInBits();
58602 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58603 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58604 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58605 }))
58606 return true;
58607 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58608 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58609 unsigned SubSize = BC.getValueSizeInBits();
58610 unsigned EltSize = BC.getScalarValueSizeInBits();
58611 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58612 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58613 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58614 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58615 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58616 }
58617 return AllConstants || AllSubs;
58618 };
58619 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58620 bool AllConstants = true;
58622 for (SDValue SubOp : SubOps) {
58623 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58624 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58625 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58626 Subs.push_back(SubOp.getOperand(I));
58627 }
58628 if (AllConstants)
58629 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58630 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58631 };
58632
58633 unsigned Opcode = Op0.getOpcode();
58634 switch (Opcode) {
58635 case ISD::BITCAST: {
58636 // TODO: Support AVX1/AVX2 bitcasts.
58638 for (SDValue SubOp : Ops)
58639 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58640 EVT InnerVT = SubOps[0].getValueType();
58641 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58642 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58643 (Subtarget.hasBWI() ||
58644 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58645 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58646 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58647 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58648 return Op.getValueType() == InnerVT;
58649 })) {
58650 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58651 MVT ConcatVT = MVT::getVectorVT(
58652 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58653 if (SDValue ConcatSrc = combineConcatVectorOps(
58654 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58655 return DAG.getBitcast(VT, ConcatSrc);
58656 }
58657 break;
58658 }
58659 case ISD::VECTOR_SHUFFLE: {
58660 // TODO: Generalize NumOps support.
58661 if (!IsSplat && NumOps == 2 &&
58662 ((VT.is256BitVector() &&
58663 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58664 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58665 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58666 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58667 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58668 if (Concat0 || Concat1 ||
58669 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58670 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58671 Subtarget.hasVBMI())) {
58672 int NumSubElts = Op0.getValueType().getVectorNumElements();
58673 SmallVector<int> NewMask;
58674 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58675 M = M >= NumSubElts ? M + NumSubElts : M;
58676 NewMask.push_back(M);
58677 }
58678 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58679 if (0 <= M)
58680 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58681 NewMask.push_back(M);
58682 }
58683 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58684 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58685 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58686 }
58687 }
58688 break;
58689 }
58690 case X86ISD::VBROADCAST: {
58691 // TODO: 512-bit VBROADCAST concatenation.
58692 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58693 return Op.getOperand(0).getValueType().is128BitVector();
58694 })) {
58695 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58696 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58697 ConcatSubOperand(VT, Ops, 0),
58698 ConcatSubOperand(VT, Ops, 0));
58699 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58700 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58701 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58702 : X86ISD::PSHUFD,
58703 DL, VT, ConcatSubOperand(VT, Ops, 0),
58704 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58705 }
58706 break;
58707 }
58708 case X86ISD::MOVDDUP:
58709 case X86ISD::MOVSHDUP:
58710 case X86ISD::MOVSLDUP: {
58711 if (!IsSplat && (VT.is256BitVector() ||
58712 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58713 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58714 break;
58715 }
58716 case X86ISD::SHUFP: {
58717 if (!IsSplat &&
58718 (VT == MVT::v8f32 ||
58719 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58720 llvm::all_of(Ops, [Op0](SDValue Op) {
58721 return Op.getOperand(2) == Op0.getOperand(2);
58722 })) {
58723 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58724 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58725 if (Concat0 || Concat1)
58726 return DAG.getNode(Opcode, DL, VT,
58727 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58728 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58729 Op0.getOperand(2));
58730 }
58731 break;
58732 }
58733 case X86ISD::UNPCKH:
58734 case X86ISD::UNPCKL: {
58735 // TODO: UNPCK should use CombineSubOperand
58736 // Don't concatenate build_vector patterns.
58737 if (!IsSplat &&
58738 ((VT.is256BitVector() &&
58739 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58740 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58741 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58742 none_of(Ops, [](SDValue Op) {
58743 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58744 ISD::BUILD_VECTOR ||
58745 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58746 ISD::BUILD_VECTOR;
58747 })) {
58748 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58749 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58750 if (Concat0 || Concat1 ||
58751 (Subtarget.hasInt256() && EltSizeInBits == 64))
58752 return DAG.getNode(Opcode, DL, VT,
58753 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58754 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58755 }
58756 break;
58757 }
58758 case X86ISD::PSHUFHW:
58759 case X86ISD::PSHUFLW:
58760 case X86ISD::PSHUFD:
58761 if (!IsSplat &&
58762 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58763 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58764 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58765 llvm::all_of(Ops, [Op0](SDValue Op) {
58766 return Op.getOperand(1) == Op0.getOperand(1);
58767 })) {
58768 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58769 Op0.getOperand(1));
58770 }
58771 [[fallthrough]];
58772 case X86ISD::VPERMILPI:
58773 if (!IsSplat && EltSizeInBits == 32 &&
58774 (VT.is256BitVector() ||
58775 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58776 all_of(Ops, [&Op0](SDValue Op) {
58777 return Op0.getOperand(1) == Op.getOperand(1);
58778 })) {
58779 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58780 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58781 Res =
58782 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58783 return DAG.getBitcast(VT, Res);
58784 }
58785 break;
58786 case X86ISD::VPERMILPV:
58787 if (!IsSplat && (VT.is256BitVector() ||
58788 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58789 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58790 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58791 if (Concat0 || Concat1)
58792 return DAG.getNode(Opcode, DL, VT,
58793 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58794 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58795 }
58796 break;
58797 case X86ISD::PSHUFB:
58798 case X86ISD::PSADBW:
58799 case X86ISD::VPMADDUBSW:
58800 case X86ISD::VPMADDWD:
58801 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58802 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58803 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58804 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58805 NumOps * SrcVT.getVectorNumElements());
58806 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58807 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58808 if (Concat0 || Concat1)
58809 return DAG.getNode(
58810 Opcode, DL, VT,
58811 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58812 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58813 }
58814 break;
58815 case X86ISD::VPERMV:
58816 // TODO: Handle 256-bit and NumOps == 4 cases.
58817 if (!IsSplat && NumOps == 2 &&
58818 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58819 MVT OpVT = Op0.getSimpleValueType();
58820 int NumSrcElts = OpVT.getVectorNumElements();
58821 SmallVector<int, 64> ConcatMask;
58822 for (unsigned i = 0; i != NumOps; ++i) {
58823 SmallVector<int, 64> SubMask;
58825 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58826 break;
58827 for (int M : SubMask) {
58828 if (0 <= M)
58829 M += i * NumSrcElts;
58830 ConcatMask.push_back(M);
58831 }
58832 }
58833 if (ConcatMask.size() == (NumOps * NumSrcElts))
58834 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58835 ConcatSubOperand(VT, Ops, 1),
58836 DAG.getUNDEF(VT), Subtarget, DAG);
58837 }
58838 break;
58839 case X86ISD::VPERMV3:
58840 // TODO: Handle 256-bit and NumOps == 4 cases.
58841 if (!IsSplat && NumOps == 2 &&
58842 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58843 MVT OpVT = Op0.getSimpleValueType();
58844 int NumSrcElts = OpVT.getVectorNumElements();
58845 SmallVector<int, 64> ConcatMask;
58846 for (unsigned i = 0; i != NumOps; ++i) {
58847 SmallVector<int, 64> SubMask;
58849 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58850 break;
58851 for (int M : SubMask) {
58852 if (0 <= M) {
58853 int Src = M < NumSrcElts ? 0 : 2;
58854 M += M < NumSrcElts ? 0 : NumSrcElts;
58855
58856 // Reference the lowest sub if the upper sub is the same.
58857 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58858 M += i * NumSrcElts;
58859 }
58860 ConcatMask.push_back(M);
58861 }
58862 }
58863 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58864 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58865 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58866 if (Concat0 || Concat1)
58867 return lowerShuffleWithPERMV(
58868 DL, VT, ConcatMask,
58869 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58870 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58871 DAG);
58872 }
58873 }
58874 break;
58875 case X86ISD::VPERM2X128: {
58876 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58877 assert(NumOps == 2 && "Bad concat_vectors operands");
58878 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58879 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58880 // TODO: Handle zero'd subvectors.
58881 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58882 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
58883 (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
58884 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58885 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58886 Ops[0].getOperand(1), DAG, DL);
58887 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58888 Ops[1].getOperand(1), DAG, DL);
58889 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58890 DAG.getBitcast(ShuffleVT, LHS),
58891 DAG.getBitcast(ShuffleVT, RHS),
58892 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58893 return DAG.getBitcast(VT, Res);
58894 }
58895 }
58896 break;
58897 }
58898 case X86ISD::SHUF128: {
58899 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58900 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58901 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58902 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58903 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58904 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58905 Ops[0].getOperand(1), DAG, DL);
58906 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58907 Ops[1].getOperand(1), DAG, DL);
58908 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58909 DAG.getTargetConstant(Imm, DL, MVT::i8));
58910 }
58911 break;
58912 }
58913 case ISD::TRUNCATE:
58914 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58915 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58916 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58917 SrcVT == Ops[1].getOperand(0).getValueType() &&
58918 Subtarget.useAVX512Regs() &&
58919 Subtarget.getPreferVectorWidth() >= 512 &&
58920 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58921 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58922 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58923 ConcatSubOperand(NewSrcVT, Ops, 0));
58924 }
58925 }
58926 break;
58927 case ISD::ANY_EXTEND:
58928 case ISD::SIGN_EXTEND:
58929 case ISD::ZERO_EXTEND:
58930 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58931 if (!IsSplat && NumOps == 2 &&
58932 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58933 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58934 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58935 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58936 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58937 SrcVT == Ops[1].getOperand(0).getValueType()) {
58938 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58939 return DAG.getNode(Opcode, DL, VT,
58940 ConcatSubOperand(NewSrcVT, Ops, 0));
58941 }
58942 }
58943 break;
58944 case ISD::ANY_EXTEND_VECTOR_INREG:
58945 case ISD::SIGN_EXTEND_VECTOR_INREG:
58946 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58947 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58948 if (!IsSplat && NumOps == 2 &&
58949 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58950 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58951 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58953 Op0.getOperand(0).getValueType() ==
58954 Ops[0].getOperand(0).getValueType()) {
58955 EVT SrcVT = Op0.getOperand(0).getValueType();
58956 unsigned NumElts = VT.getVectorNumElements();
58957 MVT UnpackSVT =
58958 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58959 MVT UnpackVT =
58960 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58961 SDValue Unpack =
58962 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58963 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58964 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58965 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58966 DAG.getBitcast(SrcVT, Unpack), DAG);
58967 }
58968 break;
58969 }
58970 case X86ISD::VSHLI:
58971 case X86ISD::VSRLI:
58972 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58973 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58974 llvm::all_of(Ops, [](SDValue Op) {
58975 return Op.getConstantOperandAPInt(1) == 32;
58976 })) {
58977 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58978 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58979 Res = DAG.getBitcast(MVT::v8i32, Res);
58980 if (Opcode == X86ISD::VSHLI) {
58981 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58982 {8, 0, 8, 2, 8, 4, 8, 6});
58983 } else {
58984 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58985 {1, 8, 3, 8, 5, 8, 7, 8});
58986 }
58987 return DAG.getBitcast(VT, Res);
58988 }
58989 }
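// Illustrative note (editorial addition): a v4i64 shift-left by 32 only moves
// each low 32-bit half into the high half, so on AVX1 it is expressed as a
// v8i32 shuffle with zero, e.g. mask {8,0,8,2,8,4,8,6} for VSHLI (operand 1
// being the zero vector); VSRLI uses the mirrored mask.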
58990 [[fallthrough]];
58991 case X86ISD::VSRAI:
58992 case X86ISD::VSHL:
58993 case X86ISD::VSRL:
58994 case X86ISD::VSRA:
58995 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58996 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58997 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58998 llvm::all_of(Ops, [Op0](SDValue Op) {
58999 return Op0.getOperand(1) == Op.getOperand(1);
59000 })) {
59001 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59002 Op0.getOperand(1));
59003 }
59004 break;
59005 case X86ISD::VPERMI:
59006 case X86ISD::VROTLI:
59007 case X86ISD::VROTRI:
59008 if (!IsSplat &&
59009 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59010 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59011 llvm::all_of(Ops, [Op0](SDValue Op) {
59012 return Op0.getOperand(1) == Op.getOperand(1);
59013 })) {
59014 assert(!(Opcode == X86ISD::VPERMI &&
59015 Op0.getValueType().is128BitVector()) &&
59016 "Illegal 128-bit X86ISD::VPERMI nodes");
59017 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59018 Op0.getOperand(1));
59019 }
59020 break;
59021 case ISD::AND:
59022 case ISD::OR:
59023 case ISD::XOR:
59024 case X86ISD::ANDNP:
59025 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59026 if (!IsSplat && (VT.is256BitVector() ||
59027 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59028 // Don't concatenate root AVX1 NOT patterns.
59029 // TODO: Allow NOT folding if Concat0 succeeds.
59030 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59031 llvm::all_of(Ops, [](SDValue X) {
59032 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59033 }))
59034 break;
59035 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59036 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59037 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59038 return DAG.getNode(Opcode, DL, VT,
59039 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59040 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59041 }
59042 break;
59043 case X86ISD::PCMPEQ:
59044 case X86ISD::PCMPGT:
59045 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59046 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59047 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59048 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59049 if (Concat0 || Concat1)
59050 return DAG.getNode(Opcode, DL, VT,
59051 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59052 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59053 break;
59054 }
59055
59056 if (!IsSplat && VT == MVT::v8i32) {
59057 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59058 // TODO: Handle v4f64 as well?
59059 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59060 for (unsigned I = 0; I != NumOps; ++I) {
59061 MaxSigBitsLHS =
59062 std::max(MaxSigBitsLHS,
59063 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59064 MaxSigBitsRHS =
59065 std::max(MaxSigBitsRHS,
59066 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59067 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59068 break;
59069 }
59070
59071 ISD::CondCode ICC =
59072 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59073 ISD::CondCode FCC =
59074 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59075
59076 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59077 MVT FpVT = VT.changeVectorElementType(FpSVT);
59078
59079 if (std::optional<unsigned> CastOpc =
59080 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59081 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59082 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59083 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59084 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59085 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59086 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59087
59088 bool IsAlwaysSignaling;
59089 unsigned FSETCC =
59090 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59091 return DAG.getBitcast(
59092 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59093 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59094 }
59095 }
59096 break;
59097 case ISD::CTPOP:
59098 case ISD::CTTZ:
59099 case ISD::CTLZ:
59100 case ISD::CTTZ_ZERO_UNDEF:
59101 case ISD::CTLZ_ZERO_UNDEF:
59102 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59103 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59104 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59105 }
59106 break;
59107 case X86ISD::GF2P8AFFINEQB:
59108 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59109 if (!IsSplat &&
59110 (VT.is256BitVector() ||
59111 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59112 llvm::all_of(Ops, [Op0](SDValue Op) {
59113 return Op0.getOperand(2) == Op.getOperand(2);
59114 })) {
59115 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59116 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59117 }
59118 break;
59119 case ISD::ADD:
59120 case ISD::SUB:
59121 case ISD::MUL:
59122 // TODO: Add more integer binops?
59123 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59124 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59125 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59126 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59127 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59128 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59129 return Op.getOperand(0) == Op.getOperand(1);
59130 }))
59131 return DAG.getNode(Opcode, DL, VT,
59132 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59133 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59134 }
59135 break;
59136 // VADD, VSUB and VMUL can execute on more ports than VINSERT and have
59137 // short latency, so only concatenate them here when doing so won't
59138 // introduce an extra VINSERT.
59139 case ISD::FADD:
59140 case ISD::FSUB:
59141 case ISD::FMUL:
59142 if (!IsSplat && (VT.is256BitVector() ||
59143 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59144 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59145 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59146 if (Concat0 || Concat1)
59147 return DAG.getNode(Opcode, DL, VT,
59148 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59149 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59150 }
59151 break;
59152 // Always prefer to concatenate high latency FDIV instructions.
59153 case ISD::FDIV:
59154 if (!IsSplat && (VT.is256BitVector() ||
59155 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59156 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59157 ConcatSubOperand(VT, Ops, 1));
59158 }
59159 break;
59160 case X86ISD::HADD:
59161 case X86ISD::HSUB:
59162 case X86ISD::FHADD:
59163 case X86ISD::FHSUB:
59164 if (!IsSplat && VT.is256BitVector() &&
59165 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59166 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59167 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59168 if (Concat0 || Concat1)
59169 return DAG.getNode(Opcode, DL, VT,
59170 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59171 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59172 }
59173 break;
59174 case X86ISD::PACKSS:
59175 case X86ISD::PACKUS:
59176 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59177 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59178 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59179 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59180 NumOps * SrcVT.getVectorNumElements());
59181 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59182 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59183 if (Concat0 || Concat1)
59184 return DAG.getNode(
59185 Opcode, DL, VT,
59186 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59187 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59188 }
59189 break;
59190 case X86ISD::VSHLD:
59191 case X86ISD::VSHRD:
59192 case X86ISD::PALIGNR:
59193 if (!IsSplat &&
59194 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59195 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59196 llvm::all_of(Ops, [Op0](SDValue Op) {
59197 return Op0.getOperand(2) == Op.getOperand(2);
59198 })) {
59199 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59200 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59201 if (Concat0 || Concat1)
59202 return DAG.getNode(Opcode, DL, VT,
59203 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59204 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59205 Op0.getOperand(2));
59206 }
59207 break;
59208 case X86ISD::BLENDI:
59209 if (VT.is256BitVector() && NumOps == 2 &&
59210 (EltSizeInBits >= 32 ||
59211 (Subtarget.hasInt256() &&
59212 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59213 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59214 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59215 if (Concat0 || Concat1) {
59216 unsigned NumElts = VT.getVectorNumElements();
59217 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59218 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59219 Mask = Mask.zextOrTrunc(8);
59220 return DAG.getNode(Opcode, DL, VT,
59221 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59222 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59223 DAG.getTargetConstant(Mask, DL, MVT::i8));
59224 }
59225 }
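// Illustrative example (editorial addition): blending two v4i32 halves with
// 4-bit immediates 0b0101 and 0b0011 concatenates above into one 256-bit
// BLENDI with the 8-bit immediate 0b00110101, i.e. each half's blend bits
// occupy its own half of the wider mask.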
59226 // TODO: BWI targets should only use CombineSubOperand.
59227 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59228 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59229 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59230 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59231 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59232 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59233 unsigned NumElts = VT.getVectorNumElements();
59234 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59235 for (unsigned I = 1; I != NumOps; ++I)
59236 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59237 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59238 Mask = Mask.zextOrTrunc(NumMaskBits);
59239 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59240 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59241 SDValue Sel =
59242 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59243 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59244 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59245 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59246 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59247 }
59248 }
59249 break;
59250 case ISD::VSELECT:
59251 // TODO: VSELECT should use CombineSubOperand.
59252 if (!IsSplat && Subtarget.hasAVX512() &&
59253 (VT.is256BitVector() ||
59254 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59255 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59256 EVT SelVT = Ops[0].getOperand(0).getValueType();
59257 if (SelVT.getVectorElementType() == MVT::i1) {
59258 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59259 NumOps * SelVT.getVectorNumElements());
59260 if (TLI.isTypeLegal(SelVT))
59261 return DAG.getNode(
59262 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59263 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59264 }
59265 }
59266 [[fallthrough]];
59267 case X86ISD::BLENDV:
59268 // TODO: BLENDV should use CombineSubOperand.
59269 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59270 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59271 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59272 EVT SelVT = Ops[0].getOperand(0).getValueType();
59273 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59274 if (TLI.isTypeLegal(SelVT))
59275 return DAG.getNode(
59276 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59277 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59278 }
59279 break;
59280 }
59281 }
59282
59283 // Fold subvector loads into one.
59284 // If needed, look through bitcasts to get to the load.
59285 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59286 unsigned Fast;
59287 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59288 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59289 *FirstLd->getMemOperand(), &Fast) &&
59290 Fast) {
59291 if (SDValue Ld =
59292 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59293 return Ld;
59294 }
59295 }
59296
59297 // Attempt to fold target constant loads.
59298 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59299 SmallVector<APInt> EltBits;
59300 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59301 for (unsigned I = 0; I != NumOps; ++I) {
59302 APInt OpUndefElts;
59303 SmallVector<APInt> OpEltBits;
59304 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59305 OpEltBits, /*AllowWholeUndefs*/ true,
59306 /*AllowPartialUndefs*/ false))
59307 break;
59308 EltBits.append(OpEltBits);
59309 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59310 }
59311 if (EltBits.size() == VT.getVectorNumElements()) {
59312 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59313 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59314 SDValue CV = DAG.getConstantPool(C, PVT);
59315 MachinePointerInfo MPI =
59316 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59317 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59318 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59320 return Ld;
59321 }
59322 }
59323
59324 // If this simple subvector or scalar/subvector broadcast_load is inserted
59325 // into both halves, use a larger broadcast_load. Update other uses to use
59326 // an extracted subvector.
59327 if (IsSplat &&
59328 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59329 if (ISD::isNormalLoad(Op0.getNode()) ||
59330 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59331 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59332 auto *Mem = cast<MemSDNode>(Op0);
59333 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59334 ? X86ISD::VBROADCAST_LOAD
59335 : X86ISD::SUBV_BROADCAST_LOAD;
59336 if (SDValue BcastLd =
59337 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59338 SDValue BcastSrc =
59339 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59340 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59341 return BcastLd;
59342 }
59343 }
59344 }
59345
59346 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59347 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59348 Subtarget.useAVX512Regs()) {
59349 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59350 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59351 Res = DAG.getBitcast(ShuffleVT, Res);
59352 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59353 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59354 return DAG.getBitcast(VT, Res);
59355 }
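// Illustrative note (editorial addition): for a 128-bit subvector X splatted
// to 512 bits, X is first widened into lane 0 of a 512-bit vector and SHUF128
// with mask {0,0,0,0} then replicates that lane into all four 128-bit lanes,
// replacing a chain of insert_subvector nodes with a single shuffle.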
59356
59357 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59358 if (!IsSplat &&
59359 ((NumOps == 2 && VT == MVT::v4f64) ||
59360 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59361 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59362 // Collect the individual per-lane v2f64/v4f64 shuffles.
59363 MVT OpVT = Ops[0].getSimpleValueType();
59364 unsigned NumOpElts = OpVT.getVectorNumElements();
59367 if (all_of(seq<int>(NumOps), [&](int I) {
59368 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59369 Depth + 1) &&
59370 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59371 none_of(SrcMasks[I], isUndefOrZero) &&
59372 SrcMasks[I].size() == NumOpElts &&
59373 all_of(SrcOps[I], [&OpVT](SDValue V) {
59374 return V.getValueType() == OpVT;
59375 });
59376 })) {
59377 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59378 bool Unary = true;
59379 unsigned SHUFPDMask = 0;
59381 for (unsigned I = 0; I != NumOps; ++I) {
59382 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59383 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59384 Unary &= LHS[I] == RHS[I];
59385 for (unsigned J = 0; J != NumOpElts; ++J)
59386 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59387 }
59388 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59389 // PERMILPD mask and we can always profitably concatenate them.
59390 SDValue Concat0 =
59391 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59392 SDValue Concat1 =
59393 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59394 if (Unary || Concat0 || Concat1) {
59395 Concat0 =
59396 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59397 Concat1 =
59398 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59399 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59400 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59401 }
59402 }
59403 }
59404
59405 return SDValue();
59406}
59407
59408 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59409 TargetLowering::DAGCombinerInfo &DCI,
59410 const X86Subtarget &Subtarget) {
59411 EVT VT = N->getValueType(0);
59412 EVT SrcVT = N->getOperand(0).getValueType();
59413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59414 SmallVector<SDValue, 4> Ops(N->ops());
59415
59416 if (VT.getVectorElementType() == MVT::i1) {
59417 // Attempt to constant fold.
59418 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59419 APInt Constant = APInt::getZero(VT.getSizeInBits());
59420 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59422 if (!C) break;
59423 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59424 if (I == (E - 1)) {
59425 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59426 if (TLI.isTypeLegal(IntVT))
59427 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59428 }
59429 }
59430
59431 // Don't do anything else for i1 vectors.
59432 return SDValue();
59433 }
59434
59435 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59436 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59437 Subtarget))
59438 return R;
59439 }
59440
59441 return SDValue();
59442}
59443
59444 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59445 TargetLowering::DAGCombinerInfo &DCI,
59446 const X86Subtarget &Subtarget) {
59447 if (DCI.isBeforeLegalizeOps())
59448 return SDValue();
59449
59450 MVT OpVT = N->getSimpleValueType(0);
59451
59452 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59453
59454 SDLoc dl(N);
59455 SDValue Vec = N->getOperand(0);
59456 SDValue SubVec = N->getOperand(1);
59457
59458 uint64_t IdxVal = N->getConstantOperandVal(2);
59459 MVT SubVecVT = SubVec.getSimpleValueType();
59460 int VecNumElts = OpVT.getVectorNumElements();
59461 int SubVecNumElts = SubVecVT.getVectorNumElements();
59462
59463 if (Vec.isUndef() && SubVec.isUndef())
59464 return DAG.getUNDEF(OpVT);
59465
59466 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59467 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59468 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59469 return getZeroVector(OpVT, Subtarget, DAG, dl);
59470
59471 if (Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) {
59472 // If we're inserting into a zero vector and then into a larger zero vector,
59473 // just insert into the larger zero vector directly.
59474 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59475 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59476 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59477 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59478 getZeroVector(OpVT, Subtarget, DAG, dl),
59479 SubVec.getOperand(1),
59480 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59481 }
59482
59483 // If we're inserting into a zero vector and our input was extracted from an
59484 // insert into a zero vector of the same type and the extraction was at
59485 // least as large as the original insertion, just insert the original
59486 // subvector into a zero vector.
59487 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59488 isNullConstant(SubVec.getOperand(1)) &&
59489 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59490 SDValue Ins = SubVec.getOperand(0);
59491 if (isNullConstant(Ins.getOperand(2)) &&
59492 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59493 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59494 SubVecVT.getFixedSizeInBits())
59495 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59496 getZeroVector(OpVT, Subtarget, DAG, dl),
59497 Ins.getOperand(1), N->getOperand(2));
59498 }
59499 }
59500
59501 // Stop here if this is an i1 vector.
59502 if (IsI1Vector)
59503 return SDValue();
59504
59505 // Eliminate an intermediate vector widening:
59506 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59507 // insert_subvector X, Y, Idx
59508 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59509 // there?
59510 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59511 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59512 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59513 SubVec.getOperand(1), N->getOperand(2));
59514
59515 // If this is an insert of an extract, combine to a shuffle. Don't do this
59516 // if the insert or extract can be represented with a subregister operation.
59517 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59518 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59519 (IdxVal != 0 ||
59520 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59521 SDValue ExtSrc = SubVec.getOperand(0);
59522 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59523 // Create a shuffle mask matching the extraction and insertion.
59524 SmallVector<int, 64> Mask(VecNumElts);
59525 std::iota(Mask.begin(), Mask.end(), 0);
59526 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59527 ExtIdxVal + VecNumElts);
59528 if (ExtIdxVal != 0)
59529 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59530 // See if we can use a blend instead of extract/insert pair.
59531 SmallVector<int, 64> BlendMask(VecNumElts);
59532 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59533 std::iota(BlendMask.begin() + IdxVal,
59534 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59535 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59536 VecNumElts == (2 * SubVecNumElts)) {
59537 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59538 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59539 SDValue Blend = DAG.getNode(
59540 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59541 DAG.getBitcast(MVT::v8f32, ExtSrc),
59542 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59543 return DAG.getBitcast(OpVT, Blend);
59544 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59545 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59546 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59547 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59548 SDValue Shuffle =
59549 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59550 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59551 return DAG.getBitcast(OpVT, Shuffle);
59552 }
59553 }
59554 }
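// Illustrative note (editorial addition): when the shuffle-equivalence check
// above holds, inserting a 128-bit subvector taken from ExtSrc into half of a
// 256-bit Vec is emitted as one v8f32 BLENDI of Vec and ExtSrc (immediate 0x0F
// for the low half, 0xF0 for the high half) rather than an extract/insert
// pair; the 512/256-bit case uses SHUF128 the same way.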
59555
59556 // Match concat_vector style patterns.
59557 SmallVector<SDValue, 2> SubVectorOps;
59558 if (collectConcatOps(N, SubVectorOps, DAG)) {
59559 if (SDValue Fold =
59560 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59561 return Fold;
59562
59563 // If we're inserting all zeros into the upper half, change this to
59564 // a concat with zero. We will match this to a move
59565 // with implicit upper bit zeroing during isel.
59566 // We do this here because we don't want combineConcatVectorOps to
59567 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59568 if (SubVectorOps.size() == 2 &&
59569 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59570 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59571 getZeroVector(OpVT, Subtarget, DAG, dl),
59572 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59573
59574 // Attempt to recursively combine to a shuffle.
59575 if (all_of(SubVectorOps, [](SDValue SubOp) {
59577 })) {
59578 SDValue Op(N, 0);
59579 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59580 return Res;
59581 }
59582 }
59583
59584 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59585 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59586 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59587
59588 // If this is a broadcast load inserted into an upper undef, use a larger
59589 // broadcast load.
59590 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59591 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59592 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59593 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59594 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59595 }
59596
59597 // If we're splatting the lower half subvector of a full vector load into the
59598 // upper half, attempt to create a subvector broadcast.
59599 if ((int)IdxVal == (VecNumElts / 2) &&
59600 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59601 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59602 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59603 if (VecLd && SubLd &&
59604 DAG.areNonVolatileConsecutiveLoads(
59605 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59606 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59607 SubVecVT, SubLd, 0, DAG);
59608 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59609 BcastLd, DAG.getVectorIdxConstant(0, dl));
59610 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59611 return BcastLd;
59612 }
59613 }
59614
59615 // Attempt to constant fold (if we're not widening).
59616 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59617 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59618 APInt VecUndefElts, SubUndefElts;
59619 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59620 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59621 VecEltBits) &&
59622 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59623 SubEltBits)) {
59624 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59625 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59626 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59627 }
59628 }
59629
59630 // Attempt to recursively combine to a shuffle.
59633 SDValue Op(N, 0);
59634 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59635 return Res;
59636 }
59637
59638 // Match insertion of subvector load that perfectly aliases a base load.
59639 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59640 ISD::isNormalLoad(SubVec.getNode()) &&
59641 DAG.areNonVolatileConsecutiveLoads(
59642 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59643 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59644 return Vec;
59645
59646 return SDValue();
59647}
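// Illustrative example of the insert-of-extract shuffle fold above, with
// hypothetical v8f32 values X and Y (the names are illustrative only):
//   insert_subvector X, (extract_subvector Y, 4), 0
// is rewritten as
//   vector_shuffle<12,13,14,15,4,5,6,7> X, Y
// i.e. lanes 0-3 come from Y[4..7] and lanes 4-7 keep X[4..7], avoiding an
// explicit extract/insert pair and letting the generic shuffle combines take
// over.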
59648
59649/// If we are extracting a subvector of a vector select and the select condition
59650/// is composed of concatenated vectors, try to narrow the select width. This
59651/// is a common pattern for AVX1 integer code because 256-bit selects may be
59652/// legal, but there is almost no integer math/logic available for 256-bit.
59653/// This function should only be called with legal types (otherwise, the calls
59654/// to get simple value types will assert).
59655 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59656 SelectionDAG &DAG) {
59657 SDValue Sel = Ext->getOperand(0);
59658 if (Sel.getOpcode() != ISD::VSELECT ||
59659 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59660 return SDValue();
59661
59662 // Note: We assume simple value types because this should only be called with
59663 // legal operations/types.
59664 // TODO: This can be extended to handle extraction to 256-bits.
59665 MVT VT = Ext->getSimpleValueType(0);
59666 if (!VT.is128BitVector())
59667 return SDValue();
59668
59669 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59670 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59671 return SDValue();
59672
59673 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59674 MVT SelVT = Sel.getSimpleValueType();
59675 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59676 "Unexpected vector type with legal operations");
59677
59678 unsigned SelElts = SelVT.getVectorNumElements();
59679 unsigned CastedElts = WideVT.getVectorNumElements();
59680 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59681 if (SelElts % CastedElts == 0) {
59682 // The select has the same or more (narrower) elements than the extract
59683 // operand. The extraction index gets scaled by that factor.
59684 ExtIdx *= (SelElts / CastedElts);
59685 } else if (CastedElts % SelElts == 0) {
59686 // The select has fewer (wider) elements than the extract operand. Make sure
59687 // that the extraction index can be divided evenly.
59688 unsigned IndexDivisor = CastedElts / SelElts;
59689 if (ExtIdx % IndexDivisor != 0)
59690 return SDValue();
59691 ExtIdx /= IndexDivisor;
59692 } else {
59693 llvm_unreachable("Element count of simple vector types are not divisible?");
59694 }
59695
59696 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59697 unsigned NarrowElts = SelElts / NarrowingFactor;
59698 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59699 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59700 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59701 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59702 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59703 return DAG.getBitcast(VT, NarrowSel);
59704}
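// Illustrative sketch of narrowExtractedVectorSelect on an AVX1 target, with
// hypothetical nodes (names are illustrative only):
//   t0: v8i32 = vselect t_cond, t_a, t_b      ; 256-bit integer select
//   t1: v4i32 = extract_subvector t0, 4
// becomes, when t_cond is cheap to split,
//   t1: v4i32 = vselect (extract_subvector t_cond, 4),
//                       (extract_subvector t_a, 4),
//                       (extract_subvector t_b, 4)
// so the select is performed on a legal 128-bit integer type.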
59705
59706 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59707 TargetLowering::DAGCombinerInfo &DCI,
59708 const X86Subtarget &Subtarget) {
59709 if (!N->getValueType(0).isSimple())
59710 return SDValue();
59711
59712 MVT VT = N->getSimpleValueType(0);
59713 SDValue InVec = N->getOperand(0);
59714 unsigned IdxVal = N->getConstantOperandVal(1);
59715 EVT InVecVT = InVec.getValueType();
59716 unsigned SizeInBits = VT.getSizeInBits();
59717 unsigned InSizeInBits = InVecVT.getSizeInBits();
59718 unsigned NumSubElts = VT.getVectorNumElements();
59719 unsigned NumInElts = InVecVT.getVectorNumElements();
59720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59721 SDLoc DL(N);
59722
59723 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59724 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59725 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59726 // We let generic combining take over from there to simplify the
59727 // insert/extract and 'not'.
59728 // This pattern emerges during AVX1 legalization. We handle it before lowering
59729 // to avoid complications like splitting constant vector loads.
59730 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59731 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59732 auto isConcatenatedNot = [](SDValue V) {
59733 V = peekThroughBitcasts(V);
59734 if (!isBitwiseNot(V))
59735 return false;
59736 SDValue NotOp = V->getOperand(0);
59737 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59738 };
59739 if (isConcatenatedNot(InVec.getOperand(0)) ||
59740 isConcatenatedNot(InVec.getOperand(1))) {
59741 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59743 splitVectorIntBinary(InVec, DAG, DL),
59744 N->getOperand(1));
59745 }
59746 }
59747
59748 if (DCI.isBeforeLegalizeOps())
59749 return SDValue();
59750
59751 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59752 return V;
59753
59754 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59755 return getZeroVector(VT, Subtarget, DAG, DL);
59756
59757 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59758 if (VT.getScalarType() == MVT::i1)
59759 return DAG.getConstant(1, DL, VT);
59760 return getOnesVector(VT, DAG, DL);
59761 }
59762
59763 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59764 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59765
59766 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59767 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59768 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59769 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59770 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59771 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59772 }
59773
59774 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59775 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59776 // iff SUB is entirely contained in the extraction.
59777 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59778 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59779 SDValue Src = InVec.getOperand(0);
59780 SDValue Sub = InVec.getOperand(1);
59781 EVT SubVT = Sub.getValueType();
59782 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59783 if (IdxVal <= InsIdx &&
59784 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59785 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59786 DAG.getVectorIdxConstant(IdxVal, DL));
59787 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59788 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59789 }
59790 }
59791
59792 // If we're extracting an upper subvector, see if we'd get the same elements
59793 // if we extracted the lowest subvector instead, which should allow
59794 // SimplifyDemandedVectorElts to do more simplifications.
59795 if (IdxVal != 0) {
59796 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59797 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59798 });
59799 if (AllEquiv)
59800 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59801 }
59802
59803 // Check if we're extracting a whole broadcasted subvector.
59804 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59805 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59806 EVT MemVT = MemIntr->getMemoryVT();
59807 if (MemVT == VT) {
59808 // If this is the only use, we can replace with a regular load (this may
59809 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59810 // memory chain).
59811 if (InVec.hasOneUse()) {
59812 SDValue Ld =
59813 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59814 MemIntr->getMemOperand());
59815 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59816 return Ld;
59817 }
59818 }
59819 }
59820
59821 // Attempt to extract from the source of a shuffle vector.
59822 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59823 SmallVector<int, 32> ShuffleMask;
59824 SmallVector<int, 32> ScaledMask;
59825 SmallVector<SDValue, 2> ShuffleInputs;
59826 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59827 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59828 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59829 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59830 unsigned SubVecIdx = IdxVal / NumSubElts;
59831 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59832 return DAG.getUNDEF(VT);
59833 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59834 return getZeroVector(VT, Subtarget, DAG, DL);
59835 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59836 if (Src.getValueSizeInBits() == InSizeInBits) {
59837 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59838 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59839 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59840 DL, SizeInBits);
59841 }
59842 }
59843 }
59844
59845 auto IsExtractFree = [](SDValue V) {
59846 if (V.hasOneUse()) {
59847 V = peekThroughOneUseBitcasts(V);
59848 if (V.getOpcode() == ISD::LOAD)
59849 return true;
59850 }
59851 V = peekThroughBitcasts(V);
59852 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59853 return true;
59855 return true;
59856 return V.isUndef();
59857 };
59858
59859 // If we're extracting the lowest subvector and we're the only user,
59860 // we may be able to perform this with a smaller vector width.
59861 unsigned InOpcode = InVec.getOpcode();
59862 if (InVec.hasOneUse()) {
59863 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59864 // v2f64 CVTDQ2PD(v4i32).
59865 if (InOpcode == ISD::SINT_TO_FP &&
59866 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59867 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59868 }
59869 // v2f64 CVTUDQ2PD(v4i32).
59870 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59871 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59872 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59873 }
59874 // v2f64 CVTPS2PD(v4f32).
59875 if (InOpcode == ISD::FP_EXTEND &&
59876 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59877 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59878 }
59879 }
59880 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59881 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59882 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59883 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59884 Subtarget.hasVLX())) &&
59885 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59886 SDValue Src = InVec.getOperand(0);
59887 if (Src.getValueType().getScalarSizeInBits() == 32)
59888 return DAG.getNode(InOpcode, DL, VT,
59889 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59890 }
59891 if (IdxVal == 0 &&
59892 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59893 (SizeInBits == 128 || SizeInBits == 256) &&
59894 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59895 SDValue Ext = InVec.getOperand(0);
59896 if (Ext.getValueSizeInBits() > SizeInBits)
59897 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59898 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59899 return DAG.getNode(ExtOp, DL, VT, Ext);
59900 }
59901 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59902 InVec.getOperand(0).getValueType().is256BitVector() &&
59903 InVec.getOperand(1).getValueType().is256BitVector() &&
59904 InVec.getOperand(2).getValueType().is256BitVector()) {
59905 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59906 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59907 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59908 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59909 }
59910 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59911 (SizeInBits == 128 || SizeInBits == 256)) {
59912 SDValue InVecSrc = InVec.getOperand(0);
59913 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59914 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59915 return DAG.getNode(InOpcode, DL, VT, Ext);
59916 }
59917
59918 if (SizeInBits == 128 || SizeInBits == 256) {
59919 switch (InOpcode) {
59920 case X86ISD::MOVDDUP:
59921 return DAG.getNode(
59922 InOpcode, DL, VT,
59923 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59924 case X86ISD::PSHUFD:
59925 case X86ISD::VPERMILPI:
59926 if (InVec.getOperand(0).hasOneUse()) {
59927 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59928 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59929 return DAG.getNode(InOpcode, DL, VT,
59930 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59931 DL, SizeInBits),
59932 DAG.getTargetConstant(M, DL, MVT::i8));
59933 }
59934 break;
59935 case X86ISD::PCMPEQ:
59936 case X86ISD::PCMPGT:
59937 case X86ISD::UNPCKH:
59938 case X86ISD::UNPCKL:
59939 if (IsExtractFree(InVec.getOperand(0)) ||
59940 IsExtractFree(InVec.getOperand(1)))
59941 return DAG.getNode(InOpcode, DL, VT,
59942 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59943 DL, SizeInBits),
59944 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59945 DL, SizeInBits));
59946 break;
59947 case X86ISD::CMPP:
59948 if (IsExtractFree(InVec.getOperand(0)) ||
59949 IsExtractFree(InVec.getOperand(1)))
59950 return DAG.getNode(InOpcode, DL, VT,
59951 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59952 DL, SizeInBits),
59953 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59954 DL, SizeInBits),
59955 InVec.getOperand(2));
59956 break;
59957 case X86ISD::BLENDI:
59958 if (IsExtractFree(InVec.getOperand(0)) ||
59959 IsExtractFree(InVec.getOperand(1))) {
59960 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59961 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59962 return DAG.getNode(InOpcode, DL, VT,
59963 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59964 DL, SizeInBits),
59965 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59966 DL, SizeInBits),
59967 DAG.getTargetConstant(M, DL, MVT::i8));
59968 }
59969 break;
59970 case X86ISD::VPERMV:
59971 if (IdxVal != 0) {
59972 SDValue Mask = InVec.getOperand(0);
59973 SDValue Src = InVec.getOperand(1);
59974 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59975 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59976 DL, InSizeInBits);
59977 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59978 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59979 }
59980 break;
59981 case X86ISD::VPERMV3:
59982 if (IdxVal != 0) {
59983 SDValue Src0 = InVec.getOperand(0);
59984 SDValue Mask = InVec.getOperand(1);
59985 SDValue Src1 = InVec.getOperand(2);
59986 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59987 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59988 DL, InSizeInBits);
59989 SDValue Shuffle =
59990 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59991 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59992 }
59993 break;
59994 }
59995 }
59996 }
59997
59998 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
59999 // as this is very likely to fold into a shuffle/truncation.
60000 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60001 InVecVT.getScalarSizeInBits() == 64 &&
60002 InVec.getConstantOperandAPInt(1) == 32) {
60003 SDValue Ext =
60004 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60005 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60006 }
60007
60008 return SDValue();
60009}
60010
60011 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60012 const X86Subtarget &Subtarget) {
60013 using namespace SDPatternMatch;
60014 EVT VT = N->getValueType(0);
60015 SDValue Src = N->getOperand(0);
60016 SDLoc DL(N);
60017
60018 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60019 // This occurs frequently in our masked scalar intrinsic code and our
60020 // floating point select lowering with AVX512.
60021 // TODO: SimplifyDemandedBits instead?
60022 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60023 isOneConstant(Src.getOperand(1)))
60024 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60025
60026 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60027 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60028 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60029 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60030 isNullConstant(Src.getOperand(1)))
60031 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60032 Src.getOperand(1));
60033
60034 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60035 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60036 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60037 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60038 if (Op.getValueType() != MVT::i64)
60039 return SDValue();
60040 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60041 if (Op.getOpcode() == Opc &&
60042 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60043 return Op.getOperand(0);
60044 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60045 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60046 if (Ld->getExtensionType() == Ext &&
60047 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60048 return Op;
60049 if (IsZeroExt) {
60050 KnownBits Known = DAG.computeKnownBits(Op);
60051 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60052 return Op;
60053 }
60054 return SDValue();
60055 };
60056
60057 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60058 return DAG.getBitcast(
60059 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60060 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60061
60062 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60063 return DAG.getBitcast(
60064 VT,
60065 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60066 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60067 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60068 }
60069
60070 if (Src.getOpcode() == ISD::BITCAST) {
60071 SDValue SrcOp = Src.getOperand(0);
60072 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60073 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60074 return DAG.getBitcast(
60075 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60076 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60077 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60078 return DAG.getBitcast(
60079 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60080 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60081 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60082 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60083 }
60084
60085 if (VT == MVT::v4i32) {
60086 SDValue HalfSrc;
60087 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60088 // to remove XMM->GPR->XMM moves.
60089 if (sd_match(Src, m_AnyExt(m_BitCast(
60090 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60091 return DAG.getBitcast(
60092 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60093 }
60094
60095 // See if we're broadcasting the scalar value, in which case just reuse that.
60096 // Ensure the same SDValue from the SDNode use is being used.
60097 if (VT.getScalarType() == Src.getValueType())
60098 for (SDNode *User : Src->users())
60099 if (User->getOpcode() == X86ISD::VBROADCAST &&
60100 Src == User->getOperand(0)) {
60101 unsigned SizeInBits = VT.getFixedSizeInBits();
60102 unsigned BroadcastSizeInBits =
60103 User->getValueSizeInBits(0).getFixedValue();
60104 if (BroadcastSizeInBits == SizeInBits)
60105 return SDValue(User, 0);
60106 if (BroadcastSizeInBits > SizeInBits)
60107 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60108 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60109 // coverage.
60110 }
60111
60112 // Check for cases where we've ended up with a scalarized shift, typically
60113 // during type legalization.
60114 switch (Src.getOpcode()) {
60115 case ISD::SHL:
60116 case ISD::SRL:
60117 case ISD::SRA:
60118 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60119 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60120 Src.hasOneUse()) {
60121 SDValue SrcVec =
60122 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60123 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60124 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60125 Amt->getZExtValue(), DAG);
60126 }
60127 }
60128 break;
60129 case ISD::FSHL:
60130 case ISD::FSHR:
60131 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60132 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60133 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60134 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60135 Src.hasOneUse()) {
60136 uint64_t AmtVal =
60137 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60138 SDValue SrcVec0 =
60139 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60140 SDValue SrcVec1 =
60141 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60142 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60143 DAG.getConstant(AmtVal, DL, VT));
60144 }
60145 }
60146 break;
60147 }
60148
60149 return SDValue();
60150}
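// Illustrative sketch of the zero-extend reduction above, with hypothetical
// nodes (names are illustrative only):
//   t0: v2i64 = scalar_to_vector (i64 zero_extend (i32 t1))
// becomes
//   t2: v4i32 = scalar_to_vector t1
//   t3: v4i32 = X86ISD::VZEXT_MOVL t2
//   t0: v2i64 = bitcast t3
// which can typically be selected as a 32-bit MOVD instead of a 64-bit
// GPR-to-XMM move.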
60151
60152// Simplify PMULDQ and PMULUDQ operations.
60153 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60154 TargetLowering::DAGCombinerInfo &DCI,
60155 const X86Subtarget &Subtarget) {
60156 SDValue LHS = N->getOperand(0);
60157 SDValue RHS = N->getOperand(1);
60158
60159 // Canonicalize constant to RHS.
60160 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60161 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60162 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60163
60164 // Multiply by zero.
60165 // Don't return RHS as it may contain UNDEFs.
60166 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60167 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60168
60169 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60171 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60172 return SDValue(N, 0);
60173
60174 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60175 // convert it to any_extend_invec, due to the LegalOperations check, do the
60176 // conversion directly to a vector shuffle manually. This exposes combine
60177 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60178 // combineX86ShufflesRecursively on SSE4.1 targets.
60179 // FIXME: This is basically a hack around several other issues related to
60180 // ANY_EXTEND_VECTOR_INREG.
60181 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60182 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60183 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60184 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60185 SDLoc dl(N);
60186 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60187 LHS.getOperand(0), { 0, -1, 1, -1 });
60188 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60189 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60190 }
60191 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60192 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60193 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60194 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60195 SDLoc dl(N);
60196 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60197 RHS.getOperand(0), { 0, -1, 1, -1 });
60198 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60199 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60200 }
60201
60202 return SDValue();
60203}
60204
60205// Simplify VPMADDUBSW/VPMADDWD operations.
60206 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60207 TargetLowering::DAGCombinerInfo &DCI) {
60208 MVT VT = N->getSimpleValueType(0);
60209 SDValue LHS = N->getOperand(0);
60210 SDValue RHS = N->getOperand(1);
60211 unsigned Opc = N->getOpcode();
60212 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60214 "Unexpected PMADD opcode");
60215
60216 // Multiply by zero.
60217 // Don't return LHS/RHS as it may contain UNDEFs.
60218 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60219 ISD::isBuildVectorAllZeros(RHS.getNode()))
60220 return DAG.getConstant(0, SDLoc(N), VT);
60221
60222 // Constant folding.
60223 APInt LHSUndefs, RHSUndefs;
60224 SmallVector<APInt> LHSBits, RHSBits;
60225 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60226 unsigned DstEltBits = VT.getScalarSizeInBits();
60227 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60228 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60229 SmallVector<APInt> Result;
60230 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60231 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60232 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60233 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60234 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60235 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60236 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60237 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60238 Result.push_back(Res);
60239 }
60240 return getConstVector(Result, VT, DAG, SDLoc(N));
60241 }
60242
60243 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60244 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60245 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60246 return SDValue(N, 0);
60247
60248 return SDValue();
60249}
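// Worked example of the constant folding above for VPMADDWD, using
// hypothetical i16 lanes LHS = {1, 2, 3, 4} and RHS = {10, 20, 30, 40}:
//   Res[0] = sext(1)*sext(10) + sext(2)*sext(20) = 10 + 40  = 50
//   Res[1] = sext(3)*sext(30) + sext(4)*sext(40) = 90 + 160 = 250
// For VPMADDUBSW the left operand is zero-extended from i8, the right operand
// is sign-extended, and the two products are combined with a signed
// saturating add.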
60250
60251// Simplify VPMADD52L/VPMADD52H operations.
60252 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60253 TargetLowering::DAGCombinerInfo &DCI) {
60254 MVT VT = N->getSimpleValueType(0);
60255
60256 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60257 SDValue Op0 = N->getOperand(0);
60258 SDValue Op1 = N->getOperand(1);
60259 SDValue Op2 = N->getOperand(2);
60260 SDLoc DL(N);
60261
60262 APInt C0, C1;
60263 bool HasC0 = X86::isConstantSplat(Op0, C0),
60264 HasC1 = X86::isConstantSplat(Op1, C1);
60265
60266 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60267 if (HasC0 && !HasC1)
60268 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60269
60270 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60271 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60272 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60273 if (KnownOp0.countMinLeadingZeros() >= 12)
60274 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60275 }
60276
60277 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60278 unsigned NumEltBits = VT.getScalarSizeInBits();
60279 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60280 DCI))
60281 return SDValue(N, 0);
60282
60283 return SDValue();
60284}
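// Reasoning sketch for the lo(X * 1) + Z fold above: VPMADD52L adds the low
// 52 bits of the 52x52-bit product to the accumulator. If X is known to have
// at least 12 leading zero bits in its 64-bit lane, then X < 2^52, so X * 1
// fits in 52 bits and lo52(X * 1) == X, which makes the plain ADD equivalent.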
60285
60286 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60287 TargetLowering::DAGCombinerInfo &DCI,
60288 const X86Subtarget &Subtarget) {
60289 EVT VT = N->getValueType(0);
60290 SDValue In = N->getOperand(0);
60291 unsigned Opcode = N->getOpcode();
60292 unsigned InOpcode = In.getOpcode();
60293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60294 SDLoc DL(N);
60295
60296 // Try to merge vector loads and extend_inreg to an extload.
60297 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60298 In.hasOneUse()) {
60299 auto *Ld = cast<LoadSDNode>(In);
60300 if (Ld->isSimple()) {
60301 MVT SVT = In.getSimpleValueType().getVectorElementType();
60302 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60303 ? ISD::SEXTLOAD
60304 : ISD::ZEXTLOAD;
60305 EVT MemVT = VT.changeVectorElementType(SVT);
60306 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60307 SDValue Load = DAG.getExtLoad(
60308 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60309 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60310 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60311 return Load;
60312 }
60313 }
60314 }
60315
60316 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60317 if (Opcode == InOpcode)
60318 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60319
60320 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60321 // -> EXTEND_VECTOR_INREG(X).
60322 // TODO: Handle non-zero subvector indices.
60323 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60324 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60325 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60326 In.getValueSizeInBits())
60327 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60328
60329 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60330 // TODO: Move to DAGCombine?
60331 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60332 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60333 In.getValueSizeInBits() == VT.getSizeInBits()) {
60334 unsigned NumElts = VT.getVectorNumElements();
60335 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60336 EVT EltVT = In.getOperand(0).getValueType();
60337 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60338 for (unsigned I = 0; I != NumElts; ++I)
60339 Elts[I * Scale] = In.getOperand(I);
60340 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60341 }
60342
60343 // Attempt to combine as a shuffle on SSE41+ targets.
60344 if (Subtarget.hasSSE41()) {
60345 SDValue Op(N, 0);
60346 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60347 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60348 return Res;
60349 }
60350
60351 return SDValue();
60352}
60353
60354 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60355 TargetLowering::DAGCombinerInfo &DCI) {
60356 EVT VT = N->getValueType(0);
60357 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60358 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60359 return DAG.getConstant(0, SDLoc(N), VT);
60360
60361 // Fold kshiftr(extract_subvector(X,C1),C2)
60362 // --> extract_subvector(kshiftr(X,C1+C2),0)
60363 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60364 if (N->getOpcode() == X86ISD::KSHIFTR) {
60365 SDLoc DL(N);
60366 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60367 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60368 SDValue Src = N->getOperand(0).getOperand(0);
60369 uint64_t Amt = N->getConstantOperandVal(1) +
60370 N->getOperand(0).getConstantOperandVal(1);
60371 EVT SrcVT = Src.getValueType();
60372 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60373 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60374 DAG.getTargetConstant(Amt, DL, MVT::i8));
60375 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60376 DAG.getVectorIdxConstant(0, DL));
60377 }
60378 }
60379 }
60380
60381 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60382 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60383 return SDValue(N, 0);
60384
60385 return SDValue();
60386}
60387
60388// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60389 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
60390 // extra instructions between the conversions due to going to scalar and back.
60391 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60392 const X86Subtarget &Subtarget) {
60393 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60394 return SDValue();
60395
60396 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60397 return SDValue();
60398
60399 if (N->getValueType(0) != MVT::f32 ||
60400 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60401 return SDValue();
60402
60403 SDLoc dl(N);
60404 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60405 N->getOperand(0).getOperand(0));
60406 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60407 DAG.getTargetConstant(4, dl, MVT::i32));
60408 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60409 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60410 DAG.getVectorIdxConstant(0, dl));
60411}
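// Illustrative sketch of the combine above for a scalar f32 value t1
// (node names are illustrative only):
//   t0: f32 = fp16_to_fp (fp_to_fp16 t1)
// becomes roughly
//   t2: v4f32 = scalar_to_vector t1
//   t3: v8i16 = X86ISD::CVTPS2PH t2, TargetConstant:4
//   t4: v4f32 = X86ISD::CVTPH2PS t3
//   t0: f32   = extract_vector_elt t4, 0
// keeping the f32->f16->f32 round trip in XMM registers on F16C targets.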
60412
60413 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60414 TargetLowering::DAGCombinerInfo &DCI,
60415 const X86Subtarget &Subtarget) {
60416 EVT VT = N->getValueType(0);
60417 bool IsStrict = N->isStrictFPOpcode();
60418 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60419 EVT SrcVT = Src.getValueType();
60420
60421 SDLoc dl(N);
60422 if (SrcVT.getScalarType() == MVT::bf16) {
60423 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60424 !IsStrict && Src.getOperand(0).getValueType() == VT)
60425 return Src.getOperand(0);
60426
60427 if (!SrcVT.isVector())
60428 return SDValue();
60429
60430 assert(!IsStrict && "Strict FP doesn't support BF16");
60431 if (VT.getVectorElementType() == MVT::f64) {
60432 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60433 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60434 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60435 }
60436 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60437 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60438 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60439 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60440 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60441 return DAG.getBitcast(VT, Src);
60442 }
60443
60444 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60445 return SDValue();
60446
60447 if (Subtarget.hasFP16())
60448 return SDValue();
60449
60450 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60451 return SDValue();
60452
60453 if (VT.getVectorElementType() != MVT::f32 &&
60454 VT.getVectorElementType() != MVT::f64)
60455 return SDValue();
60456
60457 unsigned NumElts = VT.getVectorNumElements();
60458 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60459 return SDValue();
60460
60461 // Convert the input to vXi16.
60462 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60463 Src = DAG.getBitcast(IntVT, Src);
60464
60465 // Widen to at least 8 input elements.
60466 if (NumElts < 8) {
60467 unsigned NumConcats = 8 / NumElts;
60468 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60469 : DAG.getConstant(0, dl, IntVT);
60470 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60471 Ops[0] = Src;
60472 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60473 }
60474
60475 // Destination is vXf32 with at least 4 elements.
60476 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60477 std::max(4U, NumElts));
60478 SDValue Cvt, Chain;
60479 if (IsStrict) {
60480 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60481 {N->getOperand(0), Src});
60482 Chain = Cvt.getValue(1);
60483 } else {
60484 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60485 }
60486
60487 if (NumElts < 4) {
60488 assert(NumElts == 2 && "Unexpected size");
60489 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60490 DAG.getVectorIdxConstant(0, dl));
60491 }
60492
60493 if (IsStrict) {
60494 // Extend to the original VT if necessary.
60495 if (Cvt.getValueType() != VT) {
60496 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60497 {Chain, Cvt});
60498 Chain = Cvt.getValue(1);
60499 }
60500 return DAG.getMergeValues({Cvt, Chain}, dl);
60501 }
60502
60503 // Extend to the original VT if necessary.
60504 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60505}
60506
60507// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60508 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60509 TargetLowering::DAGCombinerInfo &DCI) {
60510 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60511 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60512 "Unknown broadcast load type");
60513
60514 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60515 SDValue Ptr = MemIntrin->getBasePtr();
60516 SDValue Chain = MemIntrin->getChain();
60517 EVT VT = N->getSimpleValueType(0);
60518 EVT MemVT = MemIntrin->getMemoryVT();
60519
60520 // Look at other users of our base pointer and try to find a wider broadcast.
60521 // The input chain and the size of the memory VT must match.
60522 for (SDNode *User : Ptr->users())
60523 if (User != N && User->getOpcode() == N->getOpcode() &&
60524 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60525 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60526 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60527 MemVT.getSizeInBits() &&
60528 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60530 MemIntrin->isSimple() && "Illegal broadcast load type");
60532 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60533 VT.getSizeInBits());
60534 Extract = DAG.getBitcast(VT, Extract);
60535 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60536 return Extract;
60537 }
60538
60539 return SDValue();
60540}
60541
60542 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60543 const X86Subtarget &Subtarget) {
60544 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60545 return SDValue();
60546
60547 bool IsStrict = N->isStrictFPOpcode();
60548 EVT VT = N->getValueType(0);
60549 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60550 EVT SrcVT = Src.getValueType();
60551
60552 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60553 SrcVT.getVectorElementType() != MVT::f32)
60554 return SDValue();
60555
60556 SDLoc dl(N);
60557
60558 SDValue Cvt, Chain;
60559 unsigned NumElts = VT.getVectorNumElements();
60560 if (Subtarget.hasFP16()) {
60561 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60562 // v4f32 (xint_to_fp v4i64))))
60563 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60564 // v8f16 (CVTXI2P v4i64)))
60565 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60566 Src.getNumOperands() == 2) {
60567 SDValue Cvt0, Cvt1;
60568 SDValue Op0 = Src.getOperand(0);
60569 SDValue Op1 = Src.getOperand(1);
60570 bool IsOp0Strict = Op0->isStrictFPOpcode();
60571 if (Op0.getOpcode() != Op1.getOpcode() ||
60572 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60573 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60574 return SDValue();
60575 }
60576 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60577 if (IsStrict) {
60578 assert(IsOp0Strict && "Op0 must be strict node");
60579 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60580 ? X86ISD::STRICT_CVTSI2P
60581 : X86ISD::STRICT_CVTUI2P;
60582 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60583 {Op0.getOperand(0), Op0.getOperand(1)});
60584 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60585 {Op1.getOperand(0), Op1.getOperand(1)});
60586 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60587 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60588 }
60589 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60590 : X86ISD::CVTUI2P;
60591 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60592 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60593 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60594 }
60595 return SDValue();
60596 }
60597
60598 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60599 return SDValue();
60600
60601 // Widen to at least 4 input elements.
60602 if (NumElts < 4)
60603 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60604 DAG.getConstantFP(0.0, dl, SrcVT));
60605
60606 // Destination is v8i16 with at least 8 elements.
60607 EVT CvtVT =
60608 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60609 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60610 if (IsStrict) {
60611 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60612 {N->getOperand(0), Src, Rnd});
60613 Chain = Cvt.getValue(1);
60614 } else {
60615 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60616 }
60617
60618 // Extract down to real number of elements.
60619 if (NumElts < 8) {
60620 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
60621 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60622 DAG.getVectorIdxConstant(0, dl));
60623 }
60624
60625 Cvt = DAG.getBitcast(VT, Cvt);
60626
60627 if (IsStrict)
60628 return DAG.getMergeValues({Cvt, Chain}, dl);
60629
60630 return Cvt;
60631}
60632
60633 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60634 SDValue Src = N->getOperand(0);
60635
60636 // Turn MOVDQ2Q+simple_load into an mmx load.
60637 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60638 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60639
60640 if (LN->isSimple()) {
60641 SDValue NewLd =
60642 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60643 LN->getPointerInfo(), LN->getBaseAlign(),
60644 LN->getMemOperand()->getFlags());
60645 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60646 return NewLd;
60647 }
60648 }
60649
60650 return SDValue();
60651}
60652
60653 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60654 TargetLowering::DAGCombinerInfo &DCI) {
60655 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60657 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60658 return SDValue(N, 0);
60659
60660 return SDValue();
60661}
60662
60663// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60664// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60665// use x86mmx instead.
60666 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60667 SDLoc dl(N);
60668
60669 bool MadeChange = false, CastReturnVal = false;
60670 SmallVector<SDValue, 8> Args;
60671 for (const SDValue &Arg : N->op_values()) {
60672 if (Arg.getValueType() == MVT::v1i64) {
60673 MadeChange = true;
60674 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60675 } else
60676 Args.push_back(Arg);
60677 }
60678 SDVTList VTs = N->getVTList();
60679 SDVTList NewVTs = VTs;
60680 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60681 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60682 NewVTArr[0] = MVT::x86mmx;
60683 NewVTs = DAG.getVTList(NewVTArr);
60684 MadeChange = true;
60685 CastReturnVal = true;
60686 }
60687
60688 if (MadeChange) {
60689 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60690 if (CastReturnVal) {
60691 SmallVector<SDValue, 2> Returns;
60692 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60693 Returns.push_back(Result.getValue(i));
60694 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60695 return DAG.getMergeValues(Returns, dl);
60696 }
60697 return Result;
60698 }
60699 return SDValue();
60700}
60701 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60702 TargetLowering::DAGCombinerInfo &DCI) {
60703 if (!DCI.isBeforeLegalize())
60704 return SDValue();
60705
60706 unsigned IntNo = N->getConstantOperandVal(0);
60707 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60708
60709 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60710 return FixupMMXIntrinsicTypes(N, DAG);
60711
60712 return SDValue();
60713}
60714
60715 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60716 TargetLowering::DAGCombinerInfo &DCI) {
60717 if (!DCI.isBeforeLegalize())
60718 return SDValue();
60719
60720 unsigned IntNo = N->getConstantOperandVal(1);
60721 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60722
60723 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60724 return FixupMMXIntrinsicTypes(N, DAG);
60725
60726 return SDValue();
60727}
60728
60729 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60730 TargetLowering::DAGCombinerInfo &DCI) {
60731 if (!DCI.isBeforeLegalize())
60732 return SDValue();
60733
60734 unsigned IntNo = N->getConstantOperandVal(1);
60735 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60736
60737 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60738 return FixupMMXIntrinsicTypes(N, DAG);
60739
60740 return SDValue();
60741}
60742
60743 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60744 DAGCombinerInfo &DCI) const {
60745 SelectionDAG &DAG = DCI.DAG;
60746 switch (N->getOpcode()) {
60747 // clang-format off
60748 default: break;
60749 case ISD::SCALAR_TO_VECTOR:
60750 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60751 case ISD::EXTRACT_VECTOR_ELT:
60752 case X86ISD::PEXTRW:
60753 case X86ISD::PEXTRB:
60754 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60755 case ISD::CONCAT_VECTORS:
60756 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60757 case ISD::INSERT_SUBVECTOR:
60758 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60759 case ISD::EXTRACT_SUBVECTOR:
60760 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60761 case ISD::VSELECT:
60762 case ISD::SELECT:
60763 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60764 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60765 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60766 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60767 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60768 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60769 case X86ISD::ADD:
60770 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60771 case X86ISD::CLOAD:
60772 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60773 case X86ISD::SBB: return combineSBB(N, DAG);
60774 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60775 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60776 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60777 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60778 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60779 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60780 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60781 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60782 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60783 case ISD::AVGCEILS:
60784 case ISD::AVGCEILU:
60785 case ISD::AVGFLOORS:
60786 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60787 case X86ISD::BEXTR:
60788 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60789 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60790 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60791 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60792 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60793 case X86ISD::VEXTRACT_STORE:
60794 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60795 case ISD::SINT_TO_FP:
60796 case ISD::STRICT_SINT_TO_FP:
60797 return combineSIntToFP(N, DAG, DCI, Subtarget);
60798 case ISD::UINT_TO_FP:
60799 case ISD::STRICT_UINT_TO_FP:
60800 return combineUIntToFP(N, DAG, Subtarget);
60801 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60802 case ISD::LRINT:
60803 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60804 case ISD::FADD:
60805 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60806 case X86ISD::VFCMULC:
60807 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60808 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60809 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60810 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60811 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60812 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60813 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60814 case X86ISD::FXOR:
60815 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60816 case X86ISD::FMIN:
60817 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60818 case ISD::FMINNUM:
60819 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60820 case X86ISD::CVTSI2P:
60821 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60822 case X86ISD::CVTP2SI:
60823 case X86ISD::CVTP2UI:
60824 case X86ISD::STRICT_CVTTP2SI:
60825 case X86ISD::CVTTP2SI:
60826 case X86ISD::STRICT_CVTTP2UI:
60827 case X86ISD::CVTTP2UI:
60828 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60829 case X86ISD::STRICT_CVTPH2PS:
60830 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60831 case X86ISD::BT: return combineBT(N, DAG, DCI);
60832 case ISD::ANY_EXTEND:
60833 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60834 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60835 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60836 case ISD::ANY_EXTEND_VECTOR_INREG:
60837 case ISD::SIGN_EXTEND_VECTOR_INREG:
60838 case ISD::ZERO_EXTEND_VECTOR_INREG:
60839 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60840 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60841 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60842 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60843 case X86ISD::PACKSS:
60844 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60845 case X86ISD::HADD:
60846 case X86ISD::HSUB:
60847 case X86ISD::FHADD:
60848 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60849 case X86ISD::VSHL:
60850 case X86ISD::VSRA:
60851 case X86ISD::VSRL:
60852 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60853 case X86ISD::VSHLI:
60854 case X86ISD::VSRAI:
60855 case X86ISD::VSRLI:
60856 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60857 case ISD::INSERT_VECTOR_ELT:
60858 case X86ISD::PINSRB:
60859 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60860 case X86ISD::SHUFP: // Handle all target specific shuffles
60861 case X86ISD::INSERTPS:
60862 case X86ISD::EXTRQI:
60863 case X86ISD::INSERTQI:
60864 case X86ISD::VALIGN:
60865 case X86ISD::PALIGNR:
60866 case X86ISD::VSHLDQ:
60867 case X86ISD::VSRLDQ:
60868 case X86ISD::BLENDI:
60869 case X86ISD::UNPCKH:
60870 case X86ISD::UNPCKL:
60871 case X86ISD::MOVHLPS:
60872 case X86ISD::MOVLHPS:
60873 case X86ISD::PSHUFB:
60874 case X86ISD::PSHUFD:
60875 case X86ISD::PSHUFHW:
60876 case X86ISD::PSHUFLW:
60877 case X86ISD::MOVSHDUP:
60878 case X86ISD::MOVSLDUP:
60879 case X86ISD::MOVDDUP:
60880 case X86ISD::MOVSS:
60881 case X86ISD::MOVSD:
60882 case X86ISD::MOVSH:
60883 case X86ISD::VBROADCAST:
60884 case X86ISD::VPPERM:
60885 case X86ISD::VPERMI:
60886 case X86ISD::VPERMV:
60887 case X86ISD::VPERMV3:
60888 case X86ISD::VPERMIL2:
60889 case X86ISD::VPERMILPI:
60890 case X86ISD::VPERMILPV:
60891 case X86ISD::VPERM2X128:
60892 case X86ISD::SHUF128:
60893 case X86ISD::VZEXT_MOVL:
60894 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60895 case X86ISD::FMADD_RND:
60896 case X86ISD::FMSUB:
60897 case X86ISD::STRICT_FMSUB:
60898 case X86ISD::FMSUB_RND:
60899 case X86ISD::FNMADD:
60900 case X86ISD::STRICT_FNMADD:
60901 case X86ISD::FNMADD_RND:
60902 case X86ISD::FNMSUB:
60903 case X86ISD::STRICT_FNMSUB:
60904 case X86ISD::FNMSUB_RND:
60905 case ISD::FMA:
60906 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60907 case X86ISD::FMADDSUB_RND:
60908 case X86ISD::FMSUBADD_RND:
60909 case X86ISD::FMADDSUB:
60910 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60911 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60912 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60913 case X86ISD::MGATHER:
60914 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60915 case ISD::MGATHER:
60916 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60917 case X86ISD::PCMPEQ:
60918 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60919 case X86ISD::PMULDQ:
60920 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60921 case X86ISD::VPMADDUBSW:
60922 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60923 case X86ISD::VPMADD52L:
60924 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60925 case X86ISD::KSHIFTL:
60926 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60927 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60928 case ISD::STRICT_FP_EXTEND:
60929 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60930 case ISD::STRICT_FP_ROUND:
60931 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60932 case X86ISD::VBROADCAST_LOAD:
60933 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60934 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60935 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60936 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60937 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60938 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60939 case ISD::FP_TO_SINT_SAT:
60940 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60941 // clang-format on
60942 }
60943
60944 return SDValue();
60945}
60946
60947 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
60948 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60949}
60950
60951// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60952 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60953 EVT ExtVT) const {
60954 return Subtarget.hasAVX512() || !VT.isVector();
60955}
60956
60958 if (!isTypeLegal(VT))
60959 return false;
60960
60961 // There are no vXi8 shifts.
60962 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60963 return false;
60964
60965 // TODO: Almost no 8-bit ops are desirable because they have no actual
60966 // size/speed advantages vs. 32-bit ops, but they do have a major
60967 // potential disadvantage by causing partial register stalls.
60968 //
60969 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60970 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60971 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60972 // check for a constant operand to the multiply.
60973 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60974 return false;
60975
60976 // i16 instruction encodings are longer and some i16 instructions are slow,
60977 // so those are not desirable.
60978 if (VT == MVT::i16) {
60979 switch (Opc) {
60980 default:
60981 break;
60982 case ISD::LOAD:
60983 case ISD::SIGN_EXTEND:
60984 case ISD::ZERO_EXTEND:
60985 case ISD::ANY_EXTEND:
60986 case ISD::MUL:
60987 return false;
60988 case ISD::SHL:
60989 case ISD::SRA:
60990 case ISD::SRL:
60991 case ISD::SUB:
60992 case ISD::ADD:
60993 case ISD::AND:
60994 case ISD::OR:
60995 case ISD::XOR:
60996 // NDD instructions never have the "partial register write" issue b/c the
60997 // destination register's upper bits [63:OSIZE] are zeroed even when
60998 // OSIZE=8/16.
60999 return Subtarget.hasNDD();
61000 }
61001 }
61002
61003 // Any legal type not explicitly accounted for above here is desirable.
61004 return true;
61005}
61006
61007 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
61008 SDValue Value, SDValue Addr,
61009 int JTI,
61010 SelectionDAG &DAG) const {
61011 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61012 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61013 if (IsCFProtectionSupported) {
61014 // In case control-flow branch protection is enabled, we need to add a
61015 // notrack prefix to the indirect branch.
61016 // In order to do that we create an NT_BRIND SDNode.
61017 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
61018 SDValue Chain = Value;
61019 // Jump table debug info is only needed if CodeView is enabled.
61021 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61022 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61023 }
61024
61025 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61026}
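// Editorial note: when the module carries the "cf-protection-branch" flag,
// the jump-table dispatch above is emitted with the NOTRACK prefix, e.g.
// "notrack jmpq *%rax" instead of "jmpq *%rax", marking this known-safe
// indirect branch as exempt from CET indirect-branch tracking.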
61027
61030 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61032 EVT VT = LogicOp->getValueType(0);
61033 EVT OpVT = SETCC0->getOperand(0).getValueType();
61034 if (!VT.isInteger())
61036
61037 if (VT.isVector())
61042
 61043 // Don't use `NotAnd`: even though `not` is generally shorter code size than
 61044 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
 61045 // where `NotAnd` applies, `AddAnd` does as well.
 61046 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
 61047 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61049}
61050
61052 EVT VT = Op.getValueType();
61053 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61054 isa<ConstantSDNode>(Op.getOperand(1));
61055
61056 // i16 is legal, but undesirable since i16 instruction encodings are longer
61057 // and some i16 instructions are slow.
61058 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61059 // using LEA and/or other ALU ops.
61060 if (VT != MVT::i16 && !Is8BitMulByConstant)
61061 return false;
61062
61063 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61064 if (!Op.hasOneUse())
61065 return false;
61066 SDNode *User = *Op->user_begin();
61068 return false;
61069 auto *Ld = cast<LoadSDNode>(Load);
61070 auto *St = cast<StoreSDNode>(User);
61071 return Ld->getBasePtr() == St->getBasePtr();
61072 };
61073
61074 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61075 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61076 return false;
61077 if (!Op.hasOneUse())
61078 return false;
61079 SDNode *User = *Op->user_begin();
61080 if (User->getOpcode() != ISD::ATOMIC_STORE)
61081 return false;
61082 auto *Ld = cast<AtomicSDNode>(Load);
61083 auto *St = cast<AtomicSDNode>(User);
61084 return Ld->getBasePtr() == St->getBasePtr();
61085 };
61086
61087 auto IsFoldableZext = [](SDValue Op) {
61088 if (!Op.hasOneUse())
61089 return false;
61090 SDNode *User = *Op->user_begin();
61091 EVT VT = User->getValueType(0);
61092 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61093 (VT == MVT::i32 || VT == MVT::i64));
61094 };
61095
61096 bool Commute = false;
61097 switch (Op.getOpcode()) {
61098 default: return false;
61099 case ISD::SIGN_EXTEND:
61100 case ISD::ZERO_EXTEND:
61101 case ISD::ANY_EXTEND:
61102 break;
61103 case ISD::SHL:
61104 case ISD::SRA:
61105 case ISD::SRL: {
61106 SDValue N0 = Op.getOperand(0);
61107 // Look out for (store (shl (load), x)).
61108 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61109 return false;
61110 break;
61111 }
61112 case ISD::MUL:
 61113 // When ZU is enabled, we prefer not to promote a MUL by a constant
 61114 // when there is an opportunity to fold a zext with imulzu.
61115 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61116 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61117 isa<ConstantSDNode>(Op.getOperand(1))))
61118 return false;
61119 [[fallthrough]];
61120 case ISD::ADD:
61121 case ISD::AND:
61122 case ISD::OR:
61123 case ISD::XOR:
61124 Commute = true;
61125 [[fallthrough]];
61126 case ISD::SUB: {
61127 SDValue N0 = Op.getOperand(0);
61128 SDValue N1 = Op.getOperand(1);
61129 // Avoid disabling potential load folding opportunities.
61130 if (X86::mayFoldLoad(N1, Subtarget) &&
61131 (!Commute || !isa<ConstantSDNode>(N0) ||
61132 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61133 return false;
61134 if (X86::mayFoldLoad(N0, Subtarget) &&
61135 ((Commute && !isa<ConstantSDNode>(N1)) ||
61136 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61137 return false;
61138 if (IsFoldableAtomicRMW(N0, Op) ||
61139 (Commute && IsFoldableAtomicRMW(N1, Op)))
61140 return false;
61141 }
61142 }
61143
61144 PVT = MVT::i32;
61145 return true;
61146}
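// Editorial example for the hook above: a plain i16 add of two registers gets
// PVT = MVT::i32 and is widened to a 32-bit add, but if one operand is a load
// that could be folded into the instruction (or the pattern forms a foldable
// read-modify-write / atomic RMW), promotion is declined so that the memory
// operand folding is not lost.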
61147
61148//===----------------------------------------------------------------------===//
61149// X86 Inline Assembly Support
61150//===----------------------------------------------------------------------===//
61151
61154 .Case("{@cca}", X86::COND_A)
61155 .Case("{@ccae}", X86::COND_AE)
61156 .Case("{@ccb}", X86::COND_B)
61157 .Case("{@ccbe}", X86::COND_BE)
61158 .Case("{@ccc}", X86::COND_B)
61159 .Case("{@cce}", X86::COND_E)
61160 .Case("{@ccz}", X86::COND_E)
61161 .Case("{@ccg}", X86::COND_G)
61162 .Case("{@ccge}", X86::COND_GE)
61163 .Case("{@ccl}", X86::COND_L)
61164 .Case("{@ccle}", X86::COND_LE)
61165 .Case("{@ccna}", X86::COND_BE)
61166 .Case("{@ccnae}", X86::COND_B)
61167 .Case("{@ccnb}", X86::COND_AE)
61168 .Case("{@ccnbe}", X86::COND_A)
61169 .Case("{@ccnc}", X86::COND_AE)
61170 .Case("{@ccne}", X86::COND_NE)
61171 .Case("{@ccnz}", X86::COND_NE)
61172 .Case("{@ccng}", X86::COND_LE)
61173 .Case("{@ccnge}", X86::COND_L)
61174 .Case("{@ccnl}", X86::COND_GE)
61175 .Case("{@ccnle}", X86::COND_G)
61176 .Case("{@ccno}", X86::COND_NO)
61177 .Case("{@ccnp}", X86::COND_NP)
61178 .Case("{@ccns}", X86::COND_NS)
61179 .Case("{@cco}", X86::COND_O)
61180 .Case("{@ccp}", X86::COND_P)
61181 .Case("{@ccs}", X86::COND_S)
61183 return Cond;
61184}
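// Usage sketch (editorial, assuming GCC-style flag output constraints): the
// "{@cc<cond>}" strings parsed above are the internal form of "=@cc<cond>"
// inline-asm outputs, e.g.
//   int below;
//   asm("cmpq %2, %1" : "=@ccb"(below) : "r"(x), "r"(y));
// which exposes the carry flag produced by the asm as a 0/1 value in 'below'.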
61185
61186/// Given a constraint letter, return the type of constraint for this target.
61189 if (Constraint.size() == 1) {
61190 switch (Constraint[0]) {
61191 case 'R':
61192 case 'q':
61193 case 'Q':
61194 case 'f':
61195 case 't':
61196 case 'u':
61197 case 'y':
61198 case 'x':
61199 case 'v':
61200 case 'l':
61201 case 'k': // AVX512 masking registers.
61202 return C_RegisterClass;
61203 case 'a':
61204 case 'b':
61205 case 'c':
61206 case 'd':
61207 case 'S':
61208 case 'D':
61209 case 'A':
61210 return C_Register;
61211 case 'I':
61212 case 'J':
61213 case 'K':
61214 case 'N':
61215 case 'G':
61216 case 'L':
61217 case 'M':
61218 return C_Immediate;
61219 case 'C':
61220 case 'e':
61221 case 'Z':
61222 return C_Other;
61223 default:
61224 break;
61225 }
61226 }
61227 else if (Constraint.size() == 2) {
61228 switch (Constraint[0]) {
61229 default:
61230 break;
61231 case 'W':
61232 if (Constraint[1] != 's')
61233 break;
61234 return C_Other;
61235 case 'Y':
61236 switch (Constraint[1]) {
61237 default:
61238 break;
61239 case 'z':
61240 return C_Register;
61241 case 'i':
61242 case 'm':
61243 case 'k':
61244 case 't':
61245 case '2':
61246 return C_RegisterClass;
61247 }
61248 break;
61249 case 'j':
61250 switch (Constraint[1]) {
61251 default:
61252 break;
61253 case 'r':
61254 case 'R':
61255 return C_RegisterClass;
61256 }
61257 }
61258 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61259 return C_Other;
61260 return TargetLowering::getConstraintType(Constraint);
61261}
61262
61263/// Examine constraint type and operand type and determine a weight value.
61264/// This object must already have been set up with the operand type
61265/// and the current alternative constraint selected.
61268 AsmOperandInfo &Info, const char *Constraint) const {
61270 Value *CallOperandVal = Info.CallOperandVal;
61271 // If we don't have a value, we can't do a match,
61272 // but allow it at the lowest weight.
61273 if (!CallOperandVal)
61274 return CW_Default;
61275 Type *Ty = CallOperandVal->getType();
61276 // Look at the constraint type.
61277 switch (*Constraint) {
61278 default:
61280 [[fallthrough]];
61281 case 'R':
61282 case 'q':
61283 case 'Q':
61284 case 'a':
61285 case 'b':
61286 case 'c':
61287 case 'd':
61288 case 'S':
61289 case 'D':
61290 case 'A':
61291 if (CallOperandVal->getType()->isIntegerTy())
61292 Wt = CW_SpecificReg;
61293 break;
61294 case 'f':
61295 case 't':
61296 case 'u':
61297 if (Ty->isFloatingPointTy())
61298 Wt = CW_SpecificReg;
61299 break;
61300 case 'y':
61301 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61302 Wt = CW_SpecificReg;
61303 break;
61304 case 'Y':
61305 if (StringRef(Constraint).size() != 2)
61306 break;
61307 switch (Constraint[1]) {
61308 default:
61309 return CW_Invalid;
61310 // XMM0
61311 case 'z':
61312 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61313 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61314 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61315 return CW_SpecificReg;
61316 return CW_Invalid;
61317 // Conditional OpMask regs (AVX512)
61318 case 'k':
61319 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61320 return CW_Register;
61321 return CW_Invalid;
61322 // Any MMX reg
61323 case 'm':
61324 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61325 return CW_SpecificReg;
61326 return CW_Invalid;
61327 // Any SSE reg when ISA >= SSE2, same as 'x'
61328 case 'i':
61329 case 't':
61330 case '2':
61331 if (!Subtarget.hasSSE2())
61332 return CW_Invalid;
61333 break;
61334 }
61335 break;
61336 case 'j':
61337 if (StringRef(Constraint).size() != 2)
61338 break;
61339 switch (Constraint[1]) {
61340 default:
61341 return CW_Invalid;
61342 case 'r':
61343 case 'R':
61344 if (CallOperandVal->getType()->isIntegerTy())
61345 Wt = CW_SpecificReg;
61346 break;
61347 }
61348 break;
61349 case 'v':
61350 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61351 Wt = CW_Register;
61352 [[fallthrough]];
61353 case 'x':
61354 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61355 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61356 Wt = CW_Register;
61357 break;
61358 case 'k':
61359 // Enable conditional vector operations using %k<#> registers.
61360 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61361 Wt = CW_Register;
61362 break;
61363 case 'I':
61364 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61365 if (C->getZExtValue() <= 31)
61366 Wt = CW_Constant;
61367 break;
61368 case 'J':
61369 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61370 if (C->getZExtValue() <= 63)
61371 Wt = CW_Constant;
61372 break;
61373 case 'K':
61374 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61375 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61376 Wt = CW_Constant;
61377 break;
61378 case 'L':
61379 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61380 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61381 Wt = CW_Constant;
61382 break;
61383 case 'M':
61384 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61385 if (C->getZExtValue() <= 3)
61386 Wt = CW_Constant;
61387 break;
61388 case 'N':
61389 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61390 if (C->getZExtValue() <= 0xff)
61391 Wt = CW_Constant;
61392 break;
61393 case 'G':
61394 case 'C':
61395 if (isa<ConstantFP>(CallOperandVal))
61396 Wt = CW_Constant;
61397 break;
61398 case 'e':
61399 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61400 if ((C->getSExtValue() >= -0x80000000LL) &&
61401 (C->getSExtValue() <= 0x7fffffffLL))
61402 Wt = CW_Constant;
61403 break;
61404 case 'Z':
61405 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61406 if (C->getZExtValue() <= 0xffffffff)
61407 Wt = CW_Constant;
61408 break;
61409 }
61410 return Wt;
61411}
61412
61413/// Try to replace an X constraint, which matches anything, with another that
61414/// has more specific requirements based on the type of the corresponding
61415/// operand.
61417LowerXConstraint(EVT ConstraintVT) const {
61418 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61419 // 'f' like normal targets.
61420 if (ConstraintVT.isFloatingPoint()) {
61421 if (Subtarget.hasSSE1())
61422 return "x";
61423 }
61424
61425 return TargetLowering::LowerXConstraint(ConstraintVT);
61426}
61427
61428// Lower @cc targets via setcc.
61430 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61431 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61432 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61433 if (Cond == X86::COND_INVALID)
61434 return SDValue();
61435 // Check that return type is valid.
61436 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61437 OpInfo.ConstraintVT.getSizeInBits() < 8)
61438 report_fatal_error("Glue output operand is of invalid type");
61439
61440 // Get EFLAGS register. Only update chain when copyfrom is glued.
61441 if (Glue.getNode()) {
61442 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61443 Chain = Glue.getValue(1);
61444 } else
61445 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61446 // Extract CC code.
61447 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61448 // Extend to 32-bits
61449 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61450
61451 return Result;
61452}
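// Editorial note: for the "=@ccb" example above, the SETCC produced here is
// selected to something like "setb %al" followed by a zero extension into the
// full-width output, so the requested flag becomes an ordinary 0/1 integer.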
61453
61454/// Lower the specified operand into the Ops vector.
61455/// If it is invalid, don't add anything to Ops.
61457 StringRef Constraint,
61458 std::vector<SDValue> &Ops,
61459 SelectionDAG &DAG) const {
61460 SDValue Result;
61461 char ConstraintLetter = Constraint[0];
61462 switch (ConstraintLetter) {
61463 default: break;
61464 case 'I':
61465 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61466 if (C->getZExtValue() <= 31) {
61467 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61468 Op.getValueType());
61469 break;
61470 }
61471 }
61472 return;
61473 case 'J':
61474 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61475 if (C->getZExtValue() <= 63) {
61476 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61477 Op.getValueType());
61478 break;
61479 }
61480 }
61481 return;
61482 case 'K':
61483 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61484 if (isInt<8>(C->getSExtValue())) {
61485 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61486 Op.getValueType());
61487 break;
61488 }
61489 }
61490 return;
61491 case 'L':
61492 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61493 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61494 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61495 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61496 Op.getValueType());
61497 break;
61498 }
61499 }
61500 return;
61501 case 'M':
61502 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61503 if (C->getZExtValue() <= 3) {
61504 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61505 Op.getValueType());
61506 break;
61507 }
61508 }
61509 return;
61510 case 'N':
61511 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61512 if (C->getZExtValue() <= 255) {
61513 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61514 Op.getValueType());
61515 break;
61516 }
61517 }
61518 return;
61519 case 'O':
61520 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61521 if (C->getZExtValue() <= 127) {
61522 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61523 Op.getValueType());
61524 break;
61525 }
61526 }
61527 return;
61528 case 'e': {
61529 // 32-bit signed value
61530 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61532 C->getSExtValue())) {
61533 // Widen to 64 bits here to get it sign extended.
61534 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61535 break;
61536 }
61537 // FIXME gcc accepts some relocatable values here too, but only in certain
61538 // memory models; it's complicated.
61539 }
61540 return;
61541 }
61542 case 'W': {
61543 assert(Constraint[1] == 's');
61544 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61545 // offset.
61546 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61547 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61548 BA->getValueType(0)));
61549 } else {
61550 int64_t Offset = 0;
61551 if (Op->getOpcode() == ISD::ADD &&
61552 isa<ConstantSDNode>(Op->getOperand(1))) {
61553 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61554 Op = Op->getOperand(0);
61555 }
61556 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61557 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61558 GA->getValueType(0), Offset));
61559 }
61560 return;
61561 }
61562 case 'Z': {
61563 // 32-bit unsigned value
61564 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61566 C->getZExtValue())) {
61567 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61568 Op.getValueType());
61569 break;
61570 }
61571 }
61572 // FIXME gcc accepts some relocatable values here too, but only in certain
61573 // memory models; it's complicated.
61574 return;
61575 }
61576 case 'i': {
61577 // Literal immediates are always ok.
61578 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61579 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61580 BooleanContent BCont = getBooleanContents(MVT::i64);
61581 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61583 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61584 : CST->getSExtValue();
61585 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61586 break;
61587 }
61588
61589 // In any sort of PIC mode addresses need to be computed at runtime by
61590 // adding in a register or some sort of table lookup. These can't
61591 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61592 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61594 return;
61595
61596 // If we are in non-pic codegen mode, we allow the address of a global (with
61597 // an optional displacement) to be used with 'i'.
61598 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61599 // If we require an extra load to get this address, as in PIC mode, we
61600 // can't accept it.
61602 Subtarget.classifyGlobalReference(GA->getGlobal())))
61603 return;
61604 break;
61605 }
61606 }
61607
61608 if (Result.getNode()) {
61609 Ops.push_back(Result);
61610 return;
61611 }
61612 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61613}
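// Usage sketch (editorial): the immediate-range letters handled above mirror
// GCC's x86 constraints, e.g.
//   asm volatile("outb %0, %1" :: "a"(value), "N"(0x80));
// is accepted because 'N' admits unsigned 8-bit constants (I/O port numbers),
// while an out-of-range constant simply falls through and is rejected by the
// generic inline-asm handling.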
61614
61615/// Check if \p RC is a general purpose register class.
 61616/// I.e., GR* or one of their variants.
61617static bool isGRClass(const TargetRegisterClass &RC) {
61618 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61619 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61620 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61621 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61622 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61623}
61624
61625/// Check if \p RC is a vector register class.
 61626/// I.e., FR* / VR* or one of their variants.
61627static bool isFRClass(const TargetRegisterClass &RC) {
61628 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61629 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61630 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61631 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61632 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61633 RC.hasSuperClassEq(&X86::VR512RegClass);
61634}
61635
61636/// Check if \p RC is a mask register class.
 61637/// I.e., VK* or one of their variants.
61638static bool isVKClass(const TargetRegisterClass &RC) {
61639 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61640 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61641 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61642 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61643 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61644 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61645 RC.hasSuperClassEq(&X86::VK64RegClass);
61646}
61647
61648static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61649 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61650}
61651
61652std::pair<unsigned, const TargetRegisterClass *>
61654 StringRef Constraint,
61655 MVT VT) const {
61656 // First, see if this is a constraint that directly corresponds to an LLVM
61657 // register class.
61658 if (Constraint.size() == 1) {
61659 // GCC Constraint Letters
61660 switch (Constraint[0]) {
61661 default: break;
61662 // 'A' means [ER]AX + [ER]DX.
61663 case 'A':
61664 if (Subtarget.is64Bit())
61665 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61666 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61667 "Expecting 64, 32 or 16 bit subtarget");
61668 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61669
61670 // TODO: Slight differences here in allocation order and leaving
61671 // RIP in the class. Do they matter any more here than they do
61672 // in the normal allocation?
61673 case 'k':
61674 if (Subtarget.hasAVX512()) {
61675 if (VT == MVT::v1i1 || VT == MVT::i1)
61676 return std::make_pair(0U, &X86::VK1RegClass);
61677 if (VT == MVT::v8i1 || VT == MVT::i8)
61678 return std::make_pair(0U, &X86::VK8RegClass);
61679 if (VT == MVT::v16i1 || VT == MVT::i16)
61680 return std::make_pair(0U, &X86::VK16RegClass);
61681 }
61682 if (Subtarget.hasBWI()) {
61683 if (VT == MVT::v32i1 || VT == MVT::i32)
61684 return std::make_pair(0U, &X86::VK32RegClass);
61685 if (VT == MVT::v64i1 || VT == MVT::i64)
61686 return std::make_pair(0U, &X86::VK64RegClass);
61687 }
61688 break;
61689 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61690 if (Subtarget.is64Bit()) {
61691 if (VT == MVT::i8 || VT == MVT::i1)
61692 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61693 ? &X86::GR8RegClass
61694 : &X86::GR8_NOREX2RegClass);
61695 if (VT == MVT::i16)
61696 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61697 ? &X86::GR16RegClass
61698 : &X86::GR16_NOREX2RegClass);
61699 if (VT == MVT::i32 || VT == MVT::f32)
61700 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61701 ? &X86::GR32RegClass
61702 : &X86::GR32_NOREX2RegClass);
61703 if (VT != MVT::f80 && !VT.isVector())
61704 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61705 ? &X86::GR64RegClass
61706 : &X86::GR64_NOREX2RegClass);
61707 break;
61708 }
61709 [[fallthrough]];
61710 // 32-bit fallthrough
61711 case 'Q': // Q_REGS
61712 if (VT == MVT::i8 || VT == MVT::i1)
61713 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61714 if (VT == MVT::i16)
61715 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61716 if (VT == MVT::i32 || VT == MVT::f32 ||
61717 (!VT.isVector() && !Subtarget.is64Bit()))
61718 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61719 if (VT != MVT::f80 && !VT.isVector())
61720 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61721 break;
61722 case 'r': // GENERAL_REGS
61723 case 'l': // INDEX_REGS
61724 if (VT == MVT::i8 || VT == MVT::i1)
61725 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61726 ? &X86::GR8RegClass
61727 : &X86::GR8_NOREX2RegClass);
61728 if (VT == MVT::i16)
61729 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61730 ? &X86::GR16RegClass
61731 : &X86::GR16_NOREX2RegClass);
61732 if (VT == MVT::i32 || VT == MVT::f32 ||
61733 (!VT.isVector() && !Subtarget.is64Bit()))
61734 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61735 ? &X86::GR32RegClass
61736 : &X86::GR32_NOREX2RegClass);
61737 if (VT != MVT::f80 && !VT.isVector())
61738 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61739 ? &X86::GR64RegClass
61740 : &X86::GR64_NOREX2RegClass);
61741 break;
61742 case 'R': // LEGACY_REGS
61743 if (VT == MVT::i8 || VT == MVT::i1)
61744 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61745 if (VT == MVT::i16)
61746 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61747 if (VT == MVT::i32 || VT == MVT::f32 ||
61748 (!VT.isVector() && !Subtarget.is64Bit()))
61749 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61750 if (VT != MVT::f80 && !VT.isVector())
61751 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61752 break;
61753 case 'f': // FP Stack registers.
61754 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61755 // value to the correct fpstack register class.
61756 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61757 return std::make_pair(0U, &X86::RFP32RegClass);
61758 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61759 return std::make_pair(0U, &X86::RFP64RegClass);
61760 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61761 return std::make_pair(0U, &X86::RFP80RegClass);
61762 break;
61763 case 'y': // MMX_REGS if MMX allowed.
61764 if (!Subtarget.hasMMX()) break;
61765 return std::make_pair(0U, &X86::VR64RegClass);
61766 case 'v':
61767 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61768 if (!Subtarget.hasSSE1()) break;
61769 bool VConstraint = (Constraint[0] == 'v');
61770
61771 switch (VT.SimpleTy) {
61772 default: break;
61773 // Scalar SSE types.
61774 case MVT::f16:
61775 if (VConstraint && Subtarget.hasFP16())
61776 return std::make_pair(0U, &X86::FR16XRegClass);
61777 break;
61778 case MVT::f32:
61779 case MVT::i32:
61780 if (VConstraint && Subtarget.hasVLX())
61781 return std::make_pair(0U, &X86::FR32XRegClass);
61782 return std::make_pair(0U, &X86::FR32RegClass);
61783 case MVT::f64:
61784 case MVT::i64:
61785 if (VConstraint && Subtarget.hasVLX())
61786 return std::make_pair(0U, &X86::FR64XRegClass);
61787 return std::make_pair(0U, &X86::FR64RegClass);
61788 case MVT::i128:
61789 if (Subtarget.is64Bit()) {
61790 if (VConstraint && Subtarget.hasVLX())
61791 return std::make_pair(0U, &X86::VR128XRegClass);
61792 return std::make_pair(0U, &X86::VR128RegClass);
61793 }
61794 break;
61795 // Vector types and fp128.
61796 case MVT::v8f16:
61797 if (!Subtarget.hasFP16())
61798 break;
61799 if (VConstraint)
61800 return std::make_pair(0U, &X86::VR128XRegClass);
61801 return std::make_pair(0U, &X86::VR128RegClass);
61802 case MVT::v8bf16:
61803 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61804 break;
61805 if (VConstraint)
61806 return std::make_pair(0U, &X86::VR128XRegClass);
61807 return std::make_pair(0U, &X86::VR128RegClass);
61808 case MVT::f128:
61809 if (!Subtarget.is64Bit())
61810 break;
61811 [[fallthrough]];
61812 case MVT::v16i8:
61813 case MVT::v8i16:
61814 case MVT::v4i32:
61815 case MVT::v2i64:
61816 case MVT::v4f32:
61817 case MVT::v2f64:
61818 if (VConstraint && Subtarget.hasVLX())
61819 return std::make_pair(0U, &X86::VR128XRegClass);
61820 return std::make_pair(0U, &X86::VR128RegClass);
61821 // AVX types.
61822 case MVT::v16f16:
61823 if (!Subtarget.hasFP16())
61824 break;
61825 if (VConstraint)
61826 return std::make_pair(0U, &X86::VR256XRegClass);
61827 return std::make_pair(0U, &X86::VR256RegClass);
61828 case MVT::v16bf16:
61829 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61830 break;
61831 if (VConstraint)
61832 return std::make_pair(0U, &X86::VR256XRegClass);
61833 return std::make_pair(0U, &X86::VR256RegClass);
61834 case MVT::v32i8:
61835 case MVT::v16i16:
61836 case MVT::v8i32:
61837 case MVT::v4i64:
61838 case MVT::v8f32:
61839 case MVT::v4f64:
61840 if (VConstraint && Subtarget.hasVLX())
61841 return std::make_pair(0U, &X86::VR256XRegClass);
61842 if (Subtarget.hasAVX())
61843 return std::make_pair(0U, &X86::VR256RegClass);
61844 break;
61845 case MVT::v32f16:
61846 if (!Subtarget.hasFP16())
61847 break;
61848 if (VConstraint)
61849 return std::make_pair(0U, &X86::VR512RegClass);
61850 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61851 case MVT::v32bf16:
61852 if (!Subtarget.hasBF16())
61853 break;
61854 if (VConstraint)
61855 return std::make_pair(0U, &X86::VR512RegClass);
61856 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61857 case MVT::v64i8:
61858 case MVT::v32i16:
61859 case MVT::v8f64:
61860 case MVT::v16f32:
61861 case MVT::v16i32:
61862 case MVT::v8i64:
61863 if (!Subtarget.hasAVX512()) break;
61864 if (VConstraint)
61865 return std::make_pair(0U, &X86::VR512RegClass);
61866 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61867 }
61868 break;
61869 }
61870 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61871 switch (Constraint[1]) {
61872 default:
61873 break;
61874 case 'i':
61875 case 't':
61876 case '2':
61877 return getRegForInlineAsmConstraint(TRI, "x", VT);
61878 case 'm':
61879 if (!Subtarget.hasMMX()) break;
61880 return std::make_pair(0U, &X86::VR64RegClass);
61881 case 'z':
61882 if (!Subtarget.hasSSE1()) break;
61883 switch (VT.SimpleTy) {
61884 default: break;
61885 // Scalar SSE types.
61886 case MVT::f16:
61887 if (!Subtarget.hasFP16())
61888 break;
61889 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61890 case MVT::f32:
61891 case MVT::i32:
61892 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61893 case MVT::f64:
61894 case MVT::i64:
61895 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61896 case MVT::v8f16:
61897 if (!Subtarget.hasFP16())
61898 break;
61899 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61900 case MVT::v8bf16:
61901 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61902 break;
61903 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61904 case MVT::f128:
61905 case MVT::v16i8:
61906 case MVT::v8i16:
61907 case MVT::v4i32:
61908 case MVT::v2i64:
61909 case MVT::v4f32:
61910 case MVT::v2f64:
61911 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61912 // AVX types.
61913 case MVT::v16f16:
61914 if (!Subtarget.hasFP16())
61915 break;
61916 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61917 case MVT::v16bf16:
61918 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61919 break;
61920 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61921 case MVT::v32i8:
61922 case MVT::v16i16:
61923 case MVT::v8i32:
61924 case MVT::v4i64:
61925 case MVT::v8f32:
61926 case MVT::v4f64:
61927 if (Subtarget.hasAVX())
61928 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61929 break;
61930 case MVT::v32f16:
61931 if (!Subtarget.hasFP16())
61932 break;
61933 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61934 case MVT::v32bf16:
61935 if (!Subtarget.hasBF16())
61936 break;
61937 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61938 case MVT::v64i8:
61939 case MVT::v32i16:
61940 case MVT::v8f64:
61941 case MVT::v16f32:
61942 case MVT::v16i32:
61943 case MVT::v8i64:
61944 if (Subtarget.hasAVX512())
61945 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61946 break;
61947 }
61948 break;
61949 case 'k':
 61950 // This register class doesn't allocate k0 for masked vector operations.
61951 if (Subtarget.hasAVX512()) {
61952 if (VT == MVT::v1i1 || VT == MVT::i1)
61953 return std::make_pair(0U, &X86::VK1WMRegClass);
61954 if (VT == MVT::v8i1 || VT == MVT::i8)
61955 return std::make_pair(0U, &X86::VK8WMRegClass);
61956 if (VT == MVT::v16i1 || VT == MVT::i16)
61957 return std::make_pair(0U, &X86::VK16WMRegClass);
61958 }
61959 if (Subtarget.hasBWI()) {
61960 if (VT == MVT::v32i1 || VT == MVT::i32)
61961 return std::make_pair(0U, &X86::VK32WMRegClass);
61962 if (VT == MVT::v64i1 || VT == MVT::i64)
61963 return std::make_pair(0U, &X86::VK64WMRegClass);
61964 }
61965 break;
61966 }
61967 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61968 switch (Constraint[1]) {
61969 default:
61970 break;
61971 case 'r':
61972 if (VT == MVT::i8 || VT == MVT::i1)
61973 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61974 if (VT == MVT::i16)
61975 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61976 if (VT == MVT::i32 || VT == MVT::f32)
61977 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61978 if (VT != MVT::f80 && !VT.isVector())
61979 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61980 break;
61981 case 'R':
61982 if (VT == MVT::i8 || VT == MVT::i1)
61983 return std::make_pair(0U, &X86::GR8RegClass);
61984 if (VT == MVT::i16)
61985 return std::make_pair(0U, &X86::GR16RegClass);
61986 if (VT == MVT::i32 || VT == MVT::f32)
61987 return std::make_pair(0U, &X86::GR32RegClass);
61988 if (VT != MVT::f80 && !VT.isVector())
61989 return std::make_pair(0U, &X86::GR64RegClass);
61990 break;
61991 }
61992 }
61993
61994 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61995 return std::make_pair(0U, &X86::GR32RegClass);
61996
61997 // Use the default implementation in TargetLowering to convert the register
61998 // constraint into a member of a register class.
61999 std::pair<Register, const TargetRegisterClass*> Res;
62001
62002 // Not found as a standard register?
62003 if (!Res.second) {
62004 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62005 // to/from f80.
62006 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
 62007 // Map "st(0)" .. "st(7)" to the FP0 .. FP7 registers.
62008 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62009 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62010 Constraint[3] == '(' &&
62011 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62012 Constraint[5] == ')' && Constraint[6] == '}') {
62013 // st(7) is not allocatable and thus not a member of RFP80. Return
62014 // singleton class in cases where we have a reference to it.
62015 if (Constraint[4] == '7')
62016 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62017 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62018 &X86::RFP80RegClass);
62019 }
62020
62021 // GCC allows "st(0)" to be called just plain "st".
62022 if (StringRef("{st}").equals_insensitive(Constraint))
62023 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62024 }
62025
62026 // flags -> EFLAGS
62027 if (StringRef("{flags}").equals_insensitive(Constraint))
62028 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62029
62030 // dirflag -> DF
62031 // Only allow for clobber.
62032 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62033 VT == MVT::Other)
62034 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62035
62036 // fpsr -> FPSW
62037 // Only allow for clobber.
62038 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62039 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62040
62041 return Res;
62042 }
62043
62044 // Make sure it isn't a register that requires 64-bit mode.
62045 if (!Subtarget.is64Bit() &&
62046 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62047 TRI->getEncodingValue(Res.first) >= 8) {
62048 // Register requires REX prefix, but we're in 32-bit mode.
62049 return std::make_pair(0, nullptr);
62050 }
62051
62052 // Make sure it isn't a register that requires AVX512.
62053 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62054 TRI->getEncodingValue(Res.first) & 0x10) {
62055 // Register requires EVEX prefix.
62056 return std::make_pair(0, nullptr);
62057 }
62058
62059 // Otherwise, check to see if this is a register class of the wrong value
 62060 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
62061 // turn into {ax},{dx}.
62062 // MVT::Other is used to specify clobber names.
62063 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62064 return Res; // Correct type already, nothing to do.
62065
 62066 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
 62067 // return "eax". This should even work for things like getting 64-bit integer
 62068 // registers when given an f64 type.
62069 const TargetRegisterClass *Class = Res.second;
62070 // The generic code will match the first register class that contains the
62071 // given register. Thus, based on the ordering of the tablegened file,
62072 // the "plain" GR classes might not come first.
62073 // Therefore, use a helper method.
62074 if (isGRClass(*Class)) {
62075 unsigned Size = VT.getSizeInBits();
62076 if (Size == 1) Size = 8;
62077 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62078 return std::make_pair(0, nullptr);
62079 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62080 if (DestReg.isValid()) {
62081 bool is64Bit = Subtarget.is64Bit();
62082 const TargetRegisterClass *RC =
62083 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62084 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62085 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62086 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62087 if (Size == 64 && !is64Bit) {
62088 // Model GCC's behavior here and select a fixed pair of 32-bit
62089 // registers.
62090 switch (DestReg) {
62091 case X86::RAX:
62092 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62093 case X86::RDX:
62094 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62095 case X86::RCX:
62096 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62097 case X86::RBX:
62098 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62099 case X86::RSI:
62100 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62101 case X86::RDI:
62102 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62103 case X86::RBP:
62104 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62105 default:
62106 return std::make_pair(0, nullptr);
62107 }
62108 }
62109 if (RC && RC->contains(DestReg))
62110 return std::make_pair(DestReg, RC);
62111 return Res;
62112 }
62113 // No register found/type mismatch.
62114 return std::make_pair(0, nullptr);
62115 } else if (isFRClass(*Class)) {
62116 // Handle references to XMM physical registers that got mapped into the
62117 // wrong class. This can happen with constraints like {xmm0} where the
62118 // target independent register mapper will just pick the first match it can
62119 // find, ignoring the required type.
62120
62121 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62122 if (VT == MVT::f16)
62123 Res.second = &X86::FR16XRegClass;
62124 else if (VT == MVT::f32 || VT == MVT::i32)
62125 Res.second = &X86::FR32XRegClass;
62126 else if (VT == MVT::f64 || VT == MVT::i64)
62127 Res.second = &X86::FR64XRegClass;
62128 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62129 Res.second = &X86::VR128XRegClass;
62130 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62131 Res.second = &X86::VR256XRegClass;
62132 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62133 Res.second = &X86::VR512RegClass;
62134 else {
62135 // Type mismatch and not a clobber: Return an error;
62136 Res.first = 0;
62137 Res.second = nullptr;
62138 }
62139 } else if (isVKClass(*Class)) {
62140 if (VT == MVT::v1i1 || VT == MVT::i1)
62141 Res.second = &X86::VK1RegClass;
62142 else if (VT == MVT::v8i1 || VT == MVT::i8)
62143 Res.second = &X86::VK8RegClass;
62144 else if (VT == MVT::v16i1 || VT == MVT::i16)
62145 Res.second = &X86::VK16RegClass;
62146 else if (VT == MVT::v32i1 || VT == MVT::i32)
62147 Res.second = &X86::VK32RegClass;
62148 else if (VT == MVT::v64i1 || VT == MVT::i64)
62149 Res.second = &X86::VK64RegClass;
62150 else {
62151 // Type mismatch and not a clobber: Return an error;
62152 Res.first = 0;
62153 Res.second = nullptr;
62154 }
62155 }
62156
62157 return Res;
62158}
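// Editorial example: a physical-register constraint such as "{ax}" on an i32
// operand is resolved above to EAX in a GR32 class (rather than the 16-bit
// class the generic matcher finds first), and a 128-bit vector constrained to
// "{xmm0}" is similarly moved into a matching 128-bit vector register class.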
62159
62160bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62161 // Integer division on x86 is expensive. However, when aggressively optimizing
62162 // for code size, we prefer to use a div instruction, as it is usually smaller
62163 // than the alternative sequence.
62164 // The exception to this is vector division. Since x86 doesn't have vector
62165 // integer division, leaving the division as-is is a loss even in terms of
62166 // size, because it will have to be scalarized, while the alternative code
62167 // sequence can be performed in vector form.
62168 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62169 return OptSize && !VT.isVector();
62170}
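// Editorial example: under minsize this keeps a scalar "x / 10" as a single
// div/idiv instruction instead of the usual multiply-by-magic-constant and
// shift expansion, while for vectors the hook still returns false since a
// vector divide would have to be scalarized anyway.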
62171
62172void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62173 if (!Subtarget.is64Bit())
62174 return;
62175
62176 // Update IsSplitCSR in X86MachineFunctionInfo.
62178 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62179 AFI->setIsSplitCSR(true);
62180}
62181
62182void X86TargetLowering::insertCopiesSplitCSR(
62183 MachineBasicBlock *Entry,
62184 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62185 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62186 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62187 if (!IStart)
62188 return;
62189
62190 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62191 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62192 MachineBasicBlock::iterator MBBI = Entry->begin();
62193 for (const MCPhysReg *I = IStart; *I; ++I) {
62194 const TargetRegisterClass *RC = nullptr;
62195 if (X86::GR64RegClass.contains(*I))
62196 RC = &X86::GR64RegClass;
62197 else
62198 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62199
62200 Register NewVR = MRI->createVirtualRegister(RC);
62201 // Create copy from CSR to a virtual register.
62202 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62203 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62204 // nounwind. If we want to generalize this later, we may need to emit
62205 // CFI pseudo-instructions.
62206 assert(
62207 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62208 "Function should be nounwind in insertCopiesSplitCSR!");
62209 Entry->addLiveIn(*I);
62210 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62211 .addReg(*I);
62212
62213 // Insert the copy-back instructions right before the terminator.
62214 for (auto *Exit : Exits)
62215 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62216 TII->get(TargetOpcode::COPY), *I)
62217 .addReg(NewVR);
62218 }
62219}
62220
62222 return Subtarget.is64Bit();
62223}
62224
62228 const TargetInstrInfo *TII) const {
62229 assert(MBBI->isCall() && MBBI->getCFIType() &&
62230 "Invalid call instruction for a KCFI check");
62231
62232 MachineFunction &MF = *MBB.getParent();
62233 // If the call target is a memory operand, unfold it and use R11 for the
62234 // call, so KCFI_CHECK won't have to recompute the address.
62235 switch (MBBI->getOpcode()) {
62236 case X86::CALL64m:
62237 case X86::CALL64m_NT:
62238 case X86::TAILJMPm64:
62239 case X86::TAILJMPm64_REX: {
62242 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62243 /*UnfoldStore=*/false, NewMIs))
62244 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62245 for (auto *NewMI : NewMIs)
62246 MBBI = MBB.insert(OrigCall, NewMI);
62247 assert(MBBI->isCall() &&
62248 "Unexpected instruction after memory operand unfolding");
62249 if (OrigCall->shouldUpdateAdditionalCallInfo())
62250 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62251 MBBI->setCFIType(MF, OrigCall->getCFIType());
62252 OrigCall->eraseFromParent();
62253 break;
62254 }
62255 default:
62256 break;
62257 }
62258
62259 MachineOperand &Target = MBBI->getOperand(0);
62260 Register TargetReg;
62261 switch (MBBI->getOpcode()) {
62262 case X86::CALL64r:
62263 case X86::CALL64r_ImpCall:
62264 case X86::CALL64r_NT:
62265 case X86::TAILJMPr64:
62266 case X86::TAILJMPr64_REX:
62267 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62268 Target.setIsRenamable(false);
62269 TargetReg = Target.getReg();
62270 break;
62271 case X86::CALL64pcrel32:
62272 case X86::TAILJMPd64:
62273 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62274 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62275 // 64-bit indirect thunk calls.
62276 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62277 "Unexpected register for an indirect thunk call");
62278 TargetReg = X86::R11;
62279 break;
62280 default:
62281 llvm_unreachable("Unexpected CFI call opcode");
62282 break;
62283 }
62284
62285 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62286 .addReg(TargetReg)
62287 .addImm(MBBI->getCFIType())
62288 .getInstr();
62289}
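// Editorial sketch of the unfolding above: a KCFI-checked call through memory
// such as "callq *(%rax)" is rewritten so the target is first loaded into
// r11, a KCFI_CHECK of r11 against the expected type hash is emitted, and the
// call becomes "callq *%r11"; register-form calls keep their target register
// and only gain the preceding KCFI_CHECK.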
62290
62291/// Returns true if stack probing through a function call is requested.
62295
62296/// Returns true if stack probing through inline assembly is requested.
62298
 62299 // No inline stack probes for Windows/UEFI; they have their own mechanism.
62300 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62301 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62302 return false;
62303
62304 // If the function specifically requests inline stack probes, emit them.
62305 if (MF.getFunction().hasFnAttribute("probe-stack"))
62306 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62307 "inline-asm";
62308
62309 return false;
62310}
62311
62312/// Returns the name of the symbol used to emit stack probes or the empty
62313/// string if not applicable.
 62316 // Inline stack probes disable the stack probe call.
62317 if (hasInlineStackProbe(MF))
62318 return "";
62319
62320 // If the function specifically requests stack probes, emit them.
62321 if (MF.getFunction().hasFnAttribute("probe-stack"))
62322 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62323
62324 // Generally, if we aren't on Windows, the platform ABI does not include
62325 // support for stack probes, so don't emit them.
62326 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62327 Subtarget.isTargetMachO() ||
62328 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62329 return "";
62330
62331 // We need a stack probe to conform to the Windows ABI. Choose the right
62332 // symbol.
62333 if (Subtarget.is64Bit())
62334 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62335 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62336}
62337
62338unsigned
 62340 // The default stack probe size is 4096 if the function has no
 62341 // "stack-probe-size" attribute.
62342 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62343 4096);
62344}
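// Usage note (editorial): these probing hooks are driven by IR function
// attributes: "probe-stack"="inline-asm" requests inline probes, any other
// "probe-stack" value names the probe symbol to call, and e.g.
// "stack-probe-size"="8192" overrides the default 4096-byte probe interval.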
62345
62347 if (ML && ML->isInnermost() &&
62348 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62351}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
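For reference, the legacy (pre-AVX) SSE compare-predicate immediates such a translation produces are the eight encodings sketched below; the enum and its names are illustrative, not an LLVM definition.

// Legacy SSE CMPPS/CMPSS predicate immediates (illustrative names).
enum SSECmpPredicate : unsigned {
  CMP_EQ    = 0, // equal (quiet)
  CMP_LT    = 1, // less-than (signaling)
  CMP_LE    = 2, // less-or-equal (signaling)
  CMP_UNORD = 3, // unordered: at least one operand is NaN
  CMP_NEQ   = 4, // not-equal (quiet)
  CMP_NLT   = 5, // not-less-than (signaling)
  CMP_NLE   = 6, // not-less-or-equal (signaling)
  CMP_ORD   = 7  // ordered: neither operand is NaN
};
static_assert(CMP_ORD == 7, "the legacy encoding has exactly eight predicates");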
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
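combineBMILogicOp and getBMIMatchingOp (further down) are built around the classic single-instruction BMI1 bit tricks. A minimal scalar sketch of those identities (the helper names are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>

static uint32_t blsi(uint32_t x)   { return x & (0u - x); } // isolate lowest set bit
static uint32_t blsr(uint32_t x)   { return x & (x - 1); }  // clear lowest set bit
static uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }  // mask up to lowest set bit

int main() {
  assert(blsi(0b101100u)   == 0b000100u);
  assert(blsr(0b101100u)   == 0b101000u);
  assert(blsmsk(0b101100u) == 0b000111u);
  return 0;
}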
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
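The PACKSS-based truncation is exact because, after the low half of each element has been sign-extended in-register (a SHL/SRA pair in the DAG), every value already lies in the destination's signed range, so PACKSS's saturation never fires. A scalar model of the 32-to-16-bit case, assuming two's-complement shifts:

#include <cassert>
#include <cstdint>

// One lane of PACKSSDW: signed-saturate a 32-bit value to 16 bits.
static int16_t packss16(int32_t v) {
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return (int16_t)v;
}

int main() {
  for (uint32_t raw : {0u, 0x12345678u, 0xFFFF8000u, 0x00007FFFu, 0xDEADBEEFu}) {
    int32_t v = (int32_t)raw;
    // In-register sign extension of the low 16 bits (models the SHL+SRA pair).
    int32_t s = (int32_t)((uint32_t)v << 16) >> 16;
    // No saturation can occur, so the packed result equals plain truncation.
    assert(packss16(s) == (int16_t)v);
  }
  return 0;
}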
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle of 64-bit elements (vperm) followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
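The identity behind this fold can be checked directly in scalar code, since XOR(X, NEG(X)) is exactly the complement of BLSMSK(X). A minimal sketch for 32-bit operands (blsmsk/andn below model the BMI instructions and are not LLVM helpers):

#include <cassert>
#include <cstdint>

static uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }      // bits up to and including lowest set bit
static uint32_t andn(uint32_t a, uint32_t b) { return ~a & b; } // ANDN-style op: ~a & b (operand order illustrative)

int main() {
  for (uint32_t x = 0; x < 1024; ++x)
    for (uint32_t y : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
      assert((y & (x ^ (0u - x))) == andn(blsmsk(x), y));
  return 0;
}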
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
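This is commonly done with PSADBW against a zero vector: a sum of absolute differences against zero is simply the sum of the bytes in each 64-bit group. A scalar model of one PSADBW qword (psadbwQword is an illustrative name):

#include <cassert>
#include <cstdint>

// Sum of absolute differences of eight byte pairs, as one PSADBW qword does.
static uint64_t psadbwQword(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += (a[i] > b[i]) ? a[i] - b[i] : b[i] - a[i];
  return sum;
}

int main() {
  const uint8_t bytes[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const uint8_t zero[8] = {0};
  assert(psadbwQword(bytes, zero) == 36); // horizontal byte sum
  return 0;
}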
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
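The name refers to the trick of materializing (x == 0) as lzcnt(x) >> 5 for 32-bit values: LZCNT returns the full bit width only for a zero input, and 5 is log2(32). A scalar check, with lzcnt32 as a portable stand-in for the instruction:

#include <cassert>
#include <cstdint>

// Portable model of 32-bit LZCNT (defined to return 32 for a zero input).
static uint32_t lzcnt32(uint32_t x) {
  uint32_t n = 0;
  while (n < 32 && !(x & (0x80000000u >> n)))
    ++n;
  return n;
}

int main() {
  for (uint32_t x : {0u, 1u, 5u, 0x80000000u, 0xFFFFFFFFu})
    assert((lzcnt32(x) >> 5) == (x == 0 ? 1u : 0u));
  return 0;
}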
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
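This is the usual two-bits-per-lane immediate encoding shared by PSHUFD/SHUFPS-class instructions: result lane i is taken from source lane (Imm >> (2 * i)) & 3. A sketch (shuffleImm4 is an illustrative name):

#include <cassert>
#include <cstdint>

static uint8_t shuffleImm4(const int mask[4]) {
  return uint8_t(mask[0] | (mask[1] << 2) | (mask[2] << 4) | (mask[3] << 6));
}

int main() {
  const int identity[4] = {0, 1, 2, 3};
  const int reverse[4]  = {3, 2, 1, 0};
  assert(shuffleImm4(identity) == 0xE4);
  assert(shuffleImm4(reverse)  == 0x1B); // the familiar "reverse" immediate
  return 0;
}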
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
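In scalar terms the pattern is "clamp to the destination's signed range, then truncate", which is exactly what saturating pack/truncate instructions compute. A minimal sketch for i32 to i8 (satTruncToI8 is an illustrative name):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int8_t satTruncToI8(int32_t x) {
  return (int8_t)std::clamp(x, (int32_t)INT8_MIN, (int32_t)INT8_MAX);
}

int main() {
  assert(satTruncToI8(1000) == 127);
  assert(satTruncToI8(-1000) == -128);
  assert(satTruncToI8(42) == 42);
  return 0;
}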
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create an X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
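A minimal scalar sketch of why the promotion is sound, assuming the narrow add carries the nsw flag so it cannot wrap; the function names below are illustrative only and are not part of this helper:

#include <cstdint>

// Illustration only: with no signed overflow in the 32-bit add (the add_nsw
// precondition), widening after the add produces the same value as widening
// first and then adding the sign-extended constant.
int64_t addThenSext(int32_t X) { return static_cast<int64_t>(X + 7); } // sext(add_nsw(x, 7))
int64_t sextThenAdd(int32_t X) { return static_cast<int64_t>(X) + 7; } // add(sext(x), 7)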
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
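As a reminder of that tie-breaking rule, a small hedged C++ illustration (std::round uses the same halfway-away-from-zero convention described here):

#include <cmath>
#include <cstdio>

// Halfway cases round away from zero, not to the nearest even value.
int main() {
  std::printf("%g %g %g\n", std::round(0.5), std::round(2.5), std::round(-0.5));
  // prints: 1 3 -1
  return 0;
}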
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
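A minimal scalar sketch of the shape being detected, an unsigned clamp to the destination type's maximum followed by a truncate; the function name is illustrative, not part of the detector:

#include <algorithm>
#include <cstdint>

// An unsigned-saturating truncate to u8 is a umin against 255 plus a truncate.
uint8_t usatTrunc(uint32_t X) {
  return static_cast<uint8_t>(std::min<uint32_t>(X, 255u)); // umin + truncate
}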
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
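A hedged scalar illustration of that observation for f32, using C++20 std::bit_cast; the mask constants are the standard IEEE-754 sign-bit masks, not values quoted from this file:

#include <bit>
#include <cstdint>

// fabs clears the sign bit with an AND mask; fneg flips it with an XOR mask.
// Only the mask value and the logic op differ, as the lowering notes.
float fabsBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0x7fffffffu);
}
float fnegBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
}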
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
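A scalar restatement of the identity behind the fold; the helper names are hypothetical:

// XORing a boolean compare result with 1 simply inverts the predicate:
// ((a == b) ^ 1) and (a != b) compute the same value for all inputs.
bool xorForm(int A, int B)      { return ((A == B) ? 1 : 0) ^ 1; }
bool invertedForm(int A, int B) { return A != B; }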
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
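A simplified standalone restatement of that predicate (ignoring the undef/zero sentinel values a real target shuffle mask may contain); the name is illustrative:

#include <algorithm>
#include <vector>

// A mask over one N-element input is a "complete permute" if every index
// 0..N-1 appears at least once in the mask.
bool isCompletePermuteSketch(const std::vector<int> &Mask) {
  std::vector<bool> Seen(Mask.size(), false);
  for (int M : Mask)
    if (M >= 0 && static_cast<size_t>(M) < Mask.size())
      Seen[M] = true;
  return std::all_of(Seen.begin(), Seen.end(), [](bool B) { return B; });
}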
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
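A scalar model of a single lane, as a hedged sketch of why the AND mask makes PACKUS behave as a plain truncation:

#include <algorithm>
#include <cstdint>

// PACKUS saturates a signed 16-bit value into the unsigned 8-bit range. If
// the input was first masked with 0xFF it is already in [0, 255], so the
// saturation is a no-op and the "pack" degenerates into a truncate.
uint8_t packusLane(int16_t V) {
  return static_cast<uint8_t>(std::clamp<int>(V, 0, 255));
}
uint8_t truncViaPackus(uint16_t V) {
  int16_t Masked = static_cast<int16_t>(V & 0xFF); // inreg zero extension
  return packusLane(Masked);                       // saturation cannot fire
}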
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
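The two forms are equal by De Morgan's law, ~(~Y & Z) == (Y | ~Z); a small self-checking sketch:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 0xFFu, 0xDEADBEEFu})
    for (uint32_t Y : {0u, 0x0F0Fu, 0x12345678u})
      for (uint32_t Z : {0u, 0xFF00u, 0xCAFEBABEu})
        assert((X & (Y | ~Z)) == (X & ~(~Y & Z))); // De Morgan on the inner term
  return 0;
}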
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128 bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
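A hedged scalar analogue of the reshaping this combine performs (the DAG pattern is add(cmov(C1, C2), X); the functions below only illustrate the algebra):

// '(cond ? C1 : C2) + X' equals 'cond ? (X + C1) : (X + C2)'; the second
// shape lets each constant fold into an add/LEA instead of both constants
// having to be materialized in registers as CMOV operands.
int addOfCmov(bool Cond, int X)  { return (Cond ? 3 : 5) + X; }
int cmovOfAdds(bool Cond, int X) { return Cond ? (X + 3) : (X + 5); }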
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
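A scalar sketch of the pattern and its simplified form for a 32-bit value; the helper names are illustrative:

#include <cstdint>

// XOR(TRUNCATE(SRL(X, 31)), 1) asks "is the sign bit clear?", which is just
// a signed compare against zero.
bool signbitXorOne(int32_t X) {
  uint32_t Sign = static_cast<uint32_t>(X) >> 31; // SRL by size(X)-1
  return (Sign ^ 1u) != 0;                        // XOR with 1
}
bool asCompare(int32_t X) { return X >= 0; }      // equivalent setcc form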
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
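As a hedged illustration of how the MachineInstrBuilder helpers above are driven, the sketch below materializes a 32-bit immediate through BuildMI (declared in MachineInstrBuilder.h, though not listed here). The helper name emitConstant32, the choice of X86::MOV32ri, and the GR32 register class are assumptions for the example; the X86-private headers are assumed to be available as they are for this file.

#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

// Materialize a 32-bit immediate into a fresh virtual register:
//   %dst:gr32 = MOV32ri Imm
static Register emitConstant32(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator InsertPt,
                               const DebugLoc &DL, const X86InstrInfo *TII,
                               MachineRegisterInfo &MRI, int64_t Imm) {
  Register DstReg = MRI.createVirtualRegister(&X86::GR32RegClass);
  BuildMI(MBB, InsertPt, DL, TII->get(X86::MOV32ri), DstReg).addImm(Imm);
  return DstReg;
}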
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
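A hedged sketch of how the flag values above feed MachineFunction::getMachineMemOperand (whose signature appears earlier in this list): it describes a 4-byte, 4-byte-aligned load from a fixed stack slot. The helper name and the concrete size are illustrative, the frame index FI is assumed to come from one of the MachineFrameInfo creators above, and LLT::scalar plus MachinePointerInfo::getFixedStack are used even though they are not listed here.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

// Build an MMO describing a simple 32-bit load from stack slot FI.
static MachineMemOperand *describeStackLoad(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  // MOLoad marks a read; flags may be or'd together, e.g. with MOVolatile.
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 LLT::scalar(32), Align(4));
}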
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
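The masked load/store accessors above are typically queried before a combine rewrites such a node. A hedged sketch of that kind of predicate; the helper name and the exact bail-out set are illustrative.

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Accept only plain, unindexed masked stores that neither truncate nor
// compress, with one mask element per stored vector element.
static bool isSimpleUnindexedMaskedStore(const MaskedStoreSDNode *MST) {
  if (MST->getAddressingMode() != ISD::UNINDEXED)
    return false;
  if (MST->isTruncatingStore() || MST->isCompressingStore())
    return false;
  return MST->getMask().getValueType().getVectorElementCount() ==
         MST->getValue().getValueType().getVectorElementCount();
}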
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
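Several of the MemSDNode accessors above commonly appear together as a legality pre-check. A hedged sketch, assuming fixed-size memory types (scalable vector types would need TypeSize-aware handling); the helper name is illustrative.

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// True if Mem is non-atomic, non-volatile and at least naturally aligned.
static bool isSimpleNaturallyAlignedAccess(const MemSDNode *Mem) {
  if (!Mem->isSimple())
    return false;
  EVT MemVT = Mem->getMemoryVT();
  // Fixed-size types only: compare the alignment against the store size.
  return Mem->getAlign().value() >= MemVT.getStoreSize().getFixedValue();
}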
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
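The SDNode/SDValue accessors above are what ad-hoc pattern matching in DAG combines is written against. A hedged sketch that recognises (add X, (shl Y, C)) with a single-use, constant-amount shift; the helper name and the pattern itself are purely illustrative.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Match V == (add X, (shl Y, C)) where the shift has one use and a constant
// amount, reporting the pieces through the out-parameters.
static bool matchAddOfShiftedValue(SDValue V, SDValue &X, SDValue &Y,
                                   uint64_t &ShAmt) {
  if (V.getOpcode() != ISD::ADD)
    return false;
  SDValue Shl = V.getOperand(1);
  if (Shl.getOpcode() != ISD::SHL || !Shl.hasOneUse())
    return false;
  if (!isa<ConstantSDNode>(Shl.getOperand(1)))
    return false;
  X = V.getOperand(0);
  Y = Shl.getOperand(0);
  ShAmt = Shl.getConstantOperandVal(1);
  return true;
}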
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
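getStore and getLoad above are often paired when a value has to round-trip through a stack temporary. A hedged sketch of that idiom: the helper name is hypothetical, and it uses CreateStackTemporary's EVT overload, MachinePointerInfo::getFixedStack, and MachineFrameInfo::getObjectAlign, which are not listed in this index but exist alongside the overloads that are.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Spill Val to a fresh stack slot and reload it, returning the loaded value.
static SDValue copyThroughStack(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue Chain, SDValue Val) {
  EVT VT = Val.getValueType();
  SDValue Slot = DAG.CreateStackTemporary(VT);
  int FI = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();

  MachineFunction &MF = DAG.getMachineFunction();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  Align SlotAlign = MF.getFrameInfo().getObjectAlign(FI);

  // Chain the load after the store so the reload observes the spilled value.
  SDValue Store = DAG.getStore(Chain, DL, Val, Slot, PtrInfo, SlotAlign);
  return DAG.getLoad(VT, DL, Store, Slot, PtrInfo, SlotAlign);
}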
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
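commuteMask above is the standard way to renumber a shuffle mask after swapping the shuffle's inputs. A hedged sketch that copies a node's mask and commutes it when every defined element selects from the second operand; the helper name and the trigger condition are illustrative.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Copy SVN's mask into Mask; if all defined elements come from operand 1,
// commute the mask (the caller is then expected to swap the operands too).
static bool commuteMaskIfRHSOnly(const ShuffleVectorSDNode *SVN,
                                 SmallVectorImpl<int> &Mask) {
  Mask.assign(SVN->getMask().begin(), SVN->getMask().end());
  int NumElts = (int)Mask.size();
  bool AllFromRHS =
      llvm::all_of(Mask, [NumElts](int M) { return M < 0 || M >= NumElts; });
  if (AllFromRHS)
    ShuffleVectorSDNode::commuteMask(Mask);
  return AllFromRHS;
}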
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
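The StringSwitch entries above describe the idiom used for register-name and constraint parsing. A hedged, deliberately tiny sketch that maps an already lower-case name to an X86 register: the helper name and case set are illustrative, and the MCTargetDesc header is assumed to provide the X86 register enum as it does for this file.

#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

// Map a lower-case register name onto the X86 register enum, or NoRegister.
static unsigned lookupNamedX86Reg(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("esp", X86::ESP)
      .Case("rsp", X86::RSP)
      .Case("ebp", X86::EBP)
      .Case("rbp", X86::RBP)
      .Default(X86::NoRegister);
}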
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that is recognized by...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
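The setters above (addRegisterClass, setOperationAction, setLoadExtAction, setTruncStoreAction, setTargetDAGCombine, computeRegisterProperties) are intended to be called from a target's TargetLowering constructor. The sketch below is a hedged illustration only: DemoTargetLowering is a made-up subclass so the fragment is self-contained, and the specific actions chosen do not reflect what X86 actually configures.

#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

namespace {
// A stand-in subclass, used only so this constructor-time fragment compiles in
// isolation; real targets do the equivalent in e.g. their own TargetLowering.
class DemoTargetLowering : public TargetLowering {
public:
  DemoTargetLowering(const TargetMachine &TM,
                     const TargetRegisterClass *GR32RC,
                     const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    // Tell type legalization which types live in registers.
    addRegisterClass(MVT::i32, GR32RC);

    // CTPOP on i32 is expanded; sign-extending i1 loads are promoted; i32
    // stores truncating to i16 are broken into a truncate plus a plain store.
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);

    // Request a PerformDAGCombine callback on SETCC nodes, then derive the
    // register/type properties from everything registered above.
    setTargetDAGCombine(ISD::SETCC);
    computeRegisterProperties(TRI);
  }
};
} // end anonymous namespace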
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
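SimplifyDemandedBits and TargetLoweringOpt above are normally driven from PerformDAGCombine-style code. A hedged sketch of that wiring; the helper name and the choice to demand only the low 8 bits are illustrative, and APInt/KnownBits helpers not listed in this index are used.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Try to simplify Op given that only its low 8 bits are demanded, committing
// any rewrites back through the DAG combiner.
static bool simplifyLow8Bits(SDValue Op, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  EVT VT = Op.getValueType();
  if (BitWidth < 8 || VT.isScalableVector())
    return false;

  unsigned NumElts = VT.isFixedLengthVector() ? VT.getVectorNumElements() : 1;
  APInt DemandedBits = APInt::getLowBitsSet(BitWidth, 8);
  APInt DemandedElts = APInt::getAllOnes(NumElts);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
    return false;
  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}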
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
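The X86Subtarget feature queries above typically gate lowering strategy decisions. A hedged sketch; the helper name, the bit-width return encoding, and the exact policy are illustrative rather than how this file decides.

#include "X86Subtarget.h"

using namespace llvm;

// Pick the widest vector width, in bits, that this subtarget makes attractive.
static unsigned pickVectorWidth(const X86Subtarget &Subtarget) {
  if (Subtarget.useAVX512Regs())
    return 512; // AVX-512 present and 512-bit ops considered worthwhile.
  if (Subtarget.hasAVX2())
    return 256;
  if (Subtarget.hasSSE2())
    return 128;
  return 0; // No vector ISA worth using here.
}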
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
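A minimal sketch of how a lowering routine typically pairs this hook with SelectionDAG::getSetCC; the helper name and the TLI/DAG/LHS/RHS context are assumptions for illustration, not code from this file (include paths per recent LLVM trees).

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Hypothetical helper: build an integer equality compare whose result
  // type follows the target's getSetCCResultType() choice.
  static SDValue buildEqualityCompare(const TargetLowering &TLI,
                                      SelectionDAG &DAG, const SDLoc &DL,
                                      SDValue LHS, SDValue RHS) {
    EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                      LHS.getValueType());
    return DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETEQ);
  }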
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
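A small sketch of the two CondCode helpers above; the expected results follow the integer SETCC semantics and the wrapper function is only illustrative.

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Swapping operands of SETLT gives SETGT; inverting it gives SETGE.
  static void condCodeExamples() {
    ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(ISD::SETLT); // SETGT
    ISD::CondCode Inverse = ISD::getSetCCInverse(ISD::SETLT, MVT::i32); // SETGE
    (void)Swapped;
    (void)Inverse;
  }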
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match only the specific value that was given.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
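A minimal sketch of how the llvm::PatternMatch matchers listed above compose; the helper is hypothetical and only shows the capture-and-reuse idiom with m_Value and m_Deferred.

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognize (X & ~X) in either operand order; m_Deferred reuses the
  // value bound by the earlier m_Value(X).
  static bool isAndOfValueAndItsNot(Value *V) {
    Value *X;
    return match(V, m_c_And(m_Value(X), m_Not(m_Deferred(X))));
  }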
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
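A hedged sketch of the RTLIB helpers above; the header that declares them has moved between LLVM releases, so treat the include as an assumption.

  #include "llvm/CodeGen/RuntimeLibcallUtil.h" // older trees: RuntimeLibcalls.h
  #include "llvm/CodeGen/ValueTypes.h"
  #include <cassert>
  using namespace llvm;

  // Pick the soft-float conversion libcall for f80 -> i64.
  static RTLIB::Libcall fp80ToI64Libcall() {
    RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f80, MVT::i64);
    assert(LC != RTLIB::UNKNOWN_LIBCALL && "no libcall for this conversion");
    return LC;
  }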
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constant or splat of an integer constant.
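A sketch of the SDPatternMatch helpers listed above; m_Value() is assumed to come from the same SDPatternMatch.h header even though it is not listed on this page.

  #include "llvm/CodeGen/SDPatternMatch.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Test whether N is an integer-equality SETCC over any two operands.
  static bool isEqualitySetCC(SDNode *N, const SelectionDAG &DAG) {
    using namespace llvm::SDPatternMatch;
    return sd_match(N, &DAG,
                    m_SetCC(m_Value(), m_Value(),
                            m_SpecificCondCode(ISD::SETEQ)));
  }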
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with the output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements an fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and an FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeros out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can be fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
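A short sketch combining the predicate above with APInt::isSignMask(); the helper name is hypothetical and the X86 namespace qualification follows this file's declarations.

  #include "X86ISelLowering.h"
  using namespace llvm;

  // Detect a vector operand that splats the sign-bit-only constant.
  static bool isSignMaskSplat(SDValue Op) {
    APInt SplatVal;
    return X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/true) &&
           SplatVal.isSignMask();
  }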
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
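A minimal sketch combining peekThroughBitcasts with ISD::isBuildVectorAllZeros from this page; the wrapper function is illustrative only.

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Treat V as a zero vector even when it is hidden behind bitcasts.
  static bool isZeroVectorThroughBitcasts(SDValue V) {
    SDValue Src = peekThroughBitcasts(V);
    return ISD::isBuildVectorAllZeros(Src.getNode());
  }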
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
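A sketch of decoding a BLENDPS immediate with the helper above; the expected output assumes the convention documented in the X86ShuffleDecode utilities (a set bit i selects element i of the second source, encoded as i + NumElts), and the include path within the X86 target tree is an assumption.

  #include "MCTargetDesc/X86ShuffleDecode.h" // assumed location in the X86 target
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  // Decode Imm = 0b1010 for a 4 x float blend; under the convention above
  // the expected mask is {0, 5, 2, 7}.
  static void decodeBlendExample(SmallVectorImpl<int> &Mask) {
    DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b1010, Mask);
  }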
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
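A small sketch of the kind of check the MathExtras power-of-two helpers above support, for example when deciding whether a multiply by a constant can become shifts plus an add or sub.

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  using namespace llvm;

  // x * C is a single shift when C is a power of two, or a shift plus one
  // add/sub when C-1 or C+1 is: (x << Log2_64(C-1)) + x == x * C, and
  // (x << Log2_64(C+1)) - x == x * C.
  static bool isCheapMulConstant(uint64_t C) {
    return isPowerOf2_64(C) || isPowerOf2_64(C - 1) || isPowerOf2_64(C + 1);
  }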
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
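As a quick illustration of the unpack decoders listed here, the following sketch (not from this file; the helper function name and the MCTargetDesc/X86ShuffleDecode.h include path are assumptions) shows the mask produced for a 4 x 32-bit unpcklps:
#include "MCTargetDesc/X86ShuffleDecode.h" // assumed location of the decoder
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleDecodeUnpckl() {
  SmallVector<int, 8> Mask;
  DecodeUNPCKLMask(/*NumElts=*/4, /*ScalarBits=*/32, Mask);
  // Mask is now {0, 4, 1, 5}: the low elements of the two sources are
  // interleaved, matching the unpcklps pattern described above.
}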
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction: a dereference of an address held in a register, with no scale, index, or displacement (for example, DWORD PTR [EAX]).
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
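A minimal sketch of commonAlignment, assuming only llvm/Support/Alignment.h (the wrapper function is hypothetical):
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleCommonAlignment() {
  Align A(16);
  Align B = commonAlignment(A, /*Offset=*/8);
  // B == Align(8): a 16-byte-aligned pointer advanced by 8 bytes can only be
  // assumed to be 8-byte aligned.
  (void)B;
}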
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
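The range-based STLExtras wrappers referenced throughout this index (count, find_if, is_contained, all_equal) behave as in this small sketch; the values and the helper function are made up for illustration:
#include "llvm/ADT/STLExtras.h"
#include <vector>

// Hypothetical helper for illustration only.
static void exampleRangeHelpers() {
  std::vector<int> V = {1, 2, 2, 3};
  auto Twos = llvm::count(V, 2);                            // 2
  bool HasThree = llvm::is_contained(V, 3);                 // true
  auto It = llvm::find_if(V, [](int X) { return X > 2; });  // points at 3
  bool Same = llvm::all_equal({2, 2, 2});                   // true
  (void)Twos; (void)HasThree; (void)It; (void)Same;
}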
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
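A short sketch of the bit.h helpers above (popcount, bit_floor, bit_cast); the values are arbitrary and the helper function is hypothetical:
#include "llvm/ADT/bit.h"
#include <cstdint>

// Hypothetical helper for illustration only.
static void exampleBitHelpers() {
  int Ones = llvm::popcount(0xF0u);                         // 4 set bits
  uint32_t Floor = llvm::bit_floor(20u);                    // 16
  float One = llvm::bit_cast<float>(uint32_t{0x3F800000});  // 1.0f
  (void)Ones; (void)Floor; (void)One;
}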
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
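A sketch of scaleShuffleMaskElts widening a mask; the helper function is hypothetical, and llvm/Analysis/VectorUtils.h is assumed to be where the declaration lives:
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h" // assumed home of the declaration
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleScaleMask() {
  SmallVector<int, 4> Wide;
  bool OK = scaleShuffleMaskElts(/*NumDstElts=*/2, {0, 1, 4, 5}, Wide);
  // OK == true and Wide == {0, 2}: each widened element covers a contiguous,
  // aligned pair of source elements (0-1 and 4-5).
  (void)OK;
}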
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
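The APFloat pieces above (fltSemantics accessors, rounding modes, opStatus) compose as in this sketch; the constant 1.5 and the helper function are illustrative only:
#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleFPConvert() {
  APFloat Val(APFloat::IEEEdouble(), "1.5");
  bool LosesInfo = false;
  APFloat::opStatus Status =
      Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  // Status == APFloat::opOK and LosesInfo == false: 1.5 is exactly
  // representable in IEEE half precision.
  (void)Status;
}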
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
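A minimal sketch of the EVT queries listed above, using an assumed LLVMContext parameter rather than one taken from a SelectionDAG (the helper function is hypothetical):
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleEVT(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4);        // v4f32
  bool Is128 = VT.is128BitVector();                   // true: 4 x 32 bits
  EVT IntVT = VT.changeVectorElementTypeToInteger();  // v4i32
  EVT Half = VT.getHalfNumVectorElementsVT(Ctx);      // v2f32
  uint64_t ScalarBits = VT.getScalarSizeInBits();     // 32
  (void)Is128; (void)IntVT; (void)Half; (void)ScalarBits;
}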
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
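The KnownBits queries above compose as in this sketch; the 8-bit constant and the helper function are illustrative assumptions:
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical helper for illustration only.
static void exampleKnownBits() {
  KnownBits K = KnownBits::makeConstant(APInt(8, 0x0F)); // all 8 bits known
  KnownBits Wide = K.zext(16);                   // still a known constant
  bool NonNeg = Wide.isNonNegative();            // true: sign bit known zero
  unsigned MinTZ = Wide.countMinTrailingZeros(); // 0: bit 0 is known one
  APInt Max = Wide.getMaxValue();                // 0x000F
  (void)NonNeg; (void)MinTZ; (void)Max;
}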
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exclusive to a single function.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.